Fixed bug in ISBI txt file generation (#20)

* Move `clean_up_annotations` to utils.py to facilitate re-use.
* Remove node_key from txt_to_graph
* Compare neighbors of G_gt to G_res in `classify_divisions`.
* Update `txt_to_graph` to ensure parent_frame combo exists, otherwise, don't add edge.
* Use `nx.DiGraph` to better classify error types in `txt_to_graph`.
* Fix `contig_tracks` to update both daughter AND parent info for new tracks.
* Add `trk_to_isbi` to convert trk files to txt files.
* Change assignments from list to tuple to remove numpy warning (Fixes #16)
* Add single nodes (with no edges) to graph even though they aren't used.
* Add tests for all `isbi_utils` functions
vanvalenlab · Nov 26, 2019 · a79f489 · a79f489
1 parent 2a78147
commit a79f489
Show file tree

Hide file tree

Showing 5 changed files with 390 additions and 138 deletions.
diff --git a/deepcell_tracking/isbi_utils.py b/deepcell_tracking/isbi_utils.py
@@ -36,147 +36,147 @@
 import pandas as pd
 
 
+def trk_to_isbi(track, path):
+    """Convert a lineage track into an ISBI formatted text file.
+
+    Args:
+        track (dict): Cell lineage object.
+        path (str): Path to save the .txt file.
+    """
+    with open(path, 'w') as text_file:
+        for label in track:
+            first_frame = min(track[label]['frames'])
+            last_frame = max(track[label]['frames'])
+            parent = track[label]['parent']
+            parent = 0 if parent is None else parent
+            if parent:
+                parent_frames = track[parent]['frames']
+                if parent_frames[-1] != first_frame - 1:
+                    parent = 0
+
+            line = '{cell_id} {start} {end} {parent}\n'.format(
+                cell_id=label,
+                start=first_frame,
+                end=last_frame,
+                parent=parent
+            )
+
+            text_file.write(line)
+
+
 def contig_tracks(label, batch_info, batch_tracked):
-    """Check for contiguous tracks (tracks should only consist of consecutive tracks).
+    """Check for contiguous tracks (tracks should only consist of consecutive frames).
 
     Split one track into two if neccesary
 
     Args:
         label (int): label of the cell.
-        batch_info (dict): batch info.
-        batch_tracked (dict): batch tracked data.
+        batch_info (dict): a track's lineage info
+        batch_tracked (dict): the new image data associated with the lineage.
 
     Returns:
         tuple(dict, dict): updated batch_info and batch_tracked.
     """
-
     frame_div_missing = False
 
-    original_label = label
-    frames = batch_info[original_label]['frames']
-    final_frame_idx = len(frames) - 1
+    frames = batch_info[label]['frames']
 
-    for frame_idx, frame in enumerate(frames):
+    for i, frame in enumerate(frames):
         # If the next frame is available and contiguous we should move on to
         # the next frame. Otherwise, if the next frame is available and
         # NONcontiguous we should separate this track into two.
-        if frame_idx + 1 <= final_frame_idx and frame + 1 != frames[frame_idx + 1]:
-            contig_end_idx = frame_idx
-
-            next_trk_frames = frames[frame_idx + 1:]
-            daughters = batch_info[original_label]['daughters']
-
-            if 'frame_div' in batch_info[original_label]:
-                frame_div = batch_info[original_label]['frame_div']
-            else:
-                frame_div = None
-                frame_div_missing = True
+        if i + 1 <= len(frames) - 1 and frame + 1 != frames[i + 1]:
+            frame_div = batch_info[label].get('frame_div')
+            if frame_div is None:
+                frame_div_missing = True  # TODO: is this necessary?
 
             # Create a new track to hold the information from this
             # frame forward and add it to the batch.
-            batch_info, batch_tracked = create_new_ISBI_track(
-                batch_tracked, batch_info, original_label,
-                next_trk_frames, daughters, frame_div)
+            new_label = max(batch_info) + 1
+            batch_info[new_label] = {
+                'old_label': label,
+                'label': new_label,
+                'frames': frames[i + 1:],
+                'daughters': batch_info[label]['daughters'],
+                'frame_div': frame_div,
+                'parent': None
+            }
+
+            for d in batch_info[new_label]['daughters']:
+                batch_info[d]['parent'] = new_label
+
+            for f in frames[i + 1:]:
+                batch_tracked[f][batch_tracked[f] == label] = new_label
 
             # Adjust the info of the current track to vacate the new track info
-            batch_info[original_label]['frames'] = frames[0:contig_end_idx + 1]
-            batch_info[original_label]['daughters'] = []
-            batch_info[original_label]['frame_div'] = None
+            batch_info[label]['frames'] = frames[0:i + 1]
+            batch_info[label]['daughters'] = []
+            batch_info[label]['frame_div'] = None
 
-            # Because we are splitting tracks recursively, we stop here
-            break
+            break  # Because we are splitting tracks recursively, we stop here
 
         # If the current frame is the last frame then were done
         # Either the last frame is contiguous and we don't alter batch_info
         # or it's not and it's been made into a new track by the previous
         # iteration of the loop
 
-    if frame_div_missing:
-        print('Warning: frame_div is missing')
-
-    return batch_info, batch_tracked
-
-
-def create_new_ISBI_track(batch_tracked, batch_info, old_label,
-                          frames, daughters, frame_div):
-    """Adds a new track to the lineage and swaps the labels accordingly.
-
-    Args:
-        batch_tracked (dict): tracked data.
-        batch_info (dict): tracked info data.
-        old_label (int): integer label of the tracked cell.
-        frames (list): List of frame numbers in which the cell is present.
-        daughters (list): List of daughter cell IDs.
-        frame_div (int): Frame number in which the cell divides.
-
-    Returns:
-        tuple(dict, dict): updated batch_info and batch_tracked.
-    """
-    new_label = max(batch_info) + 1
-
-    new_track_data = {
-        'old_label': old_label,
-        'label': new_label,
-        'frames': frames,
-        'daughters': daughters,
-        'frame_div': frame_div,
-        'parent': None
-    }
-
-    batch_info[new_label] = new_track_data
-
-    for frame in frames:
-        batch_tracked[frame][batch_tracked[frame] == old_label] = new_label
+        if frame_div_missing:
+            print('Warning: frame_div is missing')
 
     return batch_info, batch_tracked
 
 
-def txt_to_graph(path, node_key=None):
+def txt_to_graph(path):
     """Read the ISBI text file and create a Graph.
 
     Args:
         path (str): Path to the ISBI text file.
-        node_key (str, optional): Key to identify the parent/daughter links.
-            (defaults to Cell_ID+Parent_ID)
 
     Returns:
         networkx.Graph: Graph representation of the text file.
+
+    Raises:
+        ValueError: If the Parent_ID is not in any previous frames.
     """
     names = ['Cell_ID', 'Start', 'End', 'Parent_ID']
     df = pd.read_csv(path, header=None, sep=' ', names=names)
 
-    if node_key is not None:
-        df[['Cell_ID', 'Parent_ID']] = df[['Cell_ID', 'Parent_ID']].replace(
-            node_key)
-
     edges = pd.DataFrame()
 
-    # Add each cell lineage as a set of edges to df
+    all_ids = set()
+    single_nodes = set()
+
+    # Add each continuous cell lineage as a set of edges to df
     for _, row in df.iterrows():
         tpoints = np.arange(row['Start'], row['End'] + 1)
 
-        cellids = ['{cellid}_{frame}'.format(cellid=row['Cell_ID'], frame=t)
-                   for t in tpoints]
+        cellids = ['{}_{}'.format(row['Cell_ID'], t) for t in tpoints]
+
+        if len(cellids) == 1:
+            single_nodes.add(cellids[0])
 
-        source = cellids[0:-1]
-        target = cellids[1:]
+        all_ids.update(cellids)
 
         edges = edges.append(pd.DataFrame({
-            'source': source,
-            'target': target
+            'source': cellids[0:-1],
+            'target': cellids[1:],
         }))
 
     attributes = {}
 
     # Add parent-daughter connections
     for _, row in df[df['Parent_ID'] != 0].iterrows():
-        source = '{cellid}_{frame}'.format(
-            cellid=row['Parent_ID'],
-            frame=row['Start'] - 1)
+        # Assume the parent is in the previous frame.
+        parent_frame = row['Start'] - 1
+        source = '{}_{}'.format(row['Parent_ID'], parent_frame)
 
-        target = '{cellid}_{frame}'.format(
-            cellid=row['Cell_ID'],
-            frame=row['Start'])
+        if source not in all_ids:  # parents should be in the previous frame.
+            # parent_frame = df[df['Cell_ID'] == row['Parent_id']]['End']
+            # source = '{}_{}'.format(row['Parent_ID'], parent_frame)
+            print('%s: skipped parent %s to daughter %s' % (path, source, row['Cell_ID']))
+            continue
+
+        target = '{}_{}'.format(row['Cell_ID'], row['Start'])
 
         edges = edges.append(pd.DataFrame({
             'source': [source],
@@ -186,8 +186,13 @@ def txt_to_graph(path, node_key=None):
         attributes[source] = {'division': True}
 
     # Create graph
-    G = nx.from_pandas_edgelist(edges, source='source', target='target')
+    G = nx.from_pandas_edgelist(edges, source='source', target='target',
+                                create_using=nx.DiGraph)
     nx.set_node_attributes(G, attributes)
+
+    # Add all isolates to graph
+    for cell_id in single_nodes:
+        G.add_node(cell_id)
     return G
 
 
@@ -207,41 +212,49 @@ def classify_divisions(G_gt, G_res):
     div_res = [node for node, d in G_res.nodes(data=True)
                if d.get('division', False)]
 
-    divI = 0   # Correct division
-    divJ = 0   # Wrong division
-    divC = 0   # False positive division
-    divGH = 0  # Missed division
+    correct = 0         # Correct division
+    incorrect = 0       # Wrong division
+    false_positive = 0  # False positive division
+    missed = 0          # Missed division
 
     for node in div_gt:
-        nb_gt = list(G_gt.neighbors(node))
+
+        pred_gt = list(G_gt.pred[node])
+        succ_gt = list(G_gt.succ[node])
 
         # Check if res node was also called a division
         if node in div_res:
-            nb_res = list(G_gt.neighbors(node))
-            # If neighbors are same, then correct division
-            if Counter(nb_gt) == Counter(nb_res):
-                divI += 1
-            # Wrong division
-            elif len(nb_res) == 3:
-                divJ += 1
-            else:
-                divGH += 1
-        # If not called division, then missed division
-        else:
-            divGH += 1
-
-        # Remove processed nodes from res list
-        try:
+            pred_res = list(G_gt.pred[node])
+            succ_res = list(G_res.succ[node])
+
+            # Parents and daughters are the same, perfect!
+            if (Counter(pred_gt) == Counter(pred_res) and
+                    Counter(succ_gt) == Counter(succ_res)):
+                correct += 1
+
+            else:  # what went wrong?
+                incorrect += 1
+                errors = ['out degree = {}'.format(G_res.out_degree(node))]
+                if Counter(succ_gt) != Counter(succ_res):
+                    errors.append('daughters mismatch')
+                if Counter(pred_gt) != Counter(pred_res):
+                    errors.append('parents mismatch')
+                if G_res.out_degree(node) == G_gt.out_degree(node):
+                    errors.append('gt and res degree equal')
+                print(node, '{}.'.format(', '.join(errors)))
+
             div_res.remove(node)
-        except:
-            print('attempted removal of node {} failed'.format(node))
+
+        else:  # valid division not in results, it was missed
+            print('missed node {} division completely'.format(node))
+            missed += 1
 
     # Count any remaining res nodes as false positives
-    divC += len(div_res)
+    false_positive += len(div_res)
 
     return {
-        'Correct division': divI,
-        'Incorrect division': divJ,
-        'False positive division': divC,
-        'False negative division': divGH
+        'Correct division': correct,
+        'Incorrect division': incorrect,
+        'False positive division': false_positive,
+        'False negative division': missed
     }