From b3f354e9a5162bd8cb6a598042c0bcdf622542f6 Mon Sep 17 00:00:00 2001
From: Jan Valosek <39456460+valosekj@users.noreply.github.com>
Date: Wed, 11 Dec 2024 05:41:29 -0500
Subject: [PATCH] Match GTs and predictions based on BIDS compatible keys
 (#17)

* Match the prediction and reference files based on the participant_id, acq_id, and run_id.

* remove python-app.yml from the original MetricsReloaded repo

* `get_images_in_folder` --> `get_images`

* handle no predictions/GTs

* add unittests to test the newly proposed matching based on participant_id, acq_id, and run_id

* add clarifying comment

* add session-based pairing between GT-pred

* fetch chunk id also

* fix import after changing function name

* update tests with ses_id and chunk_id

---------

Co-authored-by: Naga Karthik
---
 .github/workflows/python-app.yml         |  87 ------------
 compute_metrics_reloaded.py              |  77 +++++++++--
 .../test_pairwise_measures_neuropoly.py  | 127 +++++++++++++++++-
 3 files changed, 191 insertions(+), 100 deletions(-)
 delete mode 100644 .github/workflows/python-app.yml
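Before the file-by-file diff, a quick illustration of the pairing idea this patch introduces (not part of the patch itself; the filenames and the bids_key helper below are hypothetical): predictions and ground truths are paired by their BIDS entities rather than by sort order, so differing suffixes no longer break the matching.

import re

def bids_key(path):
    # Same extraction idea as the patch's fetch_bids_compatible_keys():
    # grab each entity up to the next underscore or slash; missing entities -> ""
    return tuple(
        (m.group(0)[:-1] if (m := re.search(f'{ent}(.*?)[_/]', path)) else "")
        for ent in ('sub-', 'ses-', 'acq-', 'chunk-', 'run-')
    )

# Both files reduce to the same key ('sub-001', 'ses-01', 'acq-sag', 'chunk-1', 'run-01'),
# so they are paired even though their suffixes (and thus their sort order) differ:
print(bids_key("preds/sub-001_ses-01_acq-sag_chunk-1_run-01_pred.nii.gz"))
print(bids_key("gts/sub-001_ses-01_acq-sag_chunk-1_run-01_lesion-manual.nii.gz"))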
diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
deleted file mode 100644
index 27cf48d..0000000
--- a/.github/workflows/python-app.yml
+++ /dev/null
@@ -1,87 +0,0 @@
-# This workflow will install Python dependencies, run tests and lint with a single version of Python
-# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
-
-# This file defines what the workflow consists of, that is, what operations we want to perform and when. The first
-# part names the action, the second states when the action is triggered (on push or on pull request) and on what
-# branches (main and dev in our case).
-
-name: Unit Tests
-
-on:
-  push:
-    branches: [ main ]  # run when anything is pushed to these branches
-  pull_request:
-    branches: [ main ]  # run for the code submitted as a PR to these branches
-
-# jobs are a series of steps which run commands in the chosen virtualized environment to perform some action
-jobs:
-  build:
-
-    runs-on: ubuntu-latest  # run in an Ubuntu VM, so assuming a Unix-like environment for our commands
-
-    steps:
-      # The first step checks out the code into the VM
-      - uses: actions/checkout@v2
-
-      # Set up Python using an existing action "actions/setup-python@v2" from GitHub's library of actions
-      # Arguments are provided to this action using the key-values under "with"
-      - name: Set up Python 3.9
-        uses: actions/setup-python@v2
-        with:
-          python-version: 3.9
-
-      # Install the requirements for this library plus those for running our tests (flake8 and coverage)
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install -r requirements.txt
-          pip install -r requirements-dev.txt
-
-      # Run flake8 to do basic code quality checks; the output will appear in the action log
-      - name: Lint with flake8
-        run: |
-          # stop the build if there are Python syntax errors or undefined names
-          flake8 . --count --exit-zero --select=E9,F63,F7,F82 --show-source --statistics
-          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
-          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics \
-            --per-file-ignores="__init__.py:F401"
-
-      # Run the unit tests using the coverage program and create the XML output file
-      - name: Test with pytest
-        run: |
-          pytest --cov --cov-config=.coveragerc --cov-report=xml -vv
-
-#      # Using Codecov's action, upload the coverage report for the triggering commit/PR
-#      - name: Upload coverage
-#        uses: codecov/codecov-action@v2
-#        with:
-#          file: ./coverage.xml
-#          fail_ci_if_error: true
-#          verbose: true
-#          version: "v0.1.15"
-
-  build_docs:
-    runs-on: ubuntu-latest
-
-    steps:
-      - uses: actions/checkout@v2
-
-      - name: Set up Python 3.9
-        uses: actions/setup-python@v2
-        with:
-          python-version: 3.9
-
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install -r requirements-docs.txt
-
-      - name: Install Metrics Reloaded
-        run: |
-          python -m pip install .
-
-      - name: Build docs
-        run: |
-          cd docs && make html
diff --git a/compute_metrics_reloaded.py b/compute_metrics_reloaded.py
index 05494ae..ebebc1a 100644
--- a/compute_metrics_reloaded.py
+++ b/compute_metrics_reloaded.py
@@ -13,6 +13,7 @@
 python compute_metrics_reloaded.py -reference /path/to/reference -prediction /path/to/prediction
 
+NOTE: The prediction and reference files are matched based on the participant_id, session_id, acq_id, chunk_id, and run_id.
 
 The metrics to be computed can be specified using the `-metrics` argument. For example, to compute only the Dice
 similarity coefficient (DSC) and Normalized surface distance (NSD), use:
@@ -37,6 +38,7 @@
 
 import os
+import re
 import argparse
 import numpy as np
 import nibabel as nib
@@ -103,9 +105,46 @@ def load_nifti_image(file_path):
     return nifti_image.get_fdata()
 
 
-def get_images_in_folder(prediction, reference):
+def fetch_bids_compatible_keys(filename_path, prefix='sub-'):
     """
-    Get all files (predictions and references/ground truths) in the input directories
+    Get participant_id, session_id, acq_id, chunk_id and run_id from the input BIDS-compatible filename or file path.
+    The function works on absolute file paths as well as on bare filenames.
+    :param filename_path: input nifti filename (e.g., sub-001_ses-01_T1w.nii.gz) or file path
+    (e.g., /home/user/bids/sub-001/ses-01/anat/sub-001_ses-01_T1w.nii.gz)
+    :param prefix: prefix of the participant ID in the filename (default: 'sub-')
+    :return: participant_id: participant ID (e.g., sub-001)
+    :return: session_id: session ID (e.g., ses-01)
+    :return: acq_id: acquisition ID (e.g., acq-01)
+    :return: chunk_id: chunk ID (e.g., chunk-1)
+    :return: run_id: run ID (e.g., run-01)
+    """
+
+    participant = re.search(f'{prefix}(.*?)[_/]', filename_path)        # [_/] means either underscore or slash
+    participant_id = participant.group(0)[:-1] if participant else ""   # [:-1] removes the last underscore or slash
+
+    session = re.search('ses-(.*?)[_/]', filename_path)                 # [_/] means either underscore or slash
+    session_id = session.group(0)[:-1] if session else ""               # [:-1] removes the last underscore or slash
+
+    acquisition = re.search('acq-(.*?)[_/]', filename_path)             # [_/] means either underscore or slash
+    acq_id = acquisition.group(0)[:-1] if acquisition else ""           # [:-1] removes the last underscore or slash
+
+    chunk = re.search('chunk-(.*?)[_/]', filename_path)                 # [_/] means either underscore or slash
+    chunk_id = chunk.group(0)[:-1] if chunk else ""                     # [:-1] removes the last underscore or slash
+
+    run = re.search('run-(.*?)[_/]', filename_path)                     # [_/] means either underscore or slash
+    run_id = run.group(0)[:-1] if run else ""                           # [:-1] removes the last underscore or slash
+
+    # REGEX explanation
+    # . - match any character (except newline)
+    # *? - match the previous element as few times as possible (zero or more times)
+
+    return participant_id, session_id, acq_id, chunk_id, run_id
+
+
+def get_images(prediction, reference):
+    """
+    Get all files (predictions and references/ground truths) in the input directories.
+    The prediction and reference files are matched based on the participant_id, session_id, acq_id, chunk_id, and run_id.
     :param prediction: path to the directory with prediction files
     :param reference: path to the directory with reference (ground truth) files
     :return: list of prediction files, list of reference/ground truth files
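As a sanity check on the new helper, here is how it behaves on a couple of hypothetical inputs, a bare filename and a full BIDS path (output shown assumes the patched compute_metrics_reloaded.py is on the import path). Missing entities come back as empty strings, which is what makes optional entities such as ses- and chunk- safe to match on:

from compute_metrics_reloaded import fetch_bids_compatible_keys

print(fetch_bids_compatible_keys("sub-001_ses-01_acq-sag_chunk-1_run-01_T2w.nii.gz"))
# ('sub-001', 'ses-01', 'acq-sag', 'chunk-1', 'run-01')
print(fetch_bids_compatible_keys("/data/bids/sub-002/anat/sub-002_T2w.nii.gz"))
# ('sub-002', '', '', '', '')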
@@ -113,15 +152,29 @@ def get_images_in_folder(prediction, reference):
     # Get all files in the directories
     prediction_files = [os.path.join(prediction, f) for f in os.listdir(prediction) if f.endswith('.nii.gz')]
     reference_files = [os.path.join(reference, f) for f in os.listdir(reference) if f.endswith('.nii.gz')]
-    # Check if the number of files in the directories is the same
-    if len(prediction_files) != len(reference_files):
-        raise ValueError(f'The number of files in the directories is different. '
-                         f'Prediction files: {len(prediction_files)}, Reference files: {len(reference_files)}')
-    print(f'Found {len(prediction_files)} files in the directories.')
-    # Sort the files
-    # NOTE: Hopefully, the files are named in the same order in both directories
-    prediction_files.sort()
-    reference_files.sort()
+
+    if not prediction_files:
+        raise FileNotFoundError(f'No prediction files found in {prediction}.')
+    if not reference_files:
+        raise FileNotFoundError(f'No reference (ground truth) files found in {reference}.')
+
+    # Create dataframe for prediction_files with participant_id, session_id, acq_id, chunk_id, run_id
+    df_pred = pd.DataFrame(prediction_files, columns=['filename'])
+    df_pred['participant_id'], df_pred['session_id'], df_pred['acq_id'], df_pred['chunk_id'], df_pred['run_id'] = zip(*df_pred['filename'].apply(fetch_bids_compatible_keys))
+
+    # Create dataframe for reference_files with participant_id, session_id, acq_id, chunk_id, run_id
+    df_ref = pd.DataFrame(reference_files, columns=['filename'])
+    df_ref['participant_id'], df_ref['session_id'], df_ref['acq_id'], df_ref['chunk_id'], df_ref['run_id'] = zip(*df_ref['filename'].apply(fetch_bids_compatible_keys))
+
+    # Merge the two dataframes on participant_id, session_id, acq_id, chunk_id, run_id
+    df = pd.merge(df_pred, df_ref, on=['participant_id', 'session_id', 'acq_id', 'chunk_id', 'run_id'], how='outer', suffixes=('_pred', '_ref'))
+    # Drop the matching keys, keeping only the filename columns
+    df.drop(['participant_id', 'session_id', 'acq_id', 'chunk_id', 'run_id'], axis=1, inplace=True)
+    # Drop rows with NaN values. In other words, keep only the rows where both prediction and reference files exist
+    df.dropna(inplace=True)
+
+    prediction_files = df['filename_pred'].tolist()
+    reference_files = df['filename_ref'].tolist()
 
     return prediction_files, reference_files
 
@@ -236,7 +289,7 @@ def main():
     # Args.prediction and args.reference are paths to folders with multiple nii.gz files (i.e., MULTIPLE subjects)
     if os.path.isdir(args.prediction) and os.path.isdir(args.reference):
         # Get all files in the directories
-        prediction_files, reference_files = get_images_in_folder(args.prediction, args.reference)
+        prediction_files, reference_files = get_images(args.prediction, args.reference)
 
         # Use multiprocessing to parallelize the computation
         with Pool(args.jobs) as pool:
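The outer merge plus dropna() above is the heart of the pairing. A standalone sketch of the same pattern on toy data (hypothetical filenames, a single key column for brevity) shows why unmatched files are silently dropped rather than raising an error:

import pandas as pd

df_pred = pd.DataFrame({'filename': ['sub-01_pred.nii.gz', 'sub-02_pred.nii.gz'],
                        'participant_id': ['sub-01', 'sub-02']})
df_ref = pd.DataFrame({'filename': ['sub-01_ref.nii.gz'],
                       'participant_id': ['sub-01']})

# The outer merge keeps unmatched rows, with NaN on the side that has no file...
df = pd.merge(df_pred, df_ref, on='participant_id', how='outer', suffixes=('_pred', '_ref'))
# ...and dropna() then removes them, so sub-02 (a prediction without a GT) is excluded.
df.dropna(inplace=True)
print(df[['filename_pred', 'filename_ref']])
#         filename_pred       filename_ref
# 0  sub-01_pred.nii.gz  sub-01_ref.nii.gz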
diff --git a/test/test_metrics/test_pairwise_measures_neuropoly.py b/test/test_metrics/test_pairwise_measures_neuropoly.py
index 60da2be..c82462b 100644
--- a/test/test_metrics/test_pairwise_measures_neuropoly.py
+++ b/test/test_metrics/test_pairwise_measures_neuropoly.py
@@ -13,7 +13,7 @@
 import os
 import numpy as np
 import nibabel as nib
-from compute_metrics_reloaded import compute_metrics_single_subject
+from compute_metrics_reloaded import compute_metrics_single_subject, get_images, fetch_bids_compatible_keys
 import tempfile
 
 METRICS = ['dsc', 'fbeta', 'nsd', 'vol_diff', 'rel_vol_error', 'lesion_ppv', 'lesion_sensitivity', 'lesion_f1_score',
@@ -358,6 +358,131 @@ def test_non_empty_ref_and_pred_with_full_overlap(self):
         # Assert metrics
         self.assert_metrics(metrics_dict, expected_metrics)
 
+
+class TestGetImages(unittest.TestCase):
+    def setUp(self):
+        """
+        Create temporary directories and files for testing.
+        """
+        self.pred_dir = tempfile.TemporaryDirectory()
+        self.ref_dir = tempfile.TemporaryDirectory()
+
+    def tearDown(self):
+        """
+        Clean up temporary directories and files after tests.
+        """
+        self.pred_dir.cleanup()
+        self.ref_dir.cleanup()
+
+    def create_temp_file(self, directory, filename):
+        """
+        Create a temporary file in the given directory with the specified filename.
+        """
+        file_path = os.path.join(directory, filename)
+        with open(file_path, 'w') as f:
+            f.write('dummy content')
+        return file_path
+
+    def test_matching_files(self):
+        """
+        Test matching files based on participant_id, session_id, acq_id, chunk_id, and run_id.
+        """
+        self.create_temp_file(self.pred_dir.name, "sub-01_ses-01_acq-01_chunk-1_run-01_pred.nii.gz")
+        self.create_temp_file(self.ref_dir.name, "sub-01_ses-01_acq-01_chunk-1_run-01_ref.nii.gz")
+
+        pred_files, ref_files = get_images(self.pred_dir.name, self.ref_dir.name)
+        self.assertEqual(len(pred_files), 1)
+        self.assertEqual(len(ref_files), 1)
+
+    def test_mismatched_files(self):
+        """
+        Test when no files match based on the matching keys.
+        """
+        self.create_temp_file(self.pred_dir.name, "sub-01_ses-01_acq-01_chunk-1_run-01_pred.nii.gz")
+        self.create_temp_file(self.ref_dir.name, "sub-02_ses-01_acq-02_chunk-1_run-02_ref.nii.gz")
+
+        pred_files, ref_files = get_images(self.pred_dir.name, self.ref_dir.name)
+        self.assertEqual(len(pred_files), 0)
+        self.assertEqual(len(ref_files), 0)
+
+ """ + self.create_temp_file(self.pred_dir.name, "sub-01_acq-01_chunk-1_run-01_pred.nii.gz") + self.create_temp_file(self.ref_dir.name, "sub-01_acq-01_chunk-1_run-01_ref.nii.gz") + + pred_files, ref_files = get_images(self.pred_dir.name, self.ref_dir.name) + self.assertEqual(len(pred_files), 1) + self.assertEqual(len(ref_files), 1) + self.assertIn("sub-01_acq-01_chunk-1_run-01_pred.nii.gz", pred_files[0]) + self.assertIn("sub-01_acq-01_chunk-1_run-01_ref.nii.gz", ref_files[0]) + + def test_acq_id_empty(self): + """ + Test when acq_id is empty. + """ + self.create_temp_file(self.pred_dir.name, "sub-01_ses-01_chunk-1_run-01_pred.nii.gz") + self.create_temp_file(self.ref_dir.name, "sub-01_ses-01_chunk-1_run-01_ref.nii.gz") + + pred_files, ref_files = get_images(self.pred_dir.name, self.ref_dir.name) + self.assertEqual(len(pred_files), 1) + self.assertEqual(len(ref_files), 1) + self.assertIn("sub-01_ses-01_chunk-1_run-01_pred.nii.gz", pred_files[0]) + self.assertIn("sub-01_ses-01_chunk-1_run-01_ref.nii.gz", ref_files[0]) + + def test_chunk_id_empty(self): + """ + Test when chunk_id is empty in the filenames. + """ + self.create_temp_file(self.pred_dir.name, "sub-01_ses-01_acq-01_run-01_pred.nii.gz") + self.create_temp_file(self.ref_dir.name, "sub-01_ses-01_acq-01_run-01_ref.nii.gz") + + pred_files, ref_files = get_images(self.pred_dir.name, self.ref_dir.name) + + # Assert the matched files + self.assertEqual(len(pred_files), 1) + self.assertEqual(len(ref_files), 1) + self.assertIn("sub-01_ses-01_acq-01_run-01_pred.nii.gz", pred_files[0]) + self.assertIn("sub-01_ses-01_acq-01_run-01_ref.nii.gz", ref_files[0]) + + def test_run_id_empty(self): + """ + Test when run_id is empty in the filenames. + """ + self.create_temp_file(self.pred_dir.name, "sub-01_ses-01_acq-01_chunk-1_pred.nii.gz") + self.create_temp_file(self.ref_dir.name, "sub-01_ses-01_acq-01_chunk-1_ref.nii.gz") + + pred_files, ref_files = get_images(self.pred_dir.name, self.ref_dir.name) + + # Assert the matched files + self.assertEqual(len(pred_files), 1) + self.assertEqual(len(ref_files), 1) + self.assertIn("sub-01_ses-01_acq-01_chunk-1_pred.nii.gz", pred_files[0]) + self.assertIn("sub-01_ses-01_acq-01_chunk-1_ref.nii.gz", ref_files[0]) + + def test_no_files(self): + """ + Test when there are no files in the directories. + Ensure that FileNotFoundError is raised. + """ + with self.assertRaises(FileNotFoundError) as context: + get_images(self.pred_dir.name, self.ref_dir.name) + # Check the exception message + self.assertIn(f'No prediction files found in {self.pred_dir.name}', str(context.exception)) + + def test_partial_matching(self): + """ + Test when some files match and some do not. + """ + self.create_temp_file(self.pred_dir.name, "sub-01_acq-01_run-01_pred.nii.gz") + self.create_temp_file(self.ref_dir.name, "sub-01_acq-01_run-01_ref.nii.gz") + # The following file will not be included in the lists below as there is no matching reference (GT) file + self.create_temp_file(self.pred_dir.name, "sub-02_acq-02_run-02_pred.nii.gz") + + pred_files, ref_files = get_images(self.pred_dir.name, self.ref_dir.name) + self.assertEqual(len(pred_files), 1) + self.assertEqual(len(ref_files), 1) + if __name__ == '__main__': unittest.main()