From b3f354e9a5162bd8cb6a598042c0bcdf622542f6 Mon Sep 17 00:00:00 2001
From: Jan Valosek <39456460+valosekj@users.noreply.github.com>
Date: Wed, 11 Dec 2024 05:41:29 -0500
Subject: [PATCH] Match GTs and predictions based on BIDS compatible keys
 (#17)

* Match the prediction and reference files based on the participant_id, acq_id, and run_id.

* remove python-app.yml from the original MetricsReloaded repo

* `get_images_in_folder` --> `get_images`

* handle no predictions/GTs

* add unittests to test the newly proposed matching based on participant_id, acq_id, and run_id

* add clarifying comment

* add session-based pairing between GT-pred

* fetch chunk id also

* fix import after changing function name

* update tests with ses_id and chunk_id

---------

Co-authored-by: Naga Karthik
---
 .github/workflows/python-app.yml         |  87 ------------
 compute_metrics_reloaded.py              |  77 +++++++++--
 .../test_pairwise_measures_neuropoly.py  | 127 +++++++++++++++++-
 3 files changed, 191 insertions(+), 100 deletions(-)
 delete mode 100644 .github/workflows/python-app.yml
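Before the file-by-file diff, a quick illustration of the pairing idea this patch introduces (not part of the patch itself; the filenames and the bids_key helper below are hypothetical): predictions and ground truths are paired by their BIDS entities rather than by sort order, so differing suffixes no longer break the matching.

import re

def bids_key(path):
    # Same extraction idea as the patch's fetch_bids_compatible_keys():
    # grab each entity up to the next underscore or slash; missing entities -> ""
    return tuple(
        (m.group(0)[:-1] if (m := re.search(f'{ent}(.*?)[_/]', path)) else "")
        for ent in ('sub-', 'ses-', 'acq-', 'chunk-', 'run-')
    )

# Both files reduce to the same key ('sub-001', 'ses-01', 'acq-sag', 'chunk-1', 'run-01'),
# so they are paired even though their suffixes (and thus their sort order) differ:
print(bids_key("preds/sub-001_ses-01_acq-sag_chunk-1_run-01_pred.nii.gz"))
print(bids_key("gts/sub-001_ses-01_acq-sag_chunk-1_run-01_lesion-manual.nii.gz"))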
diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
deleted file mode 100644
index 27cf48d..0000000
--- a/.github/workflows/python-app.yml
+++ /dev/null
@@ -1,87 +0,0 @@
-# This workflow will install Python dependencies, run tests and lint with a single version of Python
-# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
-
-# This file defines what the workflow consists of, that is, what operations we want to perform and when. The first
-# part names the action, the second states when the action is triggered (on push or on pull request) and on what
-# branches (main and dev in our case).
-
-name: Unit Tests
-
-on:
-  push:
-    branches: [ main ]  # run when anything is pushed to these branches
-  pull_request:
-    branches: [ main ]  # run for the code submitted as a PR to these branches
-
-# jobs are a series of steps which run commands in the chosen virtualized environment to perform some action
-jobs:
-  build:
-
-    runs-on: ubuntu-latest  # run in an Ubuntu VM, so assuming a Unix-like environment for our commands
-
-    steps:
-      # The first step checks out the code into the VM
-      - uses: actions/checkout@v2
-
-      # Set up Python using an existing action "actions/setup-python@v2" from GitHub's library of actions
-      # Arguments are provided to this action using the key-values under "with"
-      - name: Set up Python 3.9
-        uses: actions/setup-python@v2
-        with:
-          python-version: 3.9
-
-      # Install the requirements for this library plus those for running our tests (flake8 and coverage)
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install -r requirements.txt
-          pip install -r requirements-dev.txt
-
-      # Run flake8 to do basic code quality checks; the output will appear in the action log
-      - name: Lint with flake8
-        run: |
-          # stop the build if there are Python syntax errors or undefined names
-          flake8 . --count --exit-zero --select=E9,F63,F7,F82 --show-source --statistics
-          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
-          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics \
-            --per-file-ignores="__init__.py:F401"
-
-      # Run the unit tests using the coverage program and create the XML output file
-      - name: Test with pytest
-        run: |
-          pytest --cov --cov-config=.coveragerc --cov-report=xml -vv
-
-#      # Using Codecov's action, upload the coverage report for the triggering commit/PR
-#      - name: Upload coverage
-#        uses: codecov/codecov-action@v2
-#        with:
-#          file: ./coverage.xml
-#          fail_ci_if_error: true
-#          verbose: true
-#          version: "v0.1.15"
-
-  build_docs:
-    runs-on: ubuntu-latest
-
-    steps:
-      - uses: actions/checkout@v2
-
-      - name: Set up Python 3.9
-        uses: actions/setup-python@v2
-        with:
-          python-version: 3.9
-
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install -r requirements-docs.txt
-
-      - name: Install Metrics Reloaded
-        run: |
-          python -m pip install .
-
-      - name: Build docs
-        run: |
-          cd docs && make html
diff --git a/compute_metrics_reloaded.py b/compute_metrics_reloaded.py
index 05494ae..ebebc1a 100644
--- a/compute_metrics_reloaded.py
+++ b/compute_metrics_reloaded.py
@@ -13,6 +13,7 @@
 python compute_metrics_reloaded.py -reference /path/to/reference -prediction /path/to/prediction
 
+NOTE: The prediction and reference files are matched based on the participant_id, session_id, acq_id, chunk_id, and run_id.
 
 The metrics to be computed can be specified using the `-metrics` argument. For example, to compute only the Dice
 similarity coefficient (DSC) and Normalized surface distance (NSD), use:
@@ -37,6 +38,7 @@
 
 import os
+import re
 import argparse
 import numpy as np
 import nibabel as nib
@@ -103,9 +105,46 @@ def load_nifti_image(file_path):
     return nifti_image.get_fdata()
 
 
-def get_images_in_folder(prediction, reference):
+def fetch_bids_compatible_keys(filename_path, prefix='sub-'):
     """
-    Get all files (predictions and references/ground truths) in the input directories
+    Get participant_id, session_id, acq_id, chunk_id and run_id from the input BIDS-compatible filename or file path.
+    The function works on absolute file paths as well as on bare filenames.
+    :param filename_path: input nifti filename (e.g., sub-001_ses-01_T1w.nii.gz) or file path
+    (e.g., /home/user/bids/sub-001/ses-01/anat/sub-001_ses-01_T1w.nii.gz)
+    :param prefix: prefix of the participant ID in the filename (default: 'sub-')
+    :return: participant_id: participant ID (e.g., sub-001)
+    :return: session_id: session ID (e.g., ses-01)
+    :return: acq_id: acquisition ID (e.g., acq-01)
+    :return: chunk_id: chunk ID (e.g., chunk-1)
+    :return: run_id: run ID (e.g., run-01)
+    """
+
+    participant = re.search(f'{prefix}(.*?)[_/]', filename_path)        # [_/] means either underscore or slash
+    participant_id = participant.group(0)[:-1] if participant else ""   # [:-1] removes the last underscore or slash
+
+    session = re.search('ses-(.*?)[_/]', filename_path)                 # [_/] means either underscore or slash
+    session_id = session.group(0)[:-1] if session else ""               # [:-1] removes the last underscore or slash
+
+    acquisition = re.search('acq-(.*?)[_/]', filename_path)             # [_/] means either underscore or slash
+    acq_id = acquisition.group(0)[:-1] if acquisition else ""           # [:-1] removes the last underscore or slash
+
+    chunk = re.search('chunk-(.*?)[_/]', filename_path)                 # [_/] means either underscore or slash
+    chunk_id = chunk.group(0)[:-1] if chunk else ""                     # [:-1] removes the last underscore or slash
+
+    run = re.search('run-(.*?)[_/]', filename_path)                     # [_/] means either underscore or slash
+    run_id = run.group(0)[:-1] if run else ""                           # [:-1] removes the last underscore or slash
+
+    # REGEX explanation
+    # . - match any character (except newline)
+    # *? - match the previous element as few times as possible (zero or more times)
+
+    return participant_id, session_id, acq_id, chunk_id, run_id
+
+
+def get_images(prediction, reference):
+    """
+    Get all files (predictions and references/ground truths) in the input directories.
+    The prediction and reference files are matched based on the participant_id, session_id, acq_id, chunk_id, and run_id.
     :param prediction: path to the directory with prediction files
     :param reference: path to the directory with reference (ground truth) files
     :return: list of prediction files, list of reference/ground truth files
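As a sanity check on the new helper, here is how it behaves on a couple of hypothetical inputs, a bare filename and a full BIDS path (output shown assumes the patched compute_metrics_reloaded.py is on the import path). Missing entities come back as empty strings, which is what makes optional entities such as ses- and chunk- safe to match on:

from compute_metrics_reloaded import fetch_bids_compatible_keys

print(fetch_bids_compatible_keys("sub-001_ses-01_acq-sag_chunk-1_run-01_T2w.nii.gz"))
# ('sub-001', 'ses-01', 'acq-sag', 'chunk-1', 'run-01')
print(fetch_bids_compatible_keys("/data/bids/sub-002/anat/sub-002_T2w.nii.gz"))
# ('sub-002', '', '', '', '')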
@@ -113,15 +152,29 @@ def get_images_in_folder(prediction, reference):
     # Get all files in the directories
     prediction_files = [os.path.join(prediction, f) for f in os.listdir(prediction) if f.endswith('.nii.gz')]
     reference_files = [os.path.join(reference, f) for f in os.listdir(reference) if f.endswith('.nii.gz')]
-    # Check if the number of files in the directories is the same
-    if len(prediction_files) != len(reference_files):
-        raise ValueError(f'The number of files in the directories is different. '
-                         f'Prediction files: {len(prediction_files)}, Reference files: {len(reference_files)}')
-    print(f'Found {len(prediction_files)} files in the directories.')
-    # Sort the files
-    # NOTE: Hopefully, the files are named in the same order in both directories
-    prediction_files.sort()
-    reference_files.sort()
+
+    if not prediction_files:
+        raise FileNotFoundError(f'No prediction files found in {prediction}.')
+    if not reference_files:
+        raise FileNotFoundError(f'No reference (ground truth) files found in {reference}.')
+
+    # Create dataframe for prediction_files with participant_id, session_id, acq_id, chunk_id, run_id
+    df_pred = pd.DataFrame(prediction_files, columns=['filename'])
+    df_pred['participant_id'], df_pred['session_id'], df_pred['acq_id'], df_pred['chunk_id'], df_pred['run_id'] = zip(*df_pred['filename'].apply(fetch_bids_compatible_keys))
+
+    # Create dataframe for reference_files with participant_id, session_id, acq_id, chunk_id, run_id
+    df_ref = pd.DataFrame(reference_files, columns=['filename'])
+    df_ref['participant_id'], df_ref['session_id'], df_ref['acq_id'], df_ref['chunk_id'], df_ref['run_id'] = zip(*df_ref['filename'].apply(fetch_bids_compatible_keys))
+
+    # Merge the two dataframes on participant_id, session_id, acq_id, chunk_id, run_id
+    df = pd.merge(df_pred, df_ref, on=['participant_id', 'session_id', 'acq_id', 'chunk_id', 'run_id'], how='outer', suffixes=('_pred', '_ref'))
+    # Drop the matching keys, keeping only the filename columns
+    df.drop(['participant_id', 'session_id', 'acq_id', 'chunk_id', 'run_id'], axis=1, inplace=True)
+    # Drop rows with NaN values. In other words, keep only the rows where both prediction and reference files exist
+    df.dropna(inplace=True)
+
+    prediction_files = df['filename_pred'].tolist()
+    reference_files = df['filename_ref'].tolist()
 
     return prediction_files, reference_files
 
@@ -236,7 +289,7 @@ def main():
     # Args.prediction and args.reference are paths to folders with multiple nii.gz files (i.e., MULTIPLE subjects)
     if os.path.isdir(args.prediction) and os.path.isdir(args.reference):
         # Get all files in the directories
-        prediction_files, reference_files = get_images_in_folder(args.prediction, args.reference)
+        prediction_files, reference_files = get_images(args.prediction, args.reference)
 
         # Use multiprocessing to parallelize the computation
         with Pool(args.jobs) as pool:
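The outer merge plus dropna() above is the heart of the pairing. A standalone sketch of the same pattern on toy data (hypothetical filenames, a single key column for brevity) shows why unmatched files are silently dropped rather than raising an error:

import pandas as pd

df_pred = pd.DataFrame({'filename': ['sub-01_pred.nii.gz', 'sub-02_pred.nii.gz'],
                        'participant_id': ['sub-01', 'sub-02']})
df_ref = pd.DataFrame({'filename': ['sub-01_ref.nii.gz'],
                       'participant_id': ['sub-01']})

# The outer merge keeps unmatched rows, with NaN on the side that has no file...
df = pd.merge(df_pred, df_ref, on='participant_id', how='outer', suffixes=('_pred', '_ref'))
# ...and dropna() then removes them, so sub-02 (a prediction without a GT) is excluded.
df.dropna(inplace=True)
print(df[['filename_pred', 'filename_ref']])
#         filename_pred       filename_ref
# 0  sub-01_pred.nii.gz  sub-01_ref.nii.gz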
diff --git a/test/test_metrics/test_pairwise_measures_neuropoly.py b/test/test_metrics/test_pairwise_measures_neuropoly.py
index 60da2be..c82462b 100644
--- a/test/test_metrics/test_pairwise_measures_neuropoly.py
+++ b/test/test_metrics/test_pairwise_measures_neuropoly.py
@@ -13,7 +13,7 @@
 import os
 import numpy as np
 import nibabel as nib
-from compute_metrics_reloaded import compute_metrics_single_subject
+from compute_metrics_reloaded import compute_metrics_single_subject, get_images, fetch_bids_compatible_keys
 import tempfile
 
 METRICS = ['dsc', 'fbeta', 'nsd', 'vol_diff', 'rel_vol_error', 'lesion_ppv', 'lesion_sensitivity', 'lesion_f1_score',
@@ -358,6 +358,131 @@ def test_non_empty_ref_and_pred_with_full_overlap(self):
         # Assert metrics
         self.assert_metrics(metrics_dict, expected_metrics)
 
+
+class TestGetImages(unittest.TestCase):
+    def setUp(self):
+        """
+        Create temporary directories and files for testing.
+        """
+        self.pred_dir = tempfile.TemporaryDirectory()
+        self.ref_dir = tempfile.TemporaryDirectory()
+
+    def tearDown(self):
+        """
+        Clean up temporary directories and files after tests.
+        """
+        self.pred_dir.cleanup()
+        self.ref_dir.cleanup()
+
+    def create_temp_file(self, directory, filename):
+        """
+        Create a temporary file in the given directory with the specified filename.
+        """
+        file_path = os.path.join(directory, filename)
+        with open(file_path, 'w') as f:
+            f.write('dummy content')
+        return file_path
+
+    def test_matching_files(self):
+        """
+        Test matching files based on participant_id, session_id, acq_id, chunk_id, and run_id.
+        """
+        self.create_temp_file(self.pred_dir.name, "sub-01_ses-01_acq-01_chunk-1_run-01_pred.nii.gz")
+        self.create_temp_file(self.ref_dir.name, "sub-01_ses-01_acq-01_chunk-1_run-01_ref.nii.gz")
+
+        pred_files, ref_files = get_images(self.pred_dir.name, self.ref_dir.name)
+        self.assertEqual(len(pred_files), 1)
+        self.assertEqual(len(ref_files), 1)
+
+    def test_mismatched_files(self):
+        """
+        Test when no files match based on the matching keys.
+        """
+        self.create_temp_file(self.pred_dir.name, "sub-01_ses-01_acq-01_chunk-1_run-01_pred.nii.gz")
+        self.create_temp_file(self.ref_dir.name, "sub-02_ses-01_acq-02_chunk-1_run-02_ref.nii.gz")
+
+        pred_files, ref_files = get_images(self.pred_dir.name, self.ref_dir.name)
+        self.assertEqual(len(pred_files), 0)
+        self.assertEqual(len(ref_files), 0)
+
+ """ + self.create_temp_file(self.pred_dir.name, "sub-01_acq-01_chunk-1_run-01_pred.nii.gz") + self.create_temp_file(self.ref_dir.name, "sub-01_acq-01_chunk-1_run-01_ref.nii.gz") + + pred_files, ref_files = get_images(self.pred_dir.name, self.ref_dir.name) + self.assertEqual(len(pred_files), 1) + self.assertEqual(len(ref_files), 1) + self.assertIn("sub-01_acq-01_chunk-1_run-01_pred.nii.gz", pred_files[0]) + self.assertIn("sub-01_acq-01_chunk-1_run-01_ref.nii.gz", ref_files[0]) + + def test_acq_id_empty(self): + """ + Test when acq_id is empty. + """ + self.create_temp_file(self.pred_dir.name, "sub-01_ses-01_chunk-1_run-01_pred.nii.gz") + self.create_temp_file(self.ref_dir.name, "sub-01_ses-01_chunk-1_run-01_ref.nii.gz") + + pred_files, ref_files = get_images(self.pred_dir.name, self.ref_dir.name) + self.assertEqual(len(pred_files), 1) + self.assertEqual(len(ref_files), 1) + self.assertIn("sub-01_ses-01_chunk-1_run-01_pred.nii.gz", pred_files[0]) + self.assertIn("sub-01_ses-01_chunk-1_run-01_ref.nii.gz", ref_files[0]) + + def test_chunk_id_empty(self): + """ + Test when chunk_id is empty in the filenames. + """ + self.create_temp_file(self.pred_dir.name, "sub-01_ses-01_acq-01_run-01_pred.nii.gz") + self.create_temp_file(self.ref_dir.name, "sub-01_ses-01_acq-01_run-01_ref.nii.gz") + + pred_files, ref_files = get_images(self.pred_dir.name, self.ref_dir.name) + + # Assert the matched files + self.assertEqual(len(pred_files), 1) + self.assertEqual(len(ref_files), 1) + self.assertIn("sub-01_ses-01_acq-01_run-01_pred.nii.gz", pred_files[0]) + self.assertIn("sub-01_ses-01_acq-01_run-01_ref.nii.gz", ref_files[0]) + + def test_run_id_empty(self): + """ + Test when run_id is empty in the filenames. + """ + self.create_temp_file(self.pred_dir.name, "sub-01_ses-01_acq-01_chunk-1_pred.nii.gz") + self.create_temp_file(self.ref_dir.name, "sub-01_ses-01_acq-01_chunk-1_ref.nii.gz") + + pred_files, ref_files = get_images(self.pred_dir.name, self.ref_dir.name) + + # Assert the matched files + self.assertEqual(len(pred_files), 1) + self.assertEqual(len(ref_files), 1) + self.assertIn("sub-01_ses-01_acq-01_chunk-1_pred.nii.gz", pred_files[0]) + self.assertIn("sub-01_ses-01_acq-01_chunk-1_ref.nii.gz", ref_files[0]) + + def test_no_files(self): + """ + Test when there are no files in the directories. + Ensure that FileNotFoundError is raised. + """ + with self.assertRaises(FileNotFoundError) as context: + get_images(self.pred_dir.name, self.ref_dir.name) + # Check the exception message + self.assertIn(f'No prediction files found in {self.pred_dir.name}', str(context.exception)) + + def test_partial_matching(self): + """ + Test when some files match and some do not. + """ + self.create_temp_file(self.pred_dir.name, "sub-01_acq-01_run-01_pred.nii.gz") + self.create_temp_file(self.ref_dir.name, "sub-01_acq-01_run-01_ref.nii.gz") + # The following file will not be included in the lists below as there is no matching reference (GT) file + self.create_temp_file(self.pred_dir.name, "sub-02_acq-02_run-02_pred.nii.gz") + + pred_files, ref_files = get_images(self.pred_dir.name, self.ref_dir.name) + self.assertEqual(len(pred_files), 1) + self.assertEqual(len(ref_files), 1) + if __name__ == '__main__': unittest.main()