address comments from PacBio dev branch (#3575)(patch)

## Description Address last comments of #3453 regarding PacBio post-processing
Clinical-Genomics · Sep 2, 2024 · 320a625 · 320a625
1 parent 7caf7fb
commit 320a625
Show file tree

Hide file tree

Showing 11 changed files with 63 additions and 106 deletions.
diff --git a/cg/cli/post_process/post_process.py b/cg/cli/post_process/post_process.py
@@ -35,5 +35,4 @@ def post_process_sequencing_run(context: CGConfig, run_name: str, dry_run: bool)
     post_processing_service.post_process(run_name=run_name, dry_run=dry_run)
 
 
-post_process_group: click.Group
 post_process_group.add_command(post_process_sequencing_run)
diff --git a/cg/services/run_devices/exc.py b/cg/services/run_devices/exc.py
@@ -2,28 +2,42 @@
 
 
 class PostProcessingRunFileManagerError(CgError):
+    """Error raised if something goes wrong managing the sequencing run files."""
+
     pass
 
 
 class PostProcessingRunDataGeneratorError(CgError):
+    """Error raised if something goes wrong parsing the run directory data."""
+
     pass
 
 
 class PostProcessingParsingError(CgError):
+    """Error raised if something goes wrong parsing the sequencing run metrics."""
+
     pass
 
 
 class PostProcessingDataTransferError(CgError):
+    """Error raised if something goes wrong creating the DTOs for post-processing."""
+
     pass
 
 
 class PostProcessingStoreDataError(CgError):
+    """Error raised if something goes wrong storing the post-processing data in StatusDB."""
+
     pass
 
 
 class PostProcessingStoreFileError(CgError):
+    """Error raised if something goes wrong storing the post-processing files in Housekeeper."""
+
     pass
 
 
 class PostProcessingError(CgError):
+    """Error raised if something goes wrong during post-processing."""
+
     pass
diff --git a/cg/services/run_devices/pacbio/run_data_generator/pacbio_run_data_generator.py b/cg/services/run_devices/pacbio/run_data_generator/pacbio_run_data_generator.py
@@ -1,16 +1,10 @@
 from pathlib import Path
 
 from cg.services.run_devices.abstract_classes import RunDataGenerator
-from cg.services.run_devices.error_handler import (
-    handle_post_processing_errors,
-)
+from cg.services.run_devices.error_handler import handle_post_processing_errors
 from cg.services.run_devices.exc import PostProcessingRunDataGeneratorError
 from cg.services.run_devices.pacbio.run_data_generator.run_data import PacBioRunData
-from cg.services.run_devices.validators import (
-    validate_has_expected_parts,
-    validate_name_pre_fix,
-)
-from cg.utils.string import get_element_from_split
+from cg.services.run_devices.validators import validate_has_expected_parts, validate_name_pre_fix
 
 
 class PacBioRunDataGenerator(RunDataGenerator):
@@ -40,16 +34,16 @@ def get_run_data(self, run_name: str, sequencing_dir: str) -> PacBioRunData:
 
     @staticmethod
     def _get_sequencing_run_name(run_name: str) -> str:
-        return get_element_from_split(value=run_name, element_position=0, split="/")
+        return run_name.split("/")[0]
 
     @staticmethod
     def _get_plate_well(run_name: str) -> str:
-        return get_element_from_split(value=run_name, element_position=-1, split="/")
+        return run_name.split("/")[1]
 
     def _get_plate(self, run_name: str) -> str:
         plate_well: str = self._get_plate_well(run_name)
-        return get_element_from_split(value=plate_well, element_position=0, split="_")
+        return plate_well.split("_")[0]
 
     def _get_well(self, run_name: str) -> str:
         plate_well: str = self._get_plate_well(run_name)
-        return get_element_from_split(value=plate_well, element_position=-1, split="_")
+        return plate_well.split("_")[-1]
diff --git a/cg/services/run_devices/pacbio/run_file_manager/run_file_manager.py b/cg/services/run_devices/pacbio/run_file_manager/run_file_manager.py
@@ -19,19 +19,15 @@ def get_files_to_parse(self, run_data: PacBioRunData) -> list[Path]:
         """Get the file paths required by the PacBioMetricsParser."""
         run_path: Path = run_data.full_path
         validate_files_or_directories_exist([run_path])
-        files_to_parse: list[Path] = self._get_report_files(run_path)
-        files_to_parse.append(self._get_ccs_report_file(run_path))
-        return files_to_parse
+        return self._get_report_files(run_path)
 
     @handle_post_processing_errors(
         to_except=(FileNotFoundError,), to_raise=PostProcessingRunFileManagerError
     )
     def get_files_to_store(self, run_data: PacBioRunData) -> list[Path]:
         """Get the files to store for the PostProcessingHKService."""
         run_path: Path = run_data.full_path
-        files_to_store: list[Path] = self.get_files_to_parse(run_data)
-        files_to_store.append(self._get_hifi_read_file(run_path))
-        return files_to_store
+        return self.get_files_to_parse(run_data) + self._get_hifi_read_files(run_path)
 
     @staticmethod
     def _get_ccs_report_file(run_path: Path) -> Path:
@@ -44,8 +40,7 @@ def _get_ccs_report_file(run_path: Path) -> Path:
             raise FileNotFoundError(f"No CCS report file found in {statistics_dir}")
         return files[0]
 
-    @staticmethod
-    def _get_report_files(run_path: Path) -> list[Path]:
+    def _get_report_files(self, run_path: Path) -> list[Path]:
         """Return the paths to the unzipped report files."""
         unzipped_dir: Path = Path(
             run_path, PacBioDirsAndFiles.STATISTICS_DIR, PacBioDirsAndFiles.UNZIPPED_REPORTS_DIR
@@ -55,16 +50,17 @@ def _get_report_files(run_path: Path) -> list[Path]:
             Path(unzipped_dir, PacBioDirsAndFiles.LOADING_REPORT),
             Path(unzipped_dir, PacBioDirsAndFiles.RAW_DATA_REPORT),
             Path(unzipped_dir, PacBioDirsAndFiles.SMRTLINK_DATASETS_REPORT),
+            self._get_ccs_report_file(run_path),
         ]
         validate_files_or_directories_exist(report_files)
         return report_files
 
     @staticmethod
-    def _get_hifi_read_file(run_path: Path) -> Path:
+    def _get_hifi_read_files(run_path: Path) -> list[Path]:
         """Return the path to the HiFi read file."""
         hifi_dir = Path(run_path, PacBioDirsAndFiles.HIFI_READS)
-        bam_file: Path = get_files_matching_pattern(
+        bam_files: list[Path] = get_files_matching_pattern(
             directory=hifi_dir, pattern=f"*{FileExtensions.BAM}"
-        )[0]
-        validate_files_or_directories_exist([bam_file])
-        return bam_file
+        )
+        validate_files_or_directories_exist(bam_files)
+        return bam_files
diff --git a/cg/utils/string.py b/cg/utils/string.py
diff --git a/...Tcells/r84202_20240522_133539/1_B01/hifi_reads/m84202_240522_155607_s2.hifi_reads.bam.pbi b/...Tcells/r84202_20240522_133539/1_B01/hifi_reads/m84202_240522_155607_s2.hifi_reads.bam.pbi
diff --git a/tests/services/run_devices/pacbio/post_processing/test_post_processing.py b/tests/services/run_devices/pacbio/post_processing/test_post_processing.py
@@ -37,7 +37,7 @@ def test_pac_bio_post_processing_run_name_error(pac_bio_context):
 def test_pac_bio_post_processing_store_data_error(
     pac_bio_context: CGConfig, pac_bio_sequencing_run_name: str
 ):
-    # GIVEN a PacBioPostProcessingService and a wrong run name
+    # GIVEN a PacBioPostProcessingService that raises an error when storing data in StatusDB
 
     post_processing_service: PacBioPostProcessingService = (
         pac_bio_context.post_processing_services.pacbio
@@ -55,7 +55,7 @@ def test_pac_bio_post_processing_store_data_error(
 def test_pac_bio_post_processing_store_files_error(
     pac_bio_context: CGConfig, pac_bio_sequencing_run_name: str
 ):
-    # GIVEN a PacBioPostProcessingService
+    # GIVEN a PacBioPostProcessingService that raises an error when storing files in Housekeeper
     post_processing_service: PacBioPostProcessingService = (
         pac_bio_context.post_processing_services.pacbio
     )

diff --git a/tests/services/run_devices/pacbio/run_data_generator/test_pacbio_run_data_generator.py b/tests/services/run_devices/pacbio/run_data_generator/test_pacbio_run_data_generator.py
@@ -12,6 +12,7 @@
 
 
 def test_get_run_data(
+    pac_bio_run_data_generator: PacBioRunDataGenerator,
     pac_bio_runs_dir: Path,
     pac_bio_test_run_name: str,
     pac_bio_smrt_cell_name: str,
@@ -21,27 +22,26 @@ def test_get_run_data(
     run_name: str = "/".join([pac_bio_test_run_name, pac_bio_smrt_cell_name])
 
     # WHEN Generating run data
-    run_data_generator = PacBioRunDataGenerator()
-    run_data: PacBioRunData = run_data_generator.get_run_data(
+    run_data: PacBioRunData = pac_bio_run_data_generator.get_run_data(
         run_name=run_name, sequencing_dir=pac_bio_runs_dir.as_posix()
     )
 
     # THEN the correct run data are returned
     assert run_data == expected_pac_bio_run_data
 
 
-@pytest.mark.parametrize("run_name", ["rimproper_name", "d_improper_name "])
+@pytest.mark.parametrize("wrong_run_name", ["rimproper_name", "d_improper_name "])
 def test_get_run_data_improper_name(
+    pac_bio_run_data_generator: PacBioRunDataGenerator,
     pac_bio_runs_dir: Path,
-    run_name: str,
+    wrong_run_name: str,
 ):
-    # GIVEN a PacBioRunDataGenerator and an improper run name
-    run_data_generator = PacBioRunDataGenerator()
+    # GIVEN a PacBioRunDataGenerator and a wrong run name
 
-    # WHEN Generating run data
+    # WHEN Generating run data with the wrong run name
 
     # THEN an PostProcessingRunDataGeneratorError is raised
     with pytest.raises(PostProcessingRunDataGeneratorError):
-        run_data_generator.get_run_data(
-            run_name=run_name, sequencing_dir=pac_bio_runs_dir.as_posix()
+        pac_bio_run_data_generator.get_run_data(
+            run_name=wrong_run_name, sequencing_dir=pac_bio_runs_dir.as_posix()
         )
diff --git a/tests/services/run_devices/pacbio/run_file_manager/test_pacbio_run_file_manager.py b/tests/services/run_devices/pacbio/run_file_manager/test_pacbio_run_file_manager.py
@@ -1,42 +1,37 @@
+from pathlib import Path
 from unittest import mock
 
 import pytest
-from pathlib import Path
 
 from cg.services.run_devices.exc import PostProcessingRunFileManagerError
 from cg.services.run_devices.pacbio.run_data_generator.run_data import PacBioRunData
-from cg.services.run_devices.pacbio.run_file_manager.run_file_manager import (
-    PacBioRunFileManager,
-)
+from cg.services.run_devices.pacbio.run_file_manager.run_file_manager import PacBioRunFileManager
 
 
 def test_get_files_to_parse(
-    expected_pac_bio_run_data: PacBioRunData, pac_bio_report_files_to_parse: list[Path]
+    expected_pac_bio_run_data: PacBioRunData,
+    pac_bio_report_files_to_parse: list[Path],
+    pac_bio_run_file_manager: PacBioRunFileManager,
 ):
-    # GIVEN a run data object
-
-    # GIVEN a PacBio run file manager
-    file_manager = PacBioRunFileManager()
+    # GIVEN a run data object and a PacBio run file manager
 
     # WHEN getting the files to parse
-    files: list[Path] = file_manager.get_files_to_parse(expected_pac_bio_run_data)
+    files: list[Path] = pac_bio_run_file_manager.get_files_to_parse(expected_pac_bio_run_data)
 
     # THEN the correct files are returned
     assert files == pac_bio_report_files_to_parse
 
 
 def test_get_files_to_store(
     expected_pac_bio_run_data: PacBioRunData,
+    pac_bio_run_file_manager: PacBioRunFileManager,
     pac_bio_report_files_to_parse: list[Path],
     pac_bio_hifi_read_file: Path,
 ):
-    # GIVEN a run data object
-
-    # GIVEN a PacBio run file manager
-    file_manager = PacBioRunFileManager()
+    # GIVEN a run data object and a PacBio file manager
 
     # WHEN getting the files to store
-    files: list[Path] = file_manager.get_files_to_store(expected_pac_bio_run_data)
+    files: list[Path] = pac_bio_run_file_manager.get_files_to_store(expected_pac_bio_run_data)
 
     # THEN the correct files are returned
     full_list: list[Path] = pac_bio_report_files_to_parse + [pac_bio_hifi_read_file]
@@ -45,37 +40,37 @@ def test_get_files_to_store(
 
 def test_get_files_to_store_error(
     expected_pac_bio_run_data: PacBioRunData,
+    pac_bio_run_file_manager: PacBioRunFileManager,
 ):
     # GIVEN a run data object
 
-    # GIVEN a PacBio run file manager
-    file_manager = PacBioRunFileManager()
+    # GIVEN a PacBio run file manager that can't find the HiFi read file
     with mock.patch.object(
-        file_manager,
-        attribute="_get_hifi_read_file",
+        pac_bio_run_file_manager,
+        attribute="_get_hifi_read_files",
         side_effect=FileNotFoundError,
     ):
         # WHEN getting the files to store
 
         # THEN an PostProcessingRunFileManagerError is raised
         with pytest.raises(PostProcessingRunFileManagerError):
-            file_manager.get_files_to_store(expected_pac_bio_run_data)
+            pac_bio_run_file_manager.get_files_to_store(expected_pac_bio_run_data)
 
 
 def test_get_files_to_parse_error(
     expected_pac_bio_run_data: PacBioRunData,
+    pac_bio_run_file_manager: PacBioRunFileManager,
 ):
     # GIVEN a run data object
 
-    # GIVEN a PacBio run file manager
-    file_manager = PacBioRunFileManager()
+    # GIVEN a PacBio run file manager that can't find the CCS report file
     with mock.patch.object(
-        file_manager,
+        pac_bio_run_file_manager,
         attribute="_get_ccs_report_file",
         side_effect=FileNotFoundError,
     ):
         # WHEN getting the files to store
 
         # THEN an PostProcessingRunFileManagerError is raised
         with pytest.raises(PostProcessingRunFileManagerError):
-            file_manager.get_files_to_parse(expected_pac_bio_run_data)
+            pac_bio_run_file_manager.get_files_to_parse(expected_pac_bio_run_data)
diff --git a/tests/services/run_devices/pacbio/store_service/test_store_service.py b/tests/services/run_devices/pacbio/store_service/test_store_service.py
@@ -5,8 +5,8 @@
 import pytest
 
 from cg.services.run_devices.exc import (
-    PostProcessingStoreDataError,
     PostProcessingDataTransferError,
+    PostProcessingStoreDataError,
 )
 from cg.services.run_devices.pacbio.data_storage_service.pacbio_store_service import (
     PacBioStoreService,
@@ -17,7 +17,6 @@
 from cg.services.run_devices.pacbio.data_transfer_service.dto import PacBioDTOs
 from cg.services.run_devices.pacbio.run_data_generator.run_data import PacBioRunData
 from cg.store.models import PacBioSampleSequencingMetrics, PacBioSequencingRun, PacBioSMRTCell
-
 from cg.store.store import Store
 
 
@@ -28,7 +27,7 @@ def test_store_post_processing_data(
 ):
     # GIVEN a PacBioStoreService
 
-    # GIVEN a successful data transfer service
+    # GIVEN a data transfer service that returns the correct DTOs
 
     # WHEN storing data for a PacBio instrument run
     with mock.patch(
@@ -67,7 +66,7 @@ def test_store_post_processing_data_error_database(
 ):
     # GIVEN a PacBioStoreService
 
-    # GIVEN a successful data transfer service
+    # GIVEN a data transfer service that returns the correct DTOs
 
     # WHEN storing data for a PacBio instrument run
     with mock.patch(
@@ -85,7 +84,7 @@ def test_store_post_processing_data_error_parser(
 ):
     # GIVEN a PacBioStoreService
 
-    # GIVEN a successful data transfer service
+    # GIVEN a data transfer service that returns the correct DTOs
 
     # WHEN storing data for a PacBio instrument run
     with mock.patch(

diff --git a/tests/utils/test_string_utils.py b/tests/utils/test_string_utils.py
Original file line number	Diff line number	Diff line change
Expand Up		@@ -35,5 +35,4 @@ def post_process_sequencing_run(context: CGConfig, run_name: str, dry_run: bool)
		post_processing_service.post_process(run_name=run_name, dry_run=dry_run)


		post_process_group: click.Group
		post_process_group.add_command(post_process_sequencing_run)