Skip to content

Commit

Permalink
address comments from PacBio dev branch (#3575)(patch)
Browse files Browse the repository at this point in the history
## Description
Address last comments of #3453 regarding PacBio post-processing
  • Loading branch information
diitaz93 authored Sep 2, 2024
1 parent 7caf7fb commit 320a625
Show file tree
Hide file tree
Showing 11 changed files with 63 additions and 106 deletions.
1 change: 0 additions & 1 deletion cg/cli/post_process/post_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,5 +35,4 @@ def post_process_sequencing_run(context: CGConfig, run_name: str, dry_run: bool)
post_processing_service.post_process(run_name=run_name, dry_run=dry_run)


post_process_group: click.Group
post_process_group.add_command(post_process_sequencing_run)
14 changes: 14 additions & 0 deletions cg/services/run_devices/exc.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,28 +2,42 @@


class PostProcessingRunFileManagerError(CgError):
"""Error raised if something goes wrong managing the sequencing run files."""

pass


class PostProcessingRunDataGeneratorError(CgError):
"""Error raised if something goes wrong parsing the run directory data."""

pass


class PostProcessingParsingError(CgError):
"""Error raised if something goes wrong parsing the sequencing run metrics."""

pass


class PostProcessingDataTransferError(CgError):
"""Error raised if something goes wrong creating the DTOs for post-processing."""

pass


class PostProcessingStoreDataError(CgError):
"""Error raised if something goes wrong storing the post-processing data in StatusDB."""

pass


class PostProcessingStoreFileError(CgError):
"""Error raised if something goes wrong storing the post-processing files in Housekeeper."""

pass


class PostProcessingError(CgError):
"""Error raised if something goes wrong during post-processing."""

pass
Original file line number Diff line number Diff line change
@@ -1,16 +1,10 @@
from pathlib import Path

from cg.services.run_devices.abstract_classes import RunDataGenerator
from cg.services.run_devices.error_handler import (
handle_post_processing_errors,
)
from cg.services.run_devices.error_handler import handle_post_processing_errors
from cg.services.run_devices.exc import PostProcessingRunDataGeneratorError
from cg.services.run_devices.pacbio.run_data_generator.run_data import PacBioRunData
from cg.services.run_devices.validators import (
validate_has_expected_parts,
validate_name_pre_fix,
)
from cg.utils.string import get_element_from_split
from cg.services.run_devices.validators import validate_has_expected_parts, validate_name_pre_fix


class PacBioRunDataGenerator(RunDataGenerator):
Expand Down Expand Up @@ -40,16 +34,16 @@ def get_run_data(self, run_name: str, sequencing_dir: str) -> PacBioRunData:

@staticmethod
def _get_sequencing_run_name(run_name: str) -> str:
return get_element_from_split(value=run_name, element_position=0, split="/")
return run_name.split("/")[0]

@staticmethod
def _get_plate_well(run_name: str) -> str:
return get_element_from_split(value=run_name, element_position=-1, split="/")
return run_name.split("/")[1]

def _get_plate(self, run_name: str) -> str:
plate_well: str = self._get_plate_well(run_name)
return get_element_from_split(value=plate_well, element_position=0, split="_")
return plate_well.split("_")[0]

def _get_well(self, run_name: str) -> str:
plate_well: str = self._get_plate_well(run_name)
return get_element_from_split(value=plate_well, element_position=-1, split="_")
return plate_well.split("_")[-1]
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,15 @@ def get_files_to_parse(self, run_data: PacBioRunData) -> list[Path]:
"""Get the file paths required by the PacBioMetricsParser."""
run_path: Path = run_data.full_path
validate_files_or_directories_exist([run_path])
files_to_parse: list[Path] = self._get_report_files(run_path)
files_to_parse.append(self._get_ccs_report_file(run_path))
return files_to_parse
return self._get_report_files(run_path)

@handle_post_processing_errors(
to_except=(FileNotFoundError,), to_raise=PostProcessingRunFileManagerError
)
def get_files_to_store(self, run_data: PacBioRunData) -> list[Path]:
"""Get the files to store for the PostProcessingHKService."""
run_path: Path = run_data.full_path
files_to_store: list[Path] = self.get_files_to_parse(run_data)
files_to_store.append(self._get_hifi_read_file(run_path))
return files_to_store
return self.get_files_to_parse(run_data) + self._get_hifi_read_files(run_path)

@staticmethod
def _get_ccs_report_file(run_path: Path) -> Path:
Expand All @@ -44,8 +40,7 @@ def _get_ccs_report_file(run_path: Path) -> Path:
raise FileNotFoundError(f"No CCS report file found in {statistics_dir}")
return files[0]

@staticmethod
def _get_report_files(run_path: Path) -> list[Path]:
def _get_report_files(self, run_path: Path) -> list[Path]:
"""Return the paths to the unzipped report files."""
unzipped_dir: Path = Path(
run_path, PacBioDirsAndFiles.STATISTICS_DIR, PacBioDirsAndFiles.UNZIPPED_REPORTS_DIR
Expand All @@ -55,16 +50,17 @@ def _get_report_files(run_path: Path) -> list[Path]:
Path(unzipped_dir, PacBioDirsAndFiles.LOADING_REPORT),
Path(unzipped_dir, PacBioDirsAndFiles.RAW_DATA_REPORT),
Path(unzipped_dir, PacBioDirsAndFiles.SMRTLINK_DATASETS_REPORT),
self._get_ccs_report_file(run_path),
]
validate_files_or_directories_exist(report_files)
return report_files

@staticmethod
def _get_hifi_read_file(run_path: Path) -> Path:
def _get_hifi_read_files(run_path: Path) -> list[Path]:
"""Return the path to the HiFi read file."""
hifi_dir = Path(run_path, PacBioDirsAndFiles.HIFI_READS)
bam_file: Path = get_files_matching_pattern(
bam_files: list[Path] = get_files_matching_pattern(
directory=hifi_dir, pattern=f"*{FileExtensions.BAM}"
)[0]
validate_files_or_directories_exist([bam_file])
return bam_file
)
validate_files_or_directories_exist(bam_files)
return bam_files
10 changes: 0 additions & 10 deletions cg/utils/string.py

This file was deleted.

Empty file.
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def test_pac_bio_post_processing_run_name_error(pac_bio_context):
def test_pac_bio_post_processing_store_data_error(
pac_bio_context: CGConfig, pac_bio_sequencing_run_name: str
):
# GIVEN a PacBioPostProcessingService and a wrong run name
# GIVEN a PacBioPostProcessingService that raises an error when storing data in StatusDB

post_processing_service: PacBioPostProcessingService = (
pac_bio_context.post_processing_services.pacbio
Expand All @@ -55,7 +55,7 @@ def test_pac_bio_post_processing_store_data_error(
def test_pac_bio_post_processing_store_files_error(
pac_bio_context: CGConfig, pac_bio_sequencing_run_name: str
):
# GIVEN a PacBioPostProcessingService
# GIVEN a PacBioPostProcessingService that raises an error when storing files in Housekeeper
post_processing_service: PacBioPostProcessingService = (
pac_bio_context.post_processing_services.pacbio
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@


def test_get_run_data(
pac_bio_run_data_generator: PacBioRunDataGenerator,
pac_bio_runs_dir: Path,
pac_bio_test_run_name: str,
pac_bio_smrt_cell_name: str,
Expand All @@ -21,27 +22,26 @@ def test_get_run_data(
run_name: str = "/".join([pac_bio_test_run_name, pac_bio_smrt_cell_name])

# WHEN Generating run data
run_data_generator = PacBioRunDataGenerator()
run_data: PacBioRunData = run_data_generator.get_run_data(
run_data: PacBioRunData = pac_bio_run_data_generator.get_run_data(
run_name=run_name, sequencing_dir=pac_bio_runs_dir.as_posix()
)

# THEN the correct run data are returned
assert run_data == expected_pac_bio_run_data


@pytest.mark.parametrize("run_name", ["rimproper_name", "d_improper_name "])
@pytest.mark.parametrize("wrong_run_name", ["rimproper_name", "d_improper_name "])
def test_get_run_data_improper_name(
pac_bio_run_data_generator: PacBioRunDataGenerator,
pac_bio_runs_dir: Path,
run_name: str,
wrong_run_name: str,
):
# GIVEN a PacBioRunDataGenerator and an improper run name
run_data_generator = PacBioRunDataGenerator()
# GIVEN a PacBioRunDataGenerator and a wrong run name

# WHEN Generating run data
# WHEN Generating run data with the wrong run name

# THEN an PostProcessingRunDataGeneratorError is raised
with pytest.raises(PostProcessingRunDataGeneratorError):
run_data_generator.get_run_data(
run_name=run_name, sequencing_dir=pac_bio_runs_dir.as_posix()
pac_bio_run_data_generator.get_run_data(
run_name=wrong_run_name, sequencing_dir=pac_bio_runs_dir.as_posix()
)
Original file line number Diff line number Diff line change
@@ -1,42 +1,37 @@
from pathlib import Path
from unittest import mock

import pytest
from pathlib import Path

from cg.services.run_devices.exc import PostProcessingRunFileManagerError
from cg.services.run_devices.pacbio.run_data_generator.run_data import PacBioRunData
from cg.services.run_devices.pacbio.run_file_manager.run_file_manager import (
PacBioRunFileManager,
)
from cg.services.run_devices.pacbio.run_file_manager.run_file_manager import PacBioRunFileManager


def test_get_files_to_parse(
expected_pac_bio_run_data: PacBioRunData, pac_bio_report_files_to_parse: list[Path]
expected_pac_bio_run_data: PacBioRunData,
pac_bio_report_files_to_parse: list[Path],
pac_bio_run_file_manager: PacBioRunFileManager,
):
# GIVEN a run data object

# GIVEN a PacBio run file manager
file_manager = PacBioRunFileManager()
# GIVEN a run data object and a PacBio run file manager

# WHEN getting the files to parse
files: list[Path] = file_manager.get_files_to_parse(expected_pac_bio_run_data)
files: list[Path] = pac_bio_run_file_manager.get_files_to_parse(expected_pac_bio_run_data)

# THEN the correct files are returned
assert files == pac_bio_report_files_to_parse


def test_get_files_to_store(
expected_pac_bio_run_data: PacBioRunData,
pac_bio_run_file_manager: PacBioRunFileManager,
pac_bio_report_files_to_parse: list[Path],
pac_bio_hifi_read_file: Path,
):
# GIVEN a run data object

# GIVEN a PacBio run file manager
file_manager = PacBioRunFileManager()
# GIVEN a run data object and a PacBio file manager

# WHEN getting the files to store
files: list[Path] = file_manager.get_files_to_store(expected_pac_bio_run_data)
files: list[Path] = pac_bio_run_file_manager.get_files_to_store(expected_pac_bio_run_data)

# THEN the correct files are returned
full_list: list[Path] = pac_bio_report_files_to_parse + [pac_bio_hifi_read_file]
Expand All @@ -45,37 +40,37 @@ def test_get_files_to_store(

def test_get_files_to_store_error(
expected_pac_bio_run_data: PacBioRunData,
pac_bio_run_file_manager: PacBioRunFileManager,
):
# GIVEN a run data object

# GIVEN a PacBio run file manager
file_manager = PacBioRunFileManager()
# GIVEN a PacBio run file manager that can't find the HiFi read file
with mock.patch.object(
file_manager,
attribute="_get_hifi_read_file",
pac_bio_run_file_manager,
attribute="_get_hifi_read_files",
side_effect=FileNotFoundError,
):
# WHEN getting the files to store

# THEN an PostProcessingRunFileManagerError is raised
with pytest.raises(PostProcessingRunFileManagerError):
file_manager.get_files_to_store(expected_pac_bio_run_data)
pac_bio_run_file_manager.get_files_to_store(expected_pac_bio_run_data)


def test_get_files_to_parse_error(
expected_pac_bio_run_data: PacBioRunData,
pac_bio_run_file_manager: PacBioRunFileManager,
):
# GIVEN a run data object

# GIVEN a PacBio run file manager
file_manager = PacBioRunFileManager()
# GIVEN a PacBio run file manager that can't find the CCS report file
with mock.patch.object(
file_manager,
pac_bio_run_file_manager,
attribute="_get_ccs_report_file",
side_effect=FileNotFoundError,
):
# WHEN getting the files to store

# THEN an PostProcessingRunFileManagerError is raised
with pytest.raises(PostProcessingRunFileManagerError):
file_manager.get_files_to_parse(expected_pac_bio_run_data)
pac_bio_run_file_manager.get_files_to_parse(expected_pac_bio_run_data)
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
import pytest

from cg.services.run_devices.exc import (
PostProcessingStoreDataError,
PostProcessingDataTransferError,
PostProcessingStoreDataError,
)
from cg.services.run_devices.pacbio.data_storage_service.pacbio_store_service import (
PacBioStoreService,
Expand All @@ -17,7 +17,6 @@
from cg.services.run_devices.pacbio.data_transfer_service.dto import PacBioDTOs
from cg.services.run_devices.pacbio.run_data_generator.run_data import PacBioRunData
from cg.store.models import PacBioSampleSequencingMetrics, PacBioSequencingRun, PacBioSMRTCell

from cg.store.store import Store


Expand All @@ -28,7 +27,7 @@ def test_store_post_processing_data(
):
# GIVEN a PacBioStoreService

# GIVEN a successful data transfer service
# GIVEN a data transfer service that returns the correct DTOs

# WHEN storing data for a PacBio instrument run
with mock.patch(
Expand Down Expand Up @@ -67,7 +66,7 @@ def test_store_post_processing_data_error_database(
):
# GIVEN a PacBioStoreService

# GIVEN a successful data transfer service
# GIVEN a data transfer service that returns the correct DTOs

# WHEN storing data for a PacBio instrument run
with mock.patch(
Expand All @@ -85,7 +84,7 @@ def test_store_post_processing_data_error_parser(
):
# GIVEN a PacBioStoreService

# GIVEN a successful data transfer service
# GIVEN a data transfer service that returns the correct DTOs

# WHEN storing data for a PacBio instrument run
with mock.patch(
Expand Down
30 changes: 0 additions & 30 deletions tests/utils/test_string_utils.py

This file was deleted.

0 comments on commit 320a625

Please sign in to comment.