Skip to content

Commit

Permalink
refactor - PacBio metric parser (#3449)
Browse files Browse the repository at this point in the history
## Description

Closes Clinical-Genomics/add-new-tech#65

Refactor the PacBio metric parser. Introduce error-catching decorators.

---------

Co-authored-by: Sebastian Allard <[email protected]>
  • Loading branch information
diitaz93 and seallard authored Jul 22, 2024
1 parent bfbca6d commit 969aed5
Show file tree
Hide file tree
Showing 8 changed files with 223 additions and 153 deletions.
4 changes: 4 additions & 0 deletions cg/exc.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,3 +306,7 @@ class DeliveryMessageNotSupportedError(CgError):

class OverrideCyclesError(CgError):
"""Exception raised when the override cycles are not correct."""


class PacBioMetricsParsingError(CgError):
    """Exception raised when a PacBio metrics file is missing or cannot be parsed."""
81 changes: 24 additions & 57 deletions cg/services/pacbio/metrics/metrics_parser.py
Original file line number Diff line number Diff line change
@@ -1,72 +1,39 @@
from pathlib import Path
from typing import Any, Type

from cg.constants.constants import FileFormat
from cg.constants.pacbio import PacBioDirsAndFiles
from cg.io.controller import ReadFile
from cg.services.pacbio.metrics.models import (
BaseMetrics,
ControlMetrics,
HiFiMetrics,
PacBioMetrics,
PolymeraseMetrics,
ProductivityMetrics,
SmrtlinkDatasetsMetrics,
)
from cg.utils.files import get_file_in_directory
from cg.services.pacbio.metrics.utils import (
parse_control_metrics,
parse_dataset_metrics,
parse_hifi_metrics,
parse_polymerase_metrics,
parse_productivity_metrics,
)


class MetricsParser:
class PacBioMetricsParser:
"""Class for parsing PacBio sequencing metrics."""

def __init__(self, smrt_cell_path: Path) -> None:
self.smrt_cell_path: Path = smrt_cell_path
self.report_dir = Path(smrt_cell_path, "statistics")
# For HiFi metrics
self.base_calling_report_file: Path = get_file_in_directory(
directory=self.report_dir, file_name=PacBioDirsAndFiles.BASECALLING_REPORT
)
self.hifi_metrics: HiFiMetrics = self.parse_report_to_model(
report_file=self.base_calling_report_file, data_model=HiFiMetrics
)
# For control metrics
self.control_report_file: Path = get_file_in_directory(
directory=self.report_dir, file_name=PacBioDirsAndFiles.CONTROL_REPORT
)
self.control_metrics: ControlMetrics = self.parse_report_to_model(
report_file=self.control_report_file, data_model=ControlMetrics
)
# For productivity metrics
self.loading_report_file: Path = get_file_in_directory(
directory=self.report_dir, file_name=PacBioDirsAndFiles.LOADING_REPORT
)
self.productivity_metrics: ProductivityMetrics = self.parse_report_to_model(
report_file=self.loading_report_file, data_model=ProductivityMetrics
)
# For polymerase metrics
self.raw_data_report_file: Path = get_file_in_directory(
directory=self.report_dir, file_name=PacBioDirsAndFiles.RAW_DATA_REPORT
)
self.polymerase_metrics: PolymeraseMetrics = self.parse_report_to_model(
report_file=self.raw_data_report_file, data_model=PolymeraseMetrics
)
# For SMRTlink datasets metrics
self.smrtlink_datasets_report_file: Path = get_file_in_directory(
directory=self.report_dir, file_name=PacBioDirsAndFiles.SMRTLINK_DATASETS_REPORT
)
self.smrtlink_datasets_metrics: SmrtlinkDatasetsMetrics = (
self.parse_smrtlink_datasets_file()
)

@staticmethod
def parse_report_to_model(report_file: Path, data_model: Type[BaseMetrics]) -> BaseMetrics:
"""Parse the metrics report to a data model."""
parsed_json: dict = ReadFile.read_file[FileFormat.JSON](file_path=report_file)
metrics: list[dict[str, Any]] = parsed_json.get("attributes")
data: dict = {report_field["id"]: report_field["value"] for report_field in metrics}
return data_model.model_validate(data, from_attributes=True)
def parse_metrics(smrt_cell_path: Path) -> PacBioMetrics:
"""Return all the relevant PacBio metrics parsed in a single Pydantic object."""
report_dir = Path(smrt_cell_path, "statistics")
hifi_metrics: HiFiMetrics = parse_hifi_metrics(report_dir)
control_metrics: ControlMetrics = parse_control_metrics(report_dir)
productivity_metrics: ProductivityMetrics = parse_productivity_metrics(report_dir)
polymerase_metrics: PolymeraseMetrics = parse_polymerase_metrics(report_dir)
dataset_metrics: SmrtlinkDatasetsMetrics = parse_dataset_metrics(report_dir)

def parse_smrtlink_datasets_file(self) -> SmrtlinkDatasetsMetrics:
"""Parse the SMRTlink datasets report file."""
parsed_json: dict = ReadFile.read_file[FileFormat.JSON](self.smrtlink_datasets_report_file)
data: dict = parsed_json[0]
return SmrtlinkDatasetsMetrics.model_validate(data, from_attributes=True)
return PacBioMetrics(
hifi=hifi_metrics,
control=control_metrics,
productivity=productivity_metrics,
polymerase=polymerase_metrics,
dataset_metrics=dataset_metrics,
)
24 changes: 17 additions & 7 deletions cg/services/pacbio/metrics/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,9 @@ class ProductivityMetrics(BaseModel):
p_0: int = Field(..., alias=LoadingAttributesIDs.P_0)
p_1: int = Field(..., alias=LoadingAttributesIDs.P_1)
p_2: int = Field(..., alias=LoadingAttributesIDs.P_2)
percentage_p_0: float
percentage_p_1: float
percentage_p_2: float
percent_p_0: float
percent_p_1: float
percent_p_2: float

@model_validator(mode="before")
@classmethod
Expand All @@ -75,9 +75,9 @@ def set_percentages(cls, data: Any):
p_0 = data.get(LoadingAttributesIDs.P_0)
p_1 = data.get(LoadingAttributesIDs.P_1)
p_2 = data.get(LoadingAttributesIDs.P_2)
data["percentage_p_0"] = round((p_0 / productive_zmws) * 100, 0)
data["percentage_p_1"] = round((p_1 / productive_zmws) * 100, 0)
data["percentage_p_2"] = round((p_2 / productive_zmws) * 100, 0)
data["percent_p_0"] = round((p_0 / productive_zmws) * 100, 0)
data["percent_p_1"] = round((p_1 / productive_zmws) * 100, 0)
data["percent_p_2"] = round((p_2 / productive_zmws) * 100, 0)
return data


Expand Down Expand Up @@ -110,7 +110,7 @@ class PolymeraseMetrics(BaseModel):
class SmrtlinkDatasetsMetrics(BaseModel):
"""Model to parse metrics in the SMRTlink datasets report."""

device_internal_id: str = Field(..., alias=SmrtLinkDatabasesIDs.CELL_ID)
cell_id: str = Field(..., alias=SmrtLinkDatabasesIDs.CELL_ID)
well: str = Field(..., alias=SmrtLinkDatabasesIDs.WELL_NAME)
well_sample_name: str = Field(..., alias=SmrtLinkDatabasesIDs.WELL_SAMPLE_NAME)
sample_internal_id: str = Field(..., alias=SmrtLinkDatabasesIDs.BIO_SAMPLE_NAME)
Expand All @@ -130,3 +130,13 @@ def extract_plate(cls, data: Any):
if match:
data["plate"] = match.group(1)
return data


class PacBioMetrics(BaseModel):
    """Model that holds all relevant PacBio metrics.

    Groups the individual metrics models into one object, one field per model.
    """

    hifi: HiFiMetrics
    control: ControlMetrics
    # Holds the P0/P1/P2 counts together with their derived percentages
    productivity: ProductivityMetrics
    polymerase: PolymeraseMetrics
    # Metrics taken from the SMRTlink datasets report
    dataset_metrics: SmrtlinkDatasetsMetrics
78 changes: 78 additions & 0 deletions cg/services/pacbio/metrics/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
from functools import wraps
from pathlib import Path
from typing import Any, Type

from cg.constants.constants import FileFormat
from cg.constants.pacbio import PacBioDirsAndFiles
from cg.exc import PacBioMetricsParsingError
from cg.io.controller import ReadFile
from cg.services.pacbio.metrics.models import (
BaseMetrics,
ControlMetrics,
HiFiMetrics,
PolymeraseMetrics,
ProductivityMetrics,
SmrtlinkDatasetsMetrics,
)
from cg.utils.files import get_file_in_directory


def handle_pac_bio_parsing_errors(func):
    """Decorator that converts any metrics parsing failure into a PacBioMetricsParsingError.

    A missing file is reported with a dedicated message; every other exception
    is reported as a generic parsing error. The original exception is chained
    as the cause so the full traceback stays available.

    Raises:
        PacBioMetricsParsingError: if the wrapped function raises any exception.
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except PacBioMetricsParsingError:
            # Decorated functions call other decorated functions, so the error may
            # already have been converted; re-raise it untouched instead of wrapping
            # the message a second time.
            raise
        except FileNotFoundError as error:
            raise PacBioMetricsParsingError(
                f"Could not find the metrics file: {error}"
            ) from error
        except Exception as error:
            raise PacBioMetricsParsingError(
                f"An error occurred while parsing the metrics: {error}"
            ) from error

    return wrapper


@handle_pac_bio_parsing_errors
def parse_report_to_model(report_file: Path, data_model: Type[BaseMetrics]) -> BaseMetrics:
    """Read a JSON metrics report and validate its attributes against the given model.

    The entries under the report's "attributes" key are flattened into an
    {id: value} mapping, which is then handed to the model for validation.
    """
    read_json = ReadFile.read_file[FileFormat.JSON]
    report_content: dict = read_json(file_path=report_file)
    attributes: list[dict[str, Any]] = report_content.get("attributes")
    id_to_value: dict = {}
    for attribute in attributes:
        id_to_value[attribute["id"]] = attribute["value"]
    return data_model.model_validate(id_to_value, from_attributes=True)


@handle_pac_bio_parsing_errors
def parse_dataset_metrics(report_dir: Path) -> SmrtlinkDatasetsMetrics:
    """Parse the SMRTlink datasets report found in the report directory.

    Only the first entry of the parsed JSON content is validated into the model.
    """
    report_path: Path = get_file_in_directory(
        directory=report_dir, file_name=PacBioDirsAndFiles.SMRTLINK_DATASETS_REPORT
    )
    read_json = ReadFile.read_file[FileFormat.JSON]
    first_entry: dict = read_json(report_path)[0]
    return SmrtlinkDatasetsMetrics.model_validate(first_entry, from_attributes=True)


@handle_pac_bio_parsing_errors
def parse_hifi_metrics(report_dir: Path) -> HiFiMetrics:
    """Locate the base calling report in the report directory and parse it into HiFi metrics."""
    report_path: Path = get_file_in_directory(
        directory=report_dir, file_name=PacBioDirsAndFiles.BASECALLING_REPORT
    )
    return parse_report_to_model(report_file=report_path, data_model=HiFiMetrics)


@handle_pac_bio_parsing_errors
def parse_control_metrics(report_dir: Path) -> ControlMetrics:
    """Locate the control report in the report directory and parse it into control metrics."""
    report_path: Path = get_file_in_directory(
        directory=report_dir, file_name=PacBioDirsAndFiles.CONTROL_REPORT
    )
    return parse_report_to_model(report_file=report_path, data_model=ControlMetrics)


@handle_pac_bio_parsing_errors
def parse_productivity_metrics(report_dir: Path) -> ProductivityMetrics:
    """Locate the loading report in the report directory and parse it into productivity metrics."""
    report_path: Path = get_file_in_directory(
        directory=report_dir, file_name=PacBioDirsAndFiles.LOADING_REPORT
    )
    return parse_report_to_model(report_file=report_path, data_model=ProductivityMetrics)


@handle_pac_bio_parsing_errors
def parse_polymerase_metrics(report_dir: Path) -> PolymeraseMetrics:
    """Locate the raw data report in the report directory and parse it into polymerase metrics."""
    report_path: Path = get_file_in_directory(
        directory=report_dir, file_name=PacBioDirsAndFiles.RAW_DATA_REPORT
    )
    return parse_report_to_model(report_file=report_path, data_model=PolymeraseMetrics)
35 changes: 9 additions & 26 deletions tests/fixture_plugins/pacbio_fixtures/path_fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@

import pytest

from cg.constants.pacbio import PacBioDirsAndFiles


# Directory fixtures
@pytest.fixture
Expand All @@ -14,6 +12,12 @@ def pac_bio_fixtures_dir(devices_dir: Path) -> Path:
return Path(devices_dir, "pacbio")


@pytest.fixture
def pac_bio_wrong_metrics_dir(pac_bio_fixtures_dir: Path) -> Path:
    """Return the path to the 'wrong_metrics' directory within the PacBio fixtures directory."""
    return pac_bio_fixtures_dir / "wrong_metrics"


@pytest.fixture
def pac_bio_runs_dir(pac_bio_fixtures_dir: Path) -> Path:
"""Return the path to the PacBio run directory."""
Expand All @@ -38,28 +42,7 @@ def pac_bio_run_statistics_dir(pac_bio_smrt_cell_dir: Path) -> Path:
return Path(pac_bio_smrt_cell_dir, "statistics")


# File fixtures


@pytest.fixture
def pac_bio_css_report(pac_bio_run_statistics_dir: Path) -> Path:
"""Return the path to the PacBio CSS report."""
return Path(pac_bio_run_statistics_dir, PacBioDirsAndFiles.BASECALLING_REPORT)


@pytest.fixture
def pac_bio_control_report(pac_bio_run_statistics_dir: Path) -> Path:
"""Return the path to the PacBio control report."""
return Path(pac_bio_run_statistics_dir, PacBioDirsAndFiles.CONTROL_REPORT)


@pytest.fixture
def pac_bio_loading_report(pac_bio_run_statistics_dir: Path) -> Path:
"""Return the path to the PacBio loading report."""
return Path(pac_bio_run_statistics_dir, PacBioDirsAndFiles.LOADING_REPORT)


@pytest.fixture
def pac_bio_raw_data_report(pac_bio_run_statistics_dir: Path) -> Path:
"""Return the path to the PacBio raw data report."""
return Path(pac_bio_run_statistics_dir, PacBioDirsAndFiles.RAW_DATA_REPORT)
def pac_bio_wrong_metrics_file(pac_bio_wrong_metrics_dir: Path) -> Path:
    """Return the path to the 'metrics.json' file within the wrong metrics directory."""
    return pac_bio_wrong_metrics_dir / "metrics.json"
6 changes: 3 additions & 3 deletions tests/fixture_plugins/pacbio_fixtures/service_fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@

import pytest

from cg.services.pacbio.metrics.metrics_parser import MetricsParser
from cg.services.pacbio.metrics.metrics_parser import PacBioMetricsParser


@pytest.fixture
def pac_bio_metrics_parser(pac_bio_smrt_cell_dir: Path) -> MetricsParser:
def pac_bio_metrics_parser(pac_bio_smrt_cell_dir: Path) -> PacBioMetricsParser:
"""Return a PacBio metrics parser."""
return MetricsParser(pac_bio_smrt_cell_dir)
return PacBioMetricsParser(pac_bio_smrt_cell_dir)
12 changes: 12 additions & 0 deletions tests/fixtures/devices/pacbio/wrong_metrics/metrics.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"attributes": [
{
"id": "x",
"value": 123
},
{
"id": "y",
"value": "y"
},
],
}
Loading

0 comments on commit 969aed5

Please sign in to comment.