-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
refactor - PacBio metric parser (#3449)
## Description Closes Clinical-Genomics/add-new-tech#65 Refactor the PacBio metric parser. Introduce error-catching decorators. --------- Co-authored-by: Sebastian Allard <[email protected]>
- Loading branch information
Showing
8 changed files
with
223 additions
and
153 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,72 +1,39 @@ | ||
from pathlib import Path | ||
from typing import Any, Type | ||
|
||
from cg.constants.constants import FileFormat | ||
from cg.constants.pacbio import PacBioDirsAndFiles | ||
from cg.io.controller import ReadFile | ||
from cg.services.pacbio.metrics.models import ( | ||
BaseMetrics, | ||
ControlMetrics, | ||
HiFiMetrics, | ||
PacBioMetrics, | ||
PolymeraseMetrics, | ||
ProductivityMetrics, | ||
SmrtlinkDatasetsMetrics, | ||
) | ||
from cg.utils.files import get_file_in_directory | ||
from cg.services.pacbio.metrics.utils import ( | ||
parse_control_metrics, | ||
parse_dataset_metrics, | ||
parse_hifi_metrics, | ||
parse_polymerase_metrics, | ||
parse_productivity_metrics, | ||
) | ||
|
||
|
||
class MetricsParser: | ||
class PacBioMetricsParser: | ||
"""Class for parsing PacBio sequencing metrics.""" | ||
|
||
def __init__(self, smrt_cell_path: Path) -> None: | ||
self.smrt_cell_path: Path = smrt_cell_path | ||
self.report_dir = Path(smrt_cell_path, "statistics") | ||
# For HiFi metrics | ||
self.base_calling_report_file: Path = get_file_in_directory( | ||
directory=self.report_dir, file_name=PacBioDirsAndFiles.BASECALLING_REPORT | ||
) | ||
self.hifi_metrics: HiFiMetrics = self.parse_report_to_model( | ||
report_file=self.base_calling_report_file, data_model=HiFiMetrics | ||
) | ||
# For control metrics | ||
self.control_report_file: Path = get_file_in_directory( | ||
directory=self.report_dir, file_name=PacBioDirsAndFiles.CONTROL_REPORT | ||
) | ||
self.control_metrics: ControlMetrics = self.parse_report_to_model( | ||
report_file=self.control_report_file, data_model=ControlMetrics | ||
) | ||
# For productivity metrics | ||
self.loading_report_file: Path = get_file_in_directory( | ||
directory=self.report_dir, file_name=PacBioDirsAndFiles.LOADING_REPORT | ||
) | ||
self.productivity_metrics: ProductivityMetrics = self.parse_report_to_model( | ||
report_file=self.loading_report_file, data_model=ProductivityMetrics | ||
) | ||
# For polymerase metrics | ||
self.raw_data_report_file: Path = get_file_in_directory( | ||
directory=self.report_dir, file_name=PacBioDirsAndFiles.RAW_DATA_REPORT | ||
) | ||
self.polymerase_metrics: PolymeraseMetrics = self.parse_report_to_model( | ||
report_file=self.raw_data_report_file, data_model=PolymeraseMetrics | ||
) | ||
# For SMRTlink datasets metrics | ||
self.smrtlink_datasets_report_file: Path = get_file_in_directory( | ||
directory=self.report_dir, file_name=PacBioDirsAndFiles.SMRTLINK_DATASETS_REPORT | ||
) | ||
self.smrtlink_datasets_metrics: SmrtlinkDatasetsMetrics = ( | ||
self.parse_smrtlink_datasets_file() | ||
) | ||
|
||
@staticmethod | ||
def parse_report_to_model(report_file: Path, data_model: Type[BaseMetrics]) -> BaseMetrics: | ||
"""Parse the metrics report to a data model.""" | ||
parsed_json: dict = ReadFile.read_file[FileFormat.JSON](file_path=report_file) | ||
metrics: list[dict[str, Any]] = parsed_json.get("attributes") | ||
data: dict = {report_field["id"]: report_field["value"] for report_field in metrics} | ||
return data_model.model_validate(data, from_attributes=True) | ||
def parse_metrics(smrt_cell_path: Path) -> PacBioMetrics: | ||
"""Return all the relevant PacBio metrics parsed in a single Pydantic object.""" | ||
report_dir = Path(smrt_cell_path, "statistics") | ||
hifi_metrics: HiFiMetrics = parse_hifi_metrics(report_dir) | ||
control_metrics: ControlMetrics = parse_control_metrics(report_dir) | ||
productivity_metrics: ProductivityMetrics = parse_productivity_metrics(report_dir) | ||
polymerase_metrics: PolymeraseMetrics = parse_polymerase_metrics(report_dir) | ||
dataset_metrics: SmrtlinkDatasetsMetrics = parse_dataset_metrics(report_dir) | ||
|
||
def parse_smrtlink_datasets_file(self) -> SmrtlinkDatasetsMetrics: | ||
"""Parse the SMRTlink datasets report file.""" | ||
parsed_json: dict = ReadFile.read_file[FileFormat.JSON](self.smrtlink_datasets_report_file) | ||
data: dict = parsed_json[0] | ||
return SmrtlinkDatasetsMetrics.model_validate(data, from_attributes=True) | ||
return PacBioMetrics( | ||
hifi=hifi_metrics, | ||
control=control_metrics, | ||
productivity=productivity_metrics, | ||
polymerase=polymerase_metrics, | ||
dataset_metrics=dataset_metrics, | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
from functools import wraps | ||
from pathlib import Path | ||
from typing import Any, Type | ||
|
||
from cg.constants.constants import FileFormat | ||
from cg.constants.pacbio import PacBioDirsAndFiles | ||
from cg.exc import PacBioMetricsParsingError | ||
from cg.io.controller import ReadFile | ||
from cg.services.pacbio.metrics.models import ( | ||
BaseMetrics, | ||
ControlMetrics, | ||
HiFiMetrics, | ||
PolymeraseMetrics, | ||
ProductivityMetrics, | ||
SmrtlinkDatasetsMetrics, | ||
) | ||
from cg.utils.files import get_file_in_directory | ||
|
||
|
||
def handle_pac_bio_parsing_errors(func):
    """Decorator to catch any metrics parsing error to raise a PacBioMetricsParsingError instead.

    The wrapped function is called with the arguments it was given and its return
    value is passed through unchanged. If it raises:
      - PacBioMetricsParsingError: re-raised as is.
      - FileNotFoundError: converted to a PacBioMetricsParsingError reporting a missing file.
      - any other Exception: converted to a generic PacBioMetricsParsingError.
    The original exception is always attached as the cause of the new one.
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except PacBioMetricsParsingError:
            # Decorated functions may call other decorated functions (for example
            # parse_report_to_model); an error that was already converted must not be
            # wrapped a second time, otherwise the messages get stacked.
            raise
        except FileNotFoundError as error:
            raise PacBioMetricsParsingError(
                f"Could not find the metrics file: {error}"
            ) from error
        except Exception as error:
            raise PacBioMetricsParsingError(
                f"An error occurred while parsing the metrics: {error}"
            ) from error

    return wrapper
|
||
|
||
@handle_pac_bio_parsing_errors
def parse_report_to_model(report_file: Path, data_model: Type[BaseMetrics]) -> BaseMetrics:
    """Read a JSON metrics report and validate its attributes against a data model.

    The value stored under the "attributes" key of the report is iterated, and each
    entry's "id" is mapped to its "value" before being handed to the model.
    """
    read_json = ReadFile.read_file[FileFormat.JSON]
    report_content: dict = read_json(file_path=report_file)
    attributes: list[dict[str, Any]] = report_content.get("attributes")
    values_by_id: dict = {}
    for attribute in attributes:
        # Read the key before the value, matching the evaluation order of a dict comprehension.
        metric_id = attribute["id"]
        values_by_id[metric_id] = attribute["value"]
    return data_model.model_validate(values_by_id, from_attributes=True)
|
||
|
||
@handle_pac_bio_parsing_errors
def parse_dataset_metrics(report_dir: Path) -> SmrtlinkDatasetsMetrics:
    """Parse the SMRTlink datasets report in the report directory into a metrics model.

    The file named PacBioDirsAndFiles.SMRTLINK_DATASETS_REPORT is looked up with
    get_file_in_directory and read as JSON; only its first element is validated.
    """
    datasets_report: Path = get_file_in_directory(
        directory=report_dir, file_name=PacBioDirsAndFiles.SMRTLINK_DATASETS_REPORT
    )
    read_json = ReadFile.read_file[FileFormat.JSON]
    report_content: dict = read_json(datasets_report)
    first_entry: dict = report_content[0]
    return SmrtlinkDatasetsMetrics.model_validate(first_entry, from_attributes=True)
|
||
|
||
@handle_pac_bio_parsing_errors
def parse_hifi_metrics(report_dir: Path) -> HiFiMetrics:
    """Parse the file named PacBioDirsAndFiles.BASECALLING_REPORT into a HiFiMetrics model.

    The file is looked up in report_dir with get_file_in_directory and then handed to
    parse_report_to_model.
    """
    base_calling_report: Path = get_file_in_directory(
        directory=report_dir, file_name=PacBioDirsAndFiles.BASECALLING_REPORT
    )
    return parse_report_to_model(report_file=base_calling_report, data_model=HiFiMetrics)
|
||
|
||
@handle_pac_bio_parsing_errors
def parse_control_metrics(report_dir: Path) -> ControlMetrics:
    """Parse the file named PacBioDirsAndFiles.CONTROL_REPORT into a ControlMetrics model.

    The file is looked up in report_dir with get_file_in_directory and then handed to
    parse_report_to_model.
    """
    control_report: Path = get_file_in_directory(
        directory=report_dir, file_name=PacBioDirsAndFiles.CONTROL_REPORT
    )
    return parse_report_to_model(report_file=control_report, data_model=ControlMetrics)
|
||
|
||
@handle_pac_bio_parsing_errors
def parse_productivity_metrics(report_dir: Path) -> ProductivityMetrics:
    """Parse the file named PacBioDirsAndFiles.LOADING_REPORT into a ProductivityMetrics model.

    The file is looked up in report_dir with get_file_in_directory and then handed to
    parse_report_to_model.
    """
    loading_report: Path = get_file_in_directory(
        directory=report_dir, file_name=PacBioDirsAndFiles.LOADING_REPORT
    )
    return parse_report_to_model(report_file=loading_report, data_model=ProductivityMetrics)
|
||
|
||
@handle_pac_bio_parsing_errors
def parse_polymerase_metrics(report_dir: Path) -> PolymeraseMetrics:
    """Parse the file named PacBioDirsAndFiles.RAW_DATA_REPORT into a PolymeraseMetrics model.

    The file is looked up in report_dir with get_file_in_directory and then handed to
    parse_report_to_model.
    """
    raw_data_report: Path = get_file_in_directory(
        directory=report_dir, file_name=PacBioDirsAndFiles.RAW_DATA_REPORT
    )
    return parse_report_to_model(report_file=raw_data_report, data_model=PolymeraseMetrics)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
{ | ||
"attributes": [ | ||
{ | ||
"id": "x", | ||
"value": 123 | ||
}, | ||
{ | ||
"id": "y", | ||
"value": "y" | ||
}, | ||
], | ||
} |
Oops, something went wrong.