Skip to content

Commit

Permalink
Parse metrics from smrtlnk datasets file (#3436)(patch)
Browse files Browse the repository at this point in the history
## Description
Closes Clinical-Genomics/add-new-tech#63
Parses the last remaining PacBio metrics file, `smrtlink-datasets.json`. Unlike the other report files, this one does not keep its metrics under an `attributes` section; they sit directly in the single record the file contains. The parsing of the other reports was refactored alongside this change to accommodate the new file and to keep the code DRY.

### Added

- Metrics model
- Function to parse specifically the smrtlink-datasets file in the metrics parser
- Constants for metrics names
- Fixture for parsed metrics and fixture file

### Changed

- Merged `_parse_report` and `parse_attributes_to_model` methods of the metrics parser into one, called `parse_report_to_model`.
- Updated tests of these functions

### Fixed

- Introduced a `TypeVar` (from the `typing` module) bound to Pydantic's `BaseModel` to type-hint the different metrics classes, so signatures only need to reference the parent class instead of a union of every metrics model.
  • Loading branch information
diitaz93 authored Jul 17, 2024
1 parent ff34ed9 commit 755c712
Show file tree
Hide file tree
Showing 6 changed files with 144 additions and 60 deletions.
11 changes: 11 additions & 0 deletions cg/constants/pacbio.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ class PacBioDirsAndFiles:
CONTROL_REPORT: str = "control.report.json"
LOADING_REPORT: str = "loading.report.json"
RAW_DATA_REPORT: str = "raw_data.report.json"
SMRTLINK_DATASETS_REPORT: str = "smrtlink-datasets.json"


class CCSAttributeIDs:
Expand Down Expand Up @@ -38,3 +39,13 @@ class PolymeraseDataAttributeIDs:
READ_LENGTH_N50: str = "raw_data_report.read_n50"
MEAN_LONGEST_SUBREAD_LENGTH: str = "raw_data_report.insert_length"
LONGEST_SUBREAD_LENGTH_N50: str = "raw_data_report.insert_n50"


class SmrtLinkDatabasesIDs:
    """Keys of a dataset record in the SMRT Link datasets report.

    Each constant holds the exact (camelCase) key used in the JSON record, so
    the metrics model can refer to the keys by name when declaring aliases.
    """

    # Sample identifier attached to the dataset.
    BIO_SAMPLE_NAME: str = "bioSampleName"
    # Identifier and index of the cell the dataset comes from.
    CELL_ID: str = "cellId"
    CELL_INDEX: str = "cellIndex"
    # The movie name is stored under the metadata context id key.
    MOVIE_NAME: str = "metadataContextId"
    # Path of the dataset file referenced by the record.
    PATH: str = "path"
    # Well position and the sample name registered for that well.
    WELL_NAME: str = "wellName"
    WELL_SAMPLE_NAME: str = "wellSampleName"
39 changes: 23 additions & 16 deletions cg/services/pacbio/metrics/metrics_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@
from cg.constants.pacbio import PacBioDirsAndFiles
from cg.io.controller import ReadFile
from cg.services.pacbio.metrics.models import (
BaseMetrics,
ControlMetrics,
HiFiMetrics,
PolymeraseMetrics,
ProductivityMetrics,
SmrtlinkDatasetsMetrics,
)
from cg.utils.files import get_file_in_directory

Expand All @@ -23,43 +25,48 @@ def __init__(self, smrt_cell_path: Path) -> None:
self.base_calling_report_file: Path = get_file_in_directory(
directory=self.report_dir, file_name=PacBioDirsAndFiles.BASECALLING_REPORT
)
self.hifi_metrics: HiFiMetrics = self.parse_attributes_to_model(
self.hifi_metrics: HiFiMetrics = self.parse_report_to_model(
report_file=self.base_calling_report_file, data_model=HiFiMetrics
)
# For control metrics
self.control_report_file: Path = get_file_in_directory(
directory=self.report_dir, file_name=PacBioDirsAndFiles.CONTROL_REPORT
)
self.control_metrics: ControlMetrics = self.parse_attributes_to_model(
self.control_metrics: ControlMetrics = self.parse_report_to_model(
report_file=self.control_report_file, data_model=ControlMetrics
)
# For productivity metrics
self.loading_report_file: Path = get_file_in_directory(
directory=self.report_dir, file_name=PacBioDirsAndFiles.LOADING_REPORT
)
self.productivity_metrics: ProductivityMetrics = self.parse_attributes_to_model(
self.productivity_metrics: ProductivityMetrics = self.parse_report_to_model(
report_file=self.loading_report_file, data_model=ProductivityMetrics
)
# For polymerase metrics
self.raw_data_report_file: Path = get_file_in_directory(
directory=self.report_dir, file_name=PacBioDirsAndFiles.RAW_DATA_REPORT
)
self.polymerase_metrics: PolymeraseMetrics = self.parse_attributes_to_model(
self.polymerase_metrics: PolymeraseMetrics = self.parse_report_to_model(
report_file=self.raw_data_report_file, data_model=PolymeraseMetrics
)
# For SMRTlink datasets metrics
self.smrtlink_datasets_report_file: Path = get_file_in_directory(
directory=self.report_dir, file_name=PacBioDirsAndFiles.SMRTLINK_DATASETS_REPORT
)
self.smrtlink_datasets_metrics: SmrtlinkDatasetsMetrics = (
self.parse_smrtlink_datasets_file()
)

@staticmethod
def _parse_report(report_file: Path) -> list[dict[str, Any]]:
"""Parse the attribute element of a PacBio report file in JSON format."""
def parse_report_to_model(report_file: Path, data_model: Type[BaseMetrics]) -> BaseMetrics:
"""Parse the metrics report to a data model."""
parsed_json: dict = ReadFile.read_file[FileFormat.JSON](file_path=report_file)
return parsed_json.get("attributes")

def parse_attributes_to_model(
self,
report_file: Path,
data_model: Type[ControlMetrics | HiFiMetrics | PolymeraseMetrics | ProductivityMetrics],
) -> ControlMetrics | HiFiMetrics | PolymeraseMetrics | ProductivityMetrics:
"""Parse the attributes to a model."""
report_content: list[dict[str, Any]] = self._parse_report(report_file=report_file)
data: dict = {report_field["id"]: report_field["value"] for report_field in report_content}
metrics: list[dict[str, Any]] = parsed_json.get("attributes")
data: dict = {report_field["id"]: report_field["value"] for report_field in metrics}
return data_model.model_validate(data, from_attributes=True)

def parse_smrtlink_datasets_file(self) -> SmrtlinkDatasetsMetrics:
    """Parse the SMRTlink datasets report file into its metrics model.

    Unlike the other reports, this file has no ``attributes`` section: its
    content is a JSON list of dataset records, and the metrics are the keys
    of the first record.

    Returns:
        The metrics validated from the first record of the report.

    Raises:
        IndexError: If the report contains an empty list.
    """
    # The parsed content is a list of records, not a dict.
    parsed_json: list[dict[str, Any]] = ReadFile.read_file[FileFormat.JSON](
        file_path=self.smrtlink_datasets_report_file
    )
    first_record: dict[str, Any] = parsed_json[0]
    return SmrtlinkDatasetsMetrics.model_validate(first_record, from_attributes=True)
33 changes: 32 additions & 1 deletion cg/services/pacbio/metrics/models.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,19 @@
from pydantic import BaseModel, Field, field_validator
import re
from typing import Any, TypeVar

from pydantic import BaseModel, Field, field_validator, model_validator

from cg.constants.pacbio import (
CCSAttributeIDs,
ControlAttributeIDs,
LoadingAttributesIDs,
PolymeraseDataAttributeIDs,
SmrtLinkDatabasesIDs,
)
from cg.utils.calculations import divide_by_thousand_with_one_decimal, fraction_to_percent

BaseMetrics = TypeVar("BaseMetrics", bound=BaseModel)


class HiFiMetrics(BaseModel):
"""Model for the HiFi metrics."""
Expand Down Expand Up @@ -101,3 +107,28 @@ class PolymeraseMetrics(BaseModel):
_validate_longest_subread_length_n50 = field_validator(
"longest_subread_length_n50", mode="before"
)(divide_by_thousand_with_one_decimal)


class SmrtlinkDatasetsMetrics(BaseModel):
    """Model to parse metrics in the SMRTlink datasets report.

    Every field except ``plate`` is read from the report through its alias.
    ``plate`` is not a key of the report; it is derived from the
    ``<plate>_<well>`` directory found in the dataset path (e.g. ``1_A01``).
    """

    device_internal_id: str = Field(..., alias=SmrtLinkDatabasesIDs.CELL_ID)
    well: str = Field(..., alias=SmrtLinkDatabasesIDs.WELL_NAME)
    well_sample_name: str = Field(..., alias=SmrtLinkDatabasesIDs.WELL_SAMPLE_NAME)
    sample_internal_id: str = Field(..., alias=SmrtLinkDatabasesIDs.BIO_SAMPLE_NAME)
    movie_name: str = Field(..., alias=SmrtLinkDatabasesIDs.MOVIE_NAME)
    cell_index: int = Field(..., alias=SmrtLinkDatabasesIDs.CELL_INDEX)
    path: str = Field(..., alias=SmrtLinkDatabasesIDs.PATH)
    plate: int

    @model_validator(mode="before")
    @classmethod
    def extract_plate(cls, data: Any) -> Any:
        """Add the plate number, extracted from the dataset path, to the input.

        The path is searched for ``/<plate>_<well>`` where the plate is 1 or 2
        and the well is one of A01, B01, C01 or D01. When the input is not a
        dict, or the path is missing or does not match, the input is returned
        untouched and validation of the required ``plate`` field reports the
        problem.

        A new dict is returned instead of writing into ``data`` so that the
        caller's dictionary is never modified by validation.
        """
        if not isinstance(data, dict):
            return data
        dataset_path = data.get(SmrtLinkDatabasesIDs.PATH)
        if not dataset_path:
            return data
        plate_match = re.search(r"/([12])_[ABCD]01", dataset_path)
        if plate_match is None:
            return data
        return {**data, "plate": int(plate_match.group(1))}
16 changes: 16 additions & 0 deletions tests/fixture_plugins/pacbio_fixtures/metrics_fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,14 @@
ControlAttributeIDs,
LoadingAttributesIDs,
PolymeraseDataAttributeIDs,
SmrtLinkDatabasesIDs,
)
from cg.services.pacbio.metrics.models import (
ControlMetrics,
HiFiMetrics,
PolymeraseMetrics,
ProductivityMetrics,
SmrtlinkDatasetsMetrics,
)


Expand Down Expand Up @@ -61,3 +63,17 @@ def pac_bio_polymerase_metrics() -> PolymeraseMetrics:
PolymeraseDataAttributeIDs.LONGEST_SUBREAD_LENGTH_N50: 22250,
}
return PolymeraseMetrics.model_validate(data, from_attributes=True)


@pytest.fixture
def pac_bio_smrtlink_databases_metrics() -> SmrtlinkDatasetsMetrics:
    """Return the SMRTlink datasets metrics expected by the parser tests."""
    raw_metrics: dict[str, Any] = {
        SmrtLinkDatabasesIDs.BIO_SAMPLE_NAME: "1247014000119",
        SmrtLinkDatabasesIDs.CELL_ID: "EA094834",
        SmrtLinkDatabasesIDs.CELL_INDEX: 0,
        SmrtLinkDatabasesIDs.MOVIE_NAME: "m84202_240522_135641_s1",
        SmrtLinkDatabasesIDs.PATH: "/srv/cg_data/pacbio/r84202_20240522_133539/1_A01/pb_formats/m84202_240522_135641_s1.hifi_reads.consensusreadset.xml",
        SmrtLinkDatabasesIDs.WELL_NAME: "A01",
        SmrtLinkDatabasesIDs.WELL_SAMPLE_NAME: "1247014000119",
    }
    # The model is built from the report keys (aliases), as the parser does.
    expected_metrics: SmrtlinkDatasetsMetrics = SmrtlinkDatasetsMetrics.model_validate(
        raw_metrics, from_attributes=True
    )
    return expected_metrics
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
[
{
"id": -1,
"uuid": "a662e714-45f2-4cf3-b8d2-c138e1deebd0",
"name": "1247014000119-Cell1",
"path": "/srv/cg_data/pacbio/r84202_20240522_133539/1_A01/pb_formats/m84202_240522_135641_s1.hifi_reads.consensusreadset.xml",
"createdAt": "2024-05-24T02:21:20.970Z",
"updatedAt": "2024-05-24T02:21:20.970Z",
"importedAt": "2024-05-24T02:21:20.970Z",
"numRecords": 6580977,
"totalLength": 106275091861,
"version": "3.0.1",
"comments": "Record generated by runqc-reports",
"tags": "ccs",
"md5": "b1e427a733653fddbad7c273996637f0",
"instrumentName": "Wilma",
"instrumentControlVersion": "13.0.0.212033",
"metadataContextId": "m84202_240522_135641_s1",
"wellSampleName": "1247014000119",
"wellName": "A01",
"bioSampleName": "1247014000119",
"cellIndex": 0,
"cellId": "EA094834",
"runName": "Run 240515",
"createdBy": "admin",
"jobId": -99,
"projectId": -99,
"isActive": true,
"numChildren": 0,
"numResources": 1,
"datasetType": "PacBio.DataSet.ConsensusReadSet"
}
]
Original file line number Diff line number Diff line change
@@ -1,17 +1,19 @@
import math
from pathlib import Path
from typing import Any, Type
from typing import Type

import pytest
from _pytest.fixtures import FixtureRequest

from cg.constants.pacbio import CCSAttributeIDs, ControlAttributeIDs
from cg.services.pacbio.metrics.metrics_parser import MetricsParser
from cg.services.pacbio.metrics.models import (
BaseMetrics,
ControlMetrics,
HiFiMetrics,
PolymeraseMetrics,
ProductivityMetrics,
SmrtlinkDatasetsMetrics,
)


Expand All @@ -22,39 +24,12 @@ def test_metrics_parser_initialisation(pac_bio_smrt_cell_dir: Path):
# WHEN initialising the metrics parser
parser = MetricsParser(smrt_cell_path=pac_bio_smrt_cell_dir)

# THEN assert that the parser is initialised with the expected attributes
# THEN the parser is initialised with the expected attributes
assert isinstance(parser.hifi_metrics, HiFiMetrics)
assert isinstance(parser.control_metrics, ControlMetrics)
assert isinstance(parser.productivity_metrics, ProductivityMetrics)
assert isinstance(parser.polymerase_metrics, PolymeraseMetrics)


@pytest.mark.parametrize(
"report_file_path",
[
"pac_bio_control_report",
"pac_bio_css_report",
"pac_bio_loading_report",
"pac_bio_raw_data_report",
],
)
def test_parse_attributes_from_json(
pac_bio_metrics_parser: MetricsParser,
report_file_path: str,
request: FixtureRequest,
):
"""Test the parsing of attributes from any PacBio report file."""
# GIVEN a PacBio report file and a PacBio metrics parser initialised from the SMRTcell path
report_file: Path = request.getfixturevalue(report_file_path)

# WHEN parsing the report file
attributes: list[dict[str, Any]] = pac_bio_metrics_parser._parse_report(report_file=report_file)

# THEN assert that the report attributes are parsed correctly
assert isinstance(attributes, list)
assert isinstance(attributes[0], dict)
assert "id" in attributes[0]
assert "value" in attributes[0]
assert isinstance(parser.smrtlink_datasets_metrics, SmrtlinkDatasetsMetrics)


@pytest.mark.parametrize(
Expand All @@ -75,10 +50,10 @@ def test_parse_attributes_from_json(
],
ids=["Control", "Hi-Fi", "Polymerase", "Productivity"],
)
def test_parse_attributes_to_model(
def test_parse_report_to_model(
pac_bio_metrics_parser: MetricsParser,
report_file_path: str,
model: Type[ControlMetrics | HiFiMetrics | ProductivityMetrics],
model: Type[BaseMetrics],
metrics_fixture: str,
percent_fields: list[str],
request: FixtureRequest,
Expand All @@ -90,27 +65,38 @@ def test_parse_attributes_to_model(
report_file: Path = request.getfixturevalue(report_file_path)

# GIVEN a metrics object with the expected parsed metrics
expected_metrics: ControlMetrics | HiFiMetrics | ProductivityMetrics = request.getfixturevalue(
metrics_fixture
)
expected_metrics: BaseMetrics = request.getfixturevalue(metrics_fixture)

# WHEN parsing the attributes to a given metrics model
parsed_metrics: ControlMetrics | HiFiMetrics | ProductivityMetrics = (
pac_bio_metrics_parser.parse_attributes_to_model(
report_file=report_file,
data_model=model,
)
parsed_metrics: BaseMetrics = pac_bio_metrics_parser.parse_report_to_model(
report_file=report_file, data_model=model
)

# THEN assert that the model attributes are the expected ones
# THEN the model attributes are the expected ones
assert parsed_metrics == expected_metrics

# THEN assert that the percentage fields of the model are not taken as a fraction
# THEN the percentage fields of the model are not taken as a fraction
metrics_dict: dict = parsed_metrics.dict(by_alias=True)
for percent_field in percent_fields:
assert metrics_dict.get(percent_field) > 1


def test_parse_smrtlink_datasets_file(
    pac_bio_metrics_parser: MetricsParser,
    pac_bio_smrtlink_databases_metrics: SmrtlinkDatasetsMetrics,
):
    """Test that the SMRTlink datasets file is parsed into the expected metrics."""
    # GIVEN a metrics parser and the metrics expected from its datasets file
    expected_metrics: SmrtlinkDatasetsMetrics = pac_bio_smrtlink_databases_metrics

    # WHEN parsing the SMRTlink datasets file
    parsed_metrics: SmrtlinkDatasetsMetrics = pac_bio_metrics_parser.parse_smrtlink_datasets_file()

    # THEN the parsed metrics are the expected ones
    assert parsed_metrics == expected_metrics


def test_productivity_metrics_percentage_attributes(
pac_bio_productivity_metrics: ProductivityMetrics,
):
Expand All @@ -122,7 +108,7 @@ def test_productivity_metrics_percentage_attributes(
percentage_p_1: float = pac_bio_productivity_metrics.percentage_p_1
percentage_p_2: float = pac_bio_productivity_metrics.percentage_p_2

# THEN assert that the percentage attributes are calculated correctly
# THEN the percentage attributes are calculated correctly
assert math.isclose(percentage_p_0, 40, abs_tol=1e-9)
assert math.isclose(percentage_p_1, 60, abs_tol=1e-9)
assert math.isclose(percentage_p_2, 0, abs_tol=1e-9)

0 comments on commit 755c712

Please sign in to comment.