Skip to content

Commit

Permalink
Parse Pac-Bio metrics from control report (#3409)(patch)
Browse files Browse the repository at this point in the history
## Description

Closes Clinical-Genomics/add-new-tech#45

### Added

- Pydantic model for the control metrics
- Constants with the attribute names of the control report file
- Unit test for percent function
- Fixtures for control report file and metrics object

### Changed

- Moved the percent field validator to a separate function to be more DRY
- Updated test of metrics parser to work with control model
---------

Co-authored-by: ChristianOertlin <[email protected]>
  • Loading branch information
diitaz93 and ChrOertlin authored Jul 8, 2024
1 parent 168d2bf commit 155a504
Show file tree
Hide file tree
Showing 9 changed files with 170 additions and 49 deletions.
7 changes: 7 additions & 0 deletions cg/constants/pacbio.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,10 @@ class CCSAttributeIDs:
READ_LENGTH_N50: str = "ccs2.ccs_readlength_n50"
MEDIAN_ACCURACY: str = "ccs2.median_accuracy"
PERCENT_Q30: str = "ccs2.percent_ccs_bases_q30"


class ControlAttributeIDs:
    """Attribute IDs of the metrics read from the PacBio control report file."""

    NUMBER_OF_READS: str = "control.reads_n"
    MEAN_READ_LENGTH: str = "control.readlength_mean"
    PERCENT_MEAN_READ_CONCORDANCE: str = "control.concordance_mean"
    PERCENT_MODE_READ_CONCORDANCE: str = "control.concordance_mode"
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,9 @@

import logging
from pathlib import Path
from typing import Callable
from typing import Type

from cg.apps.demultiplex.sample_sheet.validators import is_valid_sample_internal_id

from cg.constants.constants import SCALE_TO_READ_PAIRS, FileFormat
from cg.constants.demultiplexing import UNDETERMINED
from cg.constants.metrics import (
Expand Down Expand Up @@ -52,7 +51,7 @@ def __init__(

@staticmethod
def parse_metrics_file(
metrics_file_path, metrics_model: Callable
metrics_file_path, metrics_model: Type[SequencingQualityMetrics | DemuxMetrics]
) -> list[SequencingQualityMetrics | DemuxMetrics]:
"""Parse specified metrics file."""
LOG.info(f"Parsing BCLConvert metrics file: {metrics_file_path}")
Expand Down
29 changes: 17 additions & 12 deletions cg/services/pacbio/metrics/metrics_parser.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
from pathlib import Path
from typing import Any, Callable
from typing import Any, Type

from cg.constants.constants import FileFormat
from cg.constants.pacbio import PacBioDirsAndFiles
from cg.io.controller import ReadFile
from cg.services.pacbio.metrics.models import HiFiMetrics
from cg.services.pacbio.metrics.models import ControlMetrics, HiFiMetrics
from cg.utils.files import get_file_in_directory


Expand All @@ -15,13 +15,19 @@ def __init__(self, smrt_cell_path: Path) -> None:
self.smrt_cell_path: Path = smrt_cell_path
self.report_dir = Path(smrt_cell_path, "statistics")
# For HiFi metrics
self.css_report_file: Path = get_file_in_directory(
self.base_calling_report_file: Path = get_file_in_directory(
directory=self.report_dir, file_name=PacBioDirsAndFiles.BASECALLING_REPORT
)
self.hifi_metrics: HiFiMetrics = self.parse_attributes_to_model(
report_file=self.base_calling_report_file, data_model=HiFiMetrics
)
# For control metrics
self.control_report_file: Path = get_file_in_directory(
directory=self.report_dir, file_name=PacBioDirsAndFiles.CONTROL_REPORT
)
self.control_metrics: ControlMetrics = self.parse_attributes_to_model(
report_file=self.control_report_file, data_model=ControlMetrics
)
# For productivity metrics
self.loading_report_file: Path = get_file_in_directory(
directory=self.report_dir, file_name=PacBioDirsAndFiles.LOADING_REPORT
Expand All @@ -30,18 +36,17 @@ def __init__(self, smrt_cell_path: Path) -> None:
self.raw_data_report_file: Path = get_file_in_directory(
directory=self.report_dir, file_name=PacBioDirsAndFiles.RAW_DATA_REPORT
)
self.hifi_metrics: HiFiMetrics = self.parse_attributes_to_model(
json_file=self.css_report_file, model=HiFiMetrics
)

@staticmethod
def _parse_report(json_file: Path) -> list[dict[str, Any]]:
"""Parse the attribute element of a PacBio JSON file."""
parsed_json: dict = ReadFile.read_file[FileFormat.JSON](file_path=json_file)
def _parse_report(report_file: Path) -> list[dict[str, Any]]:
"""Parse the attribute element of a PacBio report file in JSON format."""
parsed_json: dict = ReadFile.read_file[FileFormat.JSON](file_path=report_file)
return parsed_json.get("attributes")

def parse_attributes_to_model(self, json_file: Path, model: Callable) -> HiFiMetrics:
def parse_attributes_to_model(
self, report_file: Path, data_model: Type[ControlMetrics | HiFiMetrics]
) -> ControlMetrics | HiFiMetrics:
"""Parse the attributes to a model."""
report_content: list[dict[str, Any]] = self._parse_report(json_file=json_file)
report_content: list[dict[str, Any]] = self._parse_report(report_file=report_file)
data: dict = {report_field["id"]: report_field["value"] for report_field in report_content}
return model(**data)
return data_model.model_validate(data, from_attributes=True)
30 changes: 24 additions & 6 deletions cg/services/pacbio/metrics/models.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from pydantic import BaseModel, Field, field_validator

from cg.constants.pacbio import CCSAttributeIDs
from cg.constants.pacbio import CCSAttributeIDs, ControlAttributeIDs
from cg.utils.calculations import fraction_to_percent


class HiFiMetrics(BaseModel):
Expand All @@ -14,8 +15,25 @@ class HiFiMetrics(BaseModel):
median_read_quality: str = Field(..., alias=CCSAttributeIDs.MEDIAN_ACCURACY)
percent_q30: float = Field(..., alias=CCSAttributeIDs.PERCENT_Q30)

@field_validator("percent_q30", mode="before")
def transform_percent_q30(cls, value: float) -> float:
if 0.0 <= value <= 1.0:
value *= 100
return value
_validate_percent_q30 = field_validator("percent_q30", mode="before")(fraction_to_percent)


class ControlMetrics(BaseModel):
    """Model for the control metrics.

    Each field is populated from the control report attribute whose ID is given
    as the field alias (see ControlAttributeIDs).
    """

    reads: int = Field(..., alias=ControlAttributeIDs.NUMBER_OF_READS)
    mean_read_length: int = Field(..., alias=ControlAttributeIDs.MEAN_READ_LENGTH)
    percent_mean_concordance_reads: float = Field(
        ..., alias=ControlAttributeIDs.PERCENT_MEAN_READ_CONCORDANCE
    )
    percent_mode_concordance_reads: float = Field(
        ..., alias=ControlAttributeIDs.PERCENT_MODE_READ_CONCORDANCE
    )

    # Both concordance fields are passed through fraction_to_percent in "before" mode,
    # i.e. on the raw input value: values in [0, 1] are multiplied by 100 and any other
    # value is left unchanged.
    _validate_percent_mean_concordance_reads = field_validator(
        "percent_mean_concordance_reads", mode="before"
    )(fraction_to_percent)

    _validate_percent_mode_concordance_reads = field_validator(
        "percent_mode_concordance_reads", mode="before"
    )(fraction_to_percent)
7 changes: 7 additions & 0 deletions cg/utils/calculations.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
"""Module to hold functions for calculations."""


def fraction_to_percent(value: float) -> float:
    """Return the value as a percentage when it is a fraction between 0 and 1 (inclusive).

    Values outside that range are returned unchanged.
    """
    is_fraction: bool = 0.0 <= value <= 1.0
    return value * 100 if is_fraction else value


def multiply_by_million(number: float | int) -> int:
"""Multiply a given number by a million."""
return int(number * 1_000_000)
23 changes: 18 additions & 5 deletions tests/fixture_plugins/pacbio_fixtures/metrics_fixtures.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
from typing import Any

import pytest

from cg.constants.pacbio import CCSAttributeIDs
from cg.services.pacbio.metrics.models import HiFiMetrics
from cg.constants.pacbio import CCSAttributeIDs, ControlAttributeIDs
from cg.services.pacbio.metrics.models import ControlMetrics, HiFiMetrics


@pytest.fixture
def pac_bio_hifi_metrics():
data = {
def pac_bio_hifi_metrics() -> HiFiMetrics:
data: dict[str, Any] = {
CCSAttributeIDs.NUMBER_OF_READS: 6580977,
CCSAttributeIDs.TOTAL_NUMBER_OF_BASES: 106192944185,
CCSAttributeIDs.MEAN_READ_LENGTH: 16136,
Expand All @@ -15,4 +17,15 @@ def pac_bio_hifi_metrics():
CCSAttributeIDs.MEDIAN_ACCURACY: "Q34",
CCSAttributeIDs.PERCENT_Q30: 0.9318790946286002,
}
return HiFiMetrics(**data)
return HiFiMetrics.model_validate(data, from_attributes=True)


@pytest.fixture
def pac_bio_control_metrics() -> ControlMetrics:
    """Return a control metrics object with the expected parsed control metrics."""
    return ControlMetrics.model_validate(
        {
            ControlAttributeIDs.NUMBER_OF_READS: 2750,
            ControlAttributeIDs.MEAN_READ_LENGTH: 57730,
            ControlAttributeIDs.PERCENT_MEAN_READ_CONCORDANCE: 0.906334,
            ControlAttributeIDs.PERCENT_MODE_READ_CONCORDANCE: 0.91,
        },
        from_attributes=True,
    )
6 changes: 6 additions & 0 deletions tests/fixture_plugins/pacbio_fixtures/path_fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,9 @@ def pac_bio_run_statistics_dir(pac_bio_smrt_cell_dir: Path) -> Path:
def pac_bio_css_report(pac_bio_run_statistics_dir: Path) -> Path:
"""Return the path to the PacBio CSS report."""
return Path(pac_bio_run_statistics_dir, PacBioDirsAndFiles.BASECALLING_REPORT)


@pytest.fixture
def pac_bio_control_report(pac_bio_run_statistics_dir: Path) -> Path:
    """Return the path to the control report inside the PacBio run statistics directory."""
    control_report_name = PacBioDirsAndFiles.CONTROL_REPORT
    return Path(pac_bio_run_statistics_dir, control_report_name)
Original file line number Diff line number Diff line change
@@ -1,45 +1,91 @@
from pathlib import Path
from typing import Any
from typing import Any, Callable

import pytest
from _pytest.fixtures import FixtureRequest

from cg.constants.pacbio import CCSAttributeIDs, ControlAttributeIDs
from cg.services.pacbio.metrics.metrics_parser import MetricsParser
from cg.services.pacbio.metrics.models import HiFiMetrics
from cg.services.pacbio.metrics.models import ControlMetrics, HiFiMetrics


def test_metrics_parser_initialisation(pac_bio_smrt_cell_dir: Path):
    """Test that the metrics parser holds its parsed metrics models after initialisation."""
    # GIVEN a PacBio SMRT cell path

    # WHEN initialising the metrics parser
    metrics_parser = MetricsParser(smrt_cell_path=pac_bio_smrt_cell_dir)

    # THEN each metrics attribute of the parser is an instance of its model
    expected_models = {"hifi_metrics": HiFiMetrics, "control_metrics": ControlMetrics}
    for attribute_name, metrics_model in expected_models.items():
        assert isinstance(getattr(metrics_parser, attribute_name), metrics_model)


@pytest.mark.parametrize(
"report_file_path",
["pac_bio_control_report", "pac_bio_css_report"],
)
def test_parse_attributes_from_json(
pac_bio_metrics_parser: MetricsParser,
pac_bio_css_report: Path,
report_file_path: str,
request: FixtureRequest,
):
"""Test to parse the attributes from a PacBio JSON file."""
# GIVEN a PacBio JSON file and a PacBio metrics parser initialised from the path
"""Test the parsing of attributes from any PacBio report file."""
# GIVEN a PacBio report file and a PacBio metrics parser initialised from the SMRTcell path
report_file: Path = request.getfixturevalue(report_file_path)

# WHEN parsing the attributes from the JSON file
attributes: list[dict[str, Any]] = pac_bio_metrics_parser._parse_report(
json_file=pac_bio_css_report
)
# WHEN parsing the report file
attributes: list[dict[str, Any]] = pac_bio_metrics_parser._parse_report(report_file=report_file)

# THEN assert that the attributes are parsed correctly
# THEN assert that the report attributes are parsed correctly
assert isinstance(attributes, list)
assert isinstance(attributes[0], dict)
assert "id" in attributes[0]
assert "value" in attributes[0]


@pytest.mark.parametrize(
"report_file_path, model, metrics_fixture, percent_fields",
[
(
"pac_bio_control_report",
ControlMetrics,
"pac_bio_control_metrics",
[
ControlAttributeIDs.PERCENT_MEAN_READ_CONCORDANCE,
ControlAttributeIDs.PERCENT_MODE_READ_CONCORDANCE,
],
),
("pac_bio_css_report", HiFiMetrics, "pac_bio_hifi_metrics", [CCSAttributeIDs.PERCENT_Q30]),
],
)
def test_parse_attributes_to_model(
pac_bio_metrics_parser: MetricsParser,
pac_bio_css_report: Path,
pac_bio_hifi_metrics: HiFiMetrics,
report_file_path: str,
model: Callable,
metrics_fixture: str,
percent_fields: list[str],
request: FixtureRequest,
):
"""Test to parse the attributes to a HiFi model."""
# GIVEN a PacBio JSON file
"""Test to parse the attributes to a metrics model."""
# GIVEN a metrics parser

# GIVEN a pac-bio report file
report_file: Path = request.getfixturevalue(report_file_path)

# GIVEN a metrics object with the expected parsed metrics
expected_metrics: ControlMetrics | HiFiMetrics = request.getfixturevalue(metrics_fixture)

# WHEN parsing the attributes to a model
parsed_hifi_metrics = pac_bio_metrics_parser.parse_attributes_to_model(
json_file=pac_bio_css_report,
model=HiFiMetrics,
# WHEN parsing the attributes to a given metrics model
parsed_metrics: ControlMetrics | HiFiMetrics = pac_bio_metrics_parser.parse_attributes_to_model(
report_file=report_file,
data_model=model,
)

# THEN assert that the attributes are parsed to a model correctly
assert parsed_hifi_metrics == pac_bio_hifi_metrics
# THEN assert that the model attributes are the expected ones
assert parsed_metrics == expected_metrics

# THEN assert that the percentage is not taken as a fraction
assert parsed_hifi_metrics.percent_q30 > 1
# THEN assert that the percentage fields of the model are not taken as a fraction
metrics_dict: dict = parsed_metrics.dict(by_alias=True)
for percent_field in percent_fields:
assert metrics_dict.get(percent_field) > 1
22 changes: 21 additions & 1 deletion tests/utils/test_calculations.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,26 @@
"""Tests for the calculations module."""

from cg.utils.calculations import multiply_by_million
import pytest

from cg.utils.calculations import fraction_to_percent, multiply_by_million


@pytest.mark.parametrize(
    "fraction, expected",
    [
        (0.50, 50.00),
        (0.001, 0.1),
        (2, 2),
    ],
)
def test_fraction_to_percent(fraction: float, expected: float):
    """Test that fractions become percentages and values above one are left unchanged."""
    # GIVEN a value that may be a fraction

    # WHEN passing the value through the fraction to percent conversion
    converted: float = fraction_to_percent(fraction)

    # THEN the result should match the expected percentage
    assert converted == expected


def test_multiple_by_a_million():
Expand Down

0 comments on commit 155a504

Please sign in to comment.