Parse Pac-Bio metrics from control report (#3409)(patch)

## Description

Closes Clinical-Genomics/add-new-tech#45

### Added

- Pydantic model for the control metrics
- Constants with the attribute names of the control report file
- Unit test for percent function
- Fixtures for control report file and metrics object

### Changed

- Moved the percent field validator to a separate function to be more DRY
- Updated test of metrics parser to work with control model

---------

Co-authored-by: ChristianOertlin <[email protected]>
Clinical-Genomics · Jul 8, 2024 · 155a504 · 155a504
1 parent 168d2bf
commit 155a504
Show file tree

Hide file tree

Showing 9 changed files with 170 additions and 49 deletions.
diff --git a/cg/constants/pacbio.py b/cg/constants/pacbio.py
@@ -17,3 +17,10 @@ class CCSAttributeIDs:
     READ_LENGTH_N50: str = "ccs2.ccs_readlength_n50"
     MEDIAN_ACCURACY: str = "ccs2.median_accuracy"
     PERCENT_Q30: str = "ccs2.percent_ccs_bases_q30"
+
+
+class ControlAttributeIDs:
+    NUMBER_OF_READS: str = "control.reads_n"
+    MEAN_READ_LENGTH: str = "control.readlength_mean"
+    PERCENT_MEAN_READ_CONCORDANCE: str = "control.concordance_mean"
+    PERCENT_MODE_READ_CONCORDANCE: str = "control.concordance_mode"
diff --git a/cg/services/illumina_services/illumina_metrics_service/bcl_convert_metrics_parser.py b/cg/services/illumina_services/illumina_metrics_service/bcl_convert_metrics_parser.py
@@ -2,10 +2,9 @@
 
 import logging
 from pathlib import Path
-from typing import Callable
+from typing import Type
 
 from cg.apps.demultiplex.sample_sheet.validators import is_valid_sample_internal_id
-
 from cg.constants.constants import SCALE_TO_READ_PAIRS, FileFormat
 from cg.constants.demultiplexing import UNDETERMINED
 from cg.constants.metrics import (
@@ -52,7 +51,7 @@ def __init__(
 
     @staticmethod
     def parse_metrics_file(
-        metrics_file_path, metrics_model: Callable
+        metrics_file_path, metrics_model: Type[SequencingQualityMetrics | DemuxMetrics]
     ) -> list[SequencingQualityMetrics | DemuxMetrics]:
         """Parse specified metrics file."""
         LOG.info(f"Parsing BCLConvert metrics file: {metrics_file_path}")

diff --git a/cg/services/pacbio/metrics/metrics_parser.py b/cg/services/pacbio/metrics/metrics_parser.py
@@ -1,10 +1,10 @@
 from pathlib import Path
-from typing import Any, Callable
+from typing import Any, Type
 
 from cg.constants.constants import FileFormat
 from cg.constants.pacbio import PacBioDirsAndFiles
 from cg.io.controller import ReadFile
-from cg.services.pacbio.metrics.models import HiFiMetrics
+from cg.services.pacbio.metrics.models import ControlMetrics, HiFiMetrics
 from cg.utils.files import get_file_in_directory
 
 
@@ -15,13 +15,19 @@ def __init__(self, smrt_cell_path: Path) -> None:
         self.smrt_cell_path: Path = smrt_cell_path
         self.report_dir = Path(smrt_cell_path, "statistics")
         # For HiFi metrics
-        self.css_report_file: Path = get_file_in_directory(
+        self.base_calling_report_file: Path = get_file_in_directory(
             directory=self.report_dir, file_name=PacBioDirsAndFiles.BASECALLING_REPORT
         )
+        self.hifi_metrics: HiFiMetrics = self.parse_attributes_to_model(
+            report_file=self.base_calling_report_file, data_model=HiFiMetrics
+        )
         # For control metrics
         self.control_report_file: Path = get_file_in_directory(
             directory=self.report_dir, file_name=PacBioDirsAndFiles.CONTROL_REPORT
         )
+        self.control_metrics: ControlMetrics = self.parse_attributes_to_model(
+            report_file=self.control_report_file, data_model=ControlMetrics
+        )
         # For productivity metrics
         self.loading_report_file: Path = get_file_in_directory(
             directory=self.report_dir, file_name=PacBioDirsAndFiles.LOADING_REPORT
@@ -30,18 +36,17 @@ def __init__(self, smrt_cell_path: Path) -> None:
         self.raw_data_report_file: Path = get_file_in_directory(
             directory=self.report_dir, file_name=PacBioDirsAndFiles.RAW_DATA_REPORT
         )
-        self.hifi_metrics: HiFiMetrics = self.parse_attributes_to_model(
-            json_file=self.css_report_file, model=HiFiMetrics
-        )
 
     @staticmethod
-    def _parse_report(json_file: Path) -> list[dict[str, Any]]:
-        """Parse the attribute element of a PacBio JSON file."""
-        parsed_json: dict = ReadFile.read_file[FileFormat.JSON](file_path=json_file)
+    def _parse_report(report_file: Path) -> list[dict[str, Any]]:
+        """Parse the attribute element of a PacBio report file in JSON format."""
+        parsed_json: dict = ReadFile.read_file[FileFormat.JSON](file_path=report_file)
         return parsed_json.get("attributes")
 
-    def parse_attributes_to_model(self, json_file: Path, model: Callable) -> HiFiMetrics:
+    def parse_attributes_to_model(
+        self, report_file: Path, data_model: Type[ControlMetrics | HiFiMetrics]
+    ) -> ControlMetrics | HiFiMetrics:
         """Parse the attributes to a model."""
-        report_content: list[dict[str, Any]] = self._parse_report(json_file=json_file)
+        report_content: list[dict[str, Any]] = self._parse_report(report_file=report_file)
         data: dict = {report_field["id"]: report_field["value"] for report_field in report_content}
-        return model(**data)
+        return data_model.model_validate(data, from_attributes=True)
diff --git a/cg/services/pacbio/metrics/models.py b/cg/services/pacbio/metrics/models.py
@@ -1,6 +1,7 @@
 from pydantic import BaseModel, Field, field_validator
 
-from cg.constants.pacbio import CCSAttributeIDs
+from cg.constants.pacbio import CCSAttributeIDs, ControlAttributeIDs
+from cg.utils.calculations import fraction_to_percent
 
 
 class HiFiMetrics(BaseModel):
@@ -14,8 +15,25 @@ class HiFiMetrics(BaseModel):
     median_read_quality: str = Field(..., alias=CCSAttributeIDs.MEDIAN_ACCURACY)
     percent_q30: float = Field(..., alias=CCSAttributeIDs.PERCENT_Q30)
 
-    @field_validator("percent_q30", mode="before")
-    def transform_percent_q30(cls, value: float) -> float:
-        if 0.0 <= value <= 1.0:
-            value *= 100
-        return value
+    _validate_percent_q30 = field_validator("percent_q30", mode="before")(fraction_to_percent)
+
+
+class ControlMetrics(BaseModel):
+    """Model for the control metrics."""
+
+    reads: int = Field(..., alias=ControlAttributeIDs.NUMBER_OF_READS)
+    mean_read_length: int = Field(..., alias=ControlAttributeIDs.MEAN_READ_LENGTH)
+    percent_mean_concordance_reads: float = Field(
+        ..., alias=ControlAttributeIDs.PERCENT_MEAN_READ_CONCORDANCE
+    )
+    percent_mode_concordance_reads: float = Field(
+        ..., alias=ControlAttributeIDs.PERCENT_MODE_READ_CONCORDANCE
+    )
+
+    _validate_percent_mean_concordance_reads = field_validator(
+        "percent_mean_concordance_reads", mode="before"
+    )(fraction_to_percent)
+
+    _validate_percent_mode_concordance_reads = field_validator(
+        "percent_mode_concordance_reads", mode="before"
+    )(fraction_to_percent)
diff --git a/cg/utils/calculations.py b/cg/utils/calculations.py
@@ -1,6 +1,13 @@
 """Module to hold functions for calculations."""
 
 
+def fraction_to_percent(value: float) -> float:
+    """Convert a fraction to a percentage."""
+    if 0.0 <= value <= 1.0:
+        value *= 100
+    return value
+
+
 def multiply_by_million(number: float | int) -> int:
     """Multiply a given number by a million."""
     return int(number * 1_000_000)
diff --git a/tests/fixture_plugins/pacbio_fixtures/metrics_fixtures.py b/tests/fixture_plugins/pacbio_fixtures/metrics_fixtures.py
@@ -1,12 +1,14 @@
+from typing import Any
+
 import pytest
 
-from cg.constants.pacbio import CCSAttributeIDs
-from cg.services.pacbio.metrics.models import HiFiMetrics
+from cg.constants.pacbio import CCSAttributeIDs, ControlAttributeIDs
+from cg.services.pacbio.metrics.models import ControlMetrics, HiFiMetrics
 
 
 @pytest.fixture
-def pac_bio_hifi_metrics():
-    data = {
+def pac_bio_hifi_metrics() -> HiFiMetrics:
+    data: dict[str, Any] = {
         CCSAttributeIDs.NUMBER_OF_READS: 6580977,
         CCSAttributeIDs.TOTAL_NUMBER_OF_BASES: 106192944185,
         CCSAttributeIDs.MEAN_READ_LENGTH: 16136,
@@ -15,4 +17,15 @@ def pac_bio_hifi_metrics():
         CCSAttributeIDs.MEDIAN_ACCURACY: "Q34",
         CCSAttributeIDs.PERCENT_Q30: 0.9318790946286002,
     }
-    return HiFiMetrics(**data)
+    return HiFiMetrics.model_validate(data, from_attributes=True)
+
+
+@pytest.fixture
+def pac_bio_control_metrics() -> ControlMetrics:
+    data: dict[str, Any] = {
+        ControlAttributeIDs.NUMBER_OF_READS: 2750,
+        ControlAttributeIDs.MEAN_READ_LENGTH: 57730,
+        ControlAttributeIDs.PERCENT_MEAN_READ_CONCORDANCE: 0.906334,
+        ControlAttributeIDs.PERCENT_MODE_READ_CONCORDANCE: 0.91,
+    }
+    return ControlMetrics.model_validate(data, from_attributes=True)
diff --git a/tests/fixture_plugins/pacbio_fixtures/path_fixtures.py b/tests/fixture_plugins/pacbio_fixtures/path_fixtures.py
@@ -45,3 +45,9 @@ def pac_bio_run_statistics_dir(pac_bio_smrt_cell_dir: Path) -> Path:
 def pac_bio_css_report(pac_bio_run_statistics_dir: Path) -> Path:
     """Return the path to the PacBio CSS report."""
     return Path(pac_bio_run_statistics_dir, PacBioDirsAndFiles.BASECALLING_REPORT)
+
+
+@pytest.fixture
+def pac_bio_control_report(pac_bio_run_statistics_dir: Path) -> Path:
+    """Return the path to the PacBio control report."""
+    return Path(pac_bio_run_statistics_dir, PacBioDirsAndFiles.CONTROL_REPORT)
diff --git a/tests/services/pacbio_services/pacbio_metrics_service/test_pacbio_metrics_service.py b/tests/services/pacbio_services/pacbio_metrics_service/test_pacbio_metrics_service.py
@@ -1,45 +1,91 @@
 from pathlib import Path
-from typing import Any
+from typing import Any, Callable
 
+import pytest
+from _pytest.fixtures import FixtureRequest
+
+from cg.constants.pacbio import CCSAttributeIDs, ControlAttributeIDs
 from cg.services.pacbio.metrics.metrics_parser import MetricsParser
-from cg.services.pacbio.metrics.models import HiFiMetrics
+from cg.services.pacbio.metrics.models import ControlMetrics, HiFiMetrics
+
+
+def test_metrics_parser_initialisation(pac_bio_smrt_cell_dir: Path):
+    """Test the initialisation of the metrics parser."""
+    # GIVEN a PacBio SMRT cell path
+
+    # WHEN initialising the metrics parser
+    parser = MetricsParser(smrt_cell_path=pac_bio_smrt_cell_dir)
 
+    # THEN assert that the parser is initialised with the expected attributes
+    assert isinstance(parser.hifi_metrics, HiFiMetrics)
+    assert isinstance(parser.control_metrics, ControlMetrics)
 
+
+@pytest.mark.parametrize(
+    "report_file_path",
+    ["pac_bio_control_report", "pac_bio_css_report"],
+)
 def test_parse_attributes_from_json(
     pac_bio_metrics_parser: MetricsParser,
-    pac_bio_css_report: Path,
+    report_file_path: str,
+    request: FixtureRequest,
 ):
-    """Test to parse the attributes from a PacBio JSON file."""
-    # GIVEN a PacBio JSON file and a PacBio metrics parser initialised from the path
+    """Test the parsing of attributes from any PacBio report file."""
+    # GIVEN a PacBio report file and a PacBio metrics parser initialised from the SMRTcell path
+    report_file: Path = request.getfixturevalue(report_file_path)
 
-    # WHEN parsing the attributes from the JSON file
-    attributes: list[dict[str, Any]] = pac_bio_metrics_parser._parse_report(
-        json_file=pac_bio_css_report
-    )
+    # WHEN parsing the report file
+    attributes: list[dict[str, Any]] = pac_bio_metrics_parser._parse_report(report_file=report_file)
 
-    # THEN assert that the attributes are parsed correctly
+    # THEN assert that the report attributes are parsed correctly
     assert isinstance(attributes, list)
     assert isinstance(attributes[0], dict)
     assert "id" in attributes[0]
     assert "value" in attributes[0]
 
 
+@pytest.mark.parametrize(
+    "report_file_path, model, metrics_fixture, percent_fields",
+    [
+        (
+            "pac_bio_control_report",
+            ControlMetrics,
+            "pac_bio_control_metrics",
+            [
+                ControlAttributeIDs.PERCENT_MEAN_READ_CONCORDANCE,
+                ControlAttributeIDs.PERCENT_MODE_READ_CONCORDANCE,
+            ],
+        ),
+        ("pac_bio_css_report", HiFiMetrics, "pac_bio_hifi_metrics", [CCSAttributeIDs.PERCENT_Q30]),
+    ],
+)
 def test_parse_attributes_to_model(
     pac_bio_metrics_parser: MetricsParser,
-    pac_bio_css_report: Path,
-    pac_bio_hifi_metrics: HiFiMetrics,
+    report_file_path: str,
+    model: Callable,
+    metrics_fixture: str,
+    percent_fields: list[str],
+    request: FixtureRequest,
 ):
-    """Test to parse the attributes to a HiFi model."""
-    # GIVEN a PacBio JSON file
+    """Test to parse the attributes to a metrics model."""
+    # GIVEN a metrics parser
+
+    # GIVEN a pac-bio report file
+    report_file: Path = request.getfixturevalue(report_file_path)
+
+    # GIVEN a metrics object with the expected parsed metrics
+    expected_metrics: ControlMetrics | HiFiMetrics = request.getfixturevalue(metrics_fixture)
 
-    # WHEN parsing the attributes to a model
-    parsed_hifi_metrics = pac_bio_metrics_parser.parse_attributes_to_model(
-        json_file=pac_bio_css_report,
-        model=HiFiMetrics,
+    # WHEN parsing the attributes to a given metrics model
+    parsed_metrics: ControlMetrics | HiFiMetrics = pac_bio_metrics_parser.parse_attributes_to_model(
+        report_file=report_file,
+        data_model=model,
     )
 
-    # THEN assert that the attributes are parsed to a model correctly
-    assert parsed_hifi_metrics == pac_bio_hifi_metrics
+    # THEN assert that the model attributes are the expected ones
+    assert parsed_metrics == expected_metrics
 
-    # THEN assert that the percentage is not taken as a fraction
-    assert parsed_hifi_metrics.percent_q30 > 1
+    # THEN assert that the percentage fields of the model are not taken as a fraction
+    metrics_dict: dict = parsed_metrics.dict(by_alias=True)
+    for percent_field in percent_fields:
+        assert metrics_dict.get(percent_field) > 1
diff --git a/tests/utils/test_calculations.py b/tests/utils/test_calculations.py
@@ -1,6 +1,26 @@
 """Tests for the calculations module."""
 
-from cg.utils.calculations import multiply_by_million
+import pytest
+
+from cg.utils.calculations import fraction_to_percent, multiply_by_million
+
+
+@pytest.mark.parametrize(
+    "fraction, expected",
+    [
+        (0.50, 50.00),
+        (0.001, 0.1),
+        (2, 2),
+    ],
+)
+def test_fraction_to_percent(fraction: float, expected: float):
+    # GIVEN a fraction
+
+    # WHEN converting the fraction to a percentage
+    percentage: float = fraction_to_percent(fraction)
+
+    # THEN the fraction should be converted to a percentage
+    assert percentage == expected
 
 
 def test_multiple_by_a_million():