Parse metrics from smrtlink datasets file (#3436)(patch)

## Description Closes Clinical-Genomics/add-new-tech#63 Parses the final metrics for PacBio. This last metrics file has a different structure from the previous ones: the desired metrics are not inside an `attributes` section of the JSON, but in the only existing section. Parsing of the other metrics had to be modified to account for the parsing of this file and to make it more DRY. ### Added - Metrics model - Function to parse specifically the smrtlink-datasets file in the metrics parser - Constants for metrics names - Fixture for parsed metrics and fixture file ### Changed - Merged the `_parse_report` and `parse_attributes_to_model` methods of the metrics parser into one, called `parse_report_to_model`. - Updated the tests of these functions ### Fixed - Introduced a `TypeVar` from the `typing` module, bound to the parent class (`BaseModel` from Pydantic), so that the different metrics classes can be type-hinted with a single generic type.
Clinical-Genomics · Jul 17, 2024 · 755c712 · 755c712
1 parent ff34ed9
commit 755c712
Show file tree

Hide file tree

Showing 6 changed files with 144 additions and 60 deletions.
diff --git a/cg/constants/pacbio.py b/cg/constants/pacbio.py
@@ -7,6 +7,7 @@ class PacBioDirsAndFiles:
     CONTROL_REPORT: str = "control.report.json"
     LOADING_REPORT: str = "loading.report.json"
     RAW_DATA_REPORT: str = "raw_data.report.json"
+    SMRTLINK_DATASETS_REPORT: str = "smrtlink-datasets.json"
 
 
 class CCSAttributeIDs:
@@ -38,3 +39,13 @@ class PolymeraseDataAttributeIDs:
     READ_LENGTH_N50: str = "raw_data_report.read_n50"
     MEAN_LONGEST_SUBREAD_LENGTH: str = "raw_data_report.insert_length"
     LONGEST_SUBREAD_LENGTH_N50: str = "raw_data_report.insert_n50"
+
+
+class SmrtLinkDatabasesIDs:
+    """Keys of the entries in the SMRTlink datasets report (smrtlink-datasets.json)."""
+
+    # NOTE(review): the class name says "Databases" while the report is the "datasets" file
+    BIO_SAMPLE_NAME: str = "bioSampleName"
+    CELL_ID: str = "cellId"
+    CELL_INDEX: str = "cellIndex"
+    # The movie name is stored under the "metadataContextId" key of the report
+    MOVIE_NAME: str = "metadataContextId"
+    PATH: str = "path"
+    WELL_NAME: str = "wellName"
+    WELL_SAMPLE_NAME: str = "wellSampleName"
diff --git a/cg/services/pacbio/metrics/metrics_parser.py b/cg/services/pacbio/metrics/metrics_parser.py
@@ -5,10 +5,12 @@
 from cg.constants.pacbio import PacBioDirsAndFiles
 from cg.io.controller import ReadFile
 from cg.services.pacbio.metrics.models import (
+    BaseMetrics,
     ControlMetrics,
     HiFiMetrics,
     PolymeraseMetrics,
     ProductivityMetrics,
+    SmrtlinkDatasetsMetrics,
 )
 from cg.utils.files import get_file_in_directory
 
@@ -23,43 +25,48 @@ def __init__(self, smrt_cell_path: Path) -> None:
         self.base_calling_report_file: Path = get_file_in_directory(
             directory=self.report_dir, file_name=PacBioDirsAndFiles.BASECALLING_REPORT
         )
-        self.hifi_metrics: HiFiMetrics = self.parse_attributes_to_model(
+        self.hifi_metrics: HiFiMetrics = self.parse_report_to_model(
             report_file=self.base_calling_report_file, data_model=HiFiMetrics
         )
         # For control metrics
         self.control_report_file: Path = get_file_in_directory(
             directory=self.report_dir, file_name=PacBioDirsAndFiles.CONTROL_REPORT
         )
-        self.control_metrics: ControlMetrics = self.parse_attributes_to_model(
+        self.control_metrics: ControlMetrics = self.parse_report_to_model(
             report_file=self.control_report_file, data_model=ControlMetrics
         )
         # For productivity metrics
         self.loading_report_file: Path = get_file_in_directory(
             directory=self.report_dir, file_name=PacBioDirsAndFiles.LOADING_REPORT
         )
-        self.productivity_metrics: ProductivityMetrics = self.parse_attributes_to_model(
+        self.productivity_metrics: ProductivityMetrics = self.parse_report_to_model(
             report_file=self.loading_report_file, data_model=ProductivityMetrics
         )
         # For polymerase metrics
         self.raw_data_report_file: Path = get_file_in_directory(
             directory=self.report_dir, file_name=PacBioDirsAndFiles.RAW_DATA_REPORT
         )
-        self.polymerase_metrics: PolymeraseMetrics = self.parse_attributes_to_model(
+        self.polymerase_metrics: PolymeraseMetrics = self.parse_report_to_model(
             report_file=self.raw_data_report_file, data_model=PolymeraseMetrics
         )
+        # For SMRTlink datasets metrics
+        self.smrtlink_datasets_report_file: Path = get_file_in_directory(
+            directory=self.report_dir, file_name=PacBioDirsAndFiles.SMRTLINK_DATASETS_REPORT
+        )
+        self.smrtlink_datasets_metrics: SmrtlinkDatasetsMetrics = (
+            self.parse_smrtlink_datasets_file()
+        )
 
     @staticmethod
-    def _parse_report(report_file: Path) -> list[dict[str, Any]]:
-        """Parse the attribute element of a PacBio report file in JSON format."""
+    def parse_report_to_model(report_file: Path, data_model: Type[BaseMetrics]) -> BaseMetrics:
+        """Parse the metrics report to a data model."""
         parsed_json: dict = ReadFile.read_file[FileFormat.JSON](file_path=report_file)
-        return parsed_json.get("attributes")
-
-    def parse_attributes_to_model(
-        self,
-        report_file: Path,
-        data_model: Type[ControlMetrics | HiFiMetrics | PolymeraseMetrics | ProductivityMetrics],
-    ) -> ControlMetrics | HiFiMetrics | PolymeraseMetrics | ProductivityMetrics:
-        """Parse the attributes to a model."""
-        report_content: list[dict[str, Any]] = self._parse_report(report_file=report_file)
-        data: dict = {report_field["id"]: report_field["value"] for report_field in report_content}
+        metrics: list[dict[str, Any]] = parsed_json.get("attributes")
+        data: dict = {report_field["id"]: report_field["value"] for report_field in metrics}
         return data_model.model_validate(data, from_attributes=True)
+
+    def parse_smrtlink_datasets_file(self) -> SmrtlinkDatasetsMetrics:
+        """Parse the SMRTlink datasets report file.
+
+        Unlike the other reports, the metrics are not inside an 'attributes' element: the
+        content of the file is a list, and the metrics are the keys of its first entry.
+        """
+        parsed_json: list[dict[str, Any]] = ReadFile.read_file[FileFormat.JSON](
+            file_path=self.smrtlink_datasets_report_file
+        )
+        # Only the first entry of the list is used
+        data: dict[str, Any] = parsed_json[0]
+        return SmrtlinkDatasetsMetrics.model_validate(data, from_attributes=True)
diff --git a/cg/services/pacbio/metrics/models.py b/cg/services/pacbio/metrics/models.py
@@ -1,13 +1,19 @@
-from pydantic import BaseModel, Field, field_validator
+import re
+from typing import Any, TypeVar
+
+from pydantic import BaseModel, Field, field_validator, model_validator
 
 from cg.constants.pacbio import (
     CCSAttributeIDs,
     ControlAttributeIDs,
     LoadingAttributesIDs,
     PolymeraseDataAttributeIDs,
+    SmrtLinkDatabasesIDs,
 )
 from cg.utils.calculations import divide_by_thousand_with_one_decimal, fraction_to_percent
 
+BaseMetrics = TypeVar("BaseMetrics", bound=BaseModel)
+
 
 class HiFiMetrics(BaseModel):
     """Model for the HiFi metrics."""
@@ -101,3 +107,28 @@ class PolymeraseMetrics(BaseModel):
     _validate_longest_subread_length_n50 = field_validator(
         "longest_subread_length_n50", mode="before"
     )(divide_by_thousand_with_one_decimal)
+
+
+class SmrtlinkDatasetsMetrics(BaseModel):
+    """Model to parse metrics in the SMRTlink datasets report.
+
+    All fields except `plate` are read from the report entry through their alias.
+    `plate` has no alias; it is derived from the dataset path before validation.
+    """
+
+    device_internal_id: str = Field(..., alias=SmrtLinkDatabasesIDs.CELL_ID)
+    well: str = Field(..., alias=SmrtLinkDatabasesIDs.WELL_NAME)
+    well_sample_name: str = Field(..., alias=SmrtLinkDatabasesIDs.WELL_SAMPLE_NAME)
+    sample_internal_id: str = Field(..., alias=SmrtLinkDatabasesIDs.BIO_SAMPLE_NAME)
+    movie_name: str = Field(..., alias=SmrtLinkDatabasesIDs.MOVIE_NAME)
+    cell_index: int = Field(..., alias=SmrtLinkDatabasesIDs.CELL_INDEX)
+    path: str = Field(..., alias=SmrtLinkDatabasesIDs.PATH)
+    plate: int
+
+    @model_validator(mode="before")
+    @classmethod
+    def extract_plate(cls, data: Any) -> Any:
+        """Add the plate number, taken from the '/<plate>_<well>' part of the path, to the input.
+
+        The input is returned unchanged when it is not a dictionary, when it has no path or
+        when the path does not match the pattern. Since `plate` is a required field, the
+        model validation then fails on the missing value.
+        """
+        if not isinstance(data, dict):
+            return data
+        path: str | None = data.get(SmrtLinkDatabasesIDs.PATH)
+        if not path:
+            return data
+        match: re.Match | None = re.search(r"/([12])_[ABCD]01", path)
+        if not match:
+            return data
+        # Return an extended copy instead of mutating the dictionary owned by the caller
+        return {**data, "plate": int(match.group(1))}
diff --git a/tests/fixture_plugins/pacbio_fixtures/metrics_fixtures.py b/tests/fixture_plugins/pacbio_fixtures/metrics_fixtures.py
@@ -7,12 +7,14 @@
     ControlAttributeIDs,
     LoadingAttributesIDs,
     PolymeraseDataAttributeIDs,
+    SmrtLinkDatabasesIDs,
 )
 from cg.services.pacbio.metrics.models import (
     ControlMetrics,
     HiFiMetrics,
     PolymeraseMetrics,
     ProductivityMetrics,
+    SmrtlinkDatasetsMetrics,
 )
 
 
@@ -61,3 +63,17 @@ def pac_bio_polymerase_metrics() -> PolymeraseMetrics:
         PolymeraseDataAttributeIDs.LONGEST_SUBREAD_LENGTH_N50: 22250,
     }
     return PolymeraseMetrics.model_validate(data, from_attributes=True)
+
+
+@pytest.fixture
+def pac_bio_smrtlink_databases_metrics() -> SmrtlinkDatasetsMetrics:
+    """Return the SMRTlink datasets metrics expected for the 1_A01 SMRT cell fixture."""
+    report_entry: dict[str, Any] = {
+        SmrtLinkDatabasesIDs.CELL_ID: "EA094834",
+        SmrtLinkDatabasesIDs.WELL_NAME: "A01",
+        SmrtLinkDatabasesIDs.WELL_SAMPLE_NAME: "1247014000119",
+        SmrtLinkDatabasesIDs.BIO_SAMPLE_NAME: "1247014000119",
+        SmrtLinkDatabasesIDs.MOVIE_NAME: "m84202_240522_135641_s1",
+        SmrtLinkDatabasesIDs.CELL_INDEX: 0,
+        SmrtLinkDatabasesIDs.PATH: "/srv/cg_data/pacbio/r84202_20240522_133539/1_A01/pb_formats/m84202_240522_135641_s1.hifi_reads.consensusreadset.xml",
+    }
+    return SmrtlinkDatasetsMetrics.model_validate(report_entry, from_attributes=True)
diff --git a/...s/devices/pacbio/SMRTcells/r84202_20240522_133539/1_A01/statistics/smrtlink-datasets.json b/...s/devices/pacbio/SMRTcells/r84202_20240522_133539/1_A01/statistics/smrtlink-datasets.json
@@ -0,0 +1,33 @@
+[
+  {
+    "id": -1,
+    "uuid": "a662e714-45f2-4cf3-b8d2-c138e1deebd0",
+    "name": "1247014000119-Cell1",
+    "path": "/srv/cg_data/pacbio/r84202_20240522_133539/1_A01/pb_formats/m84202_240522_135641_s1.hifi_reads.consensusreadset.xml",
+    "createdAt": "2024-05-24T02:21:20.970Z",
+    "updatedAt": "2024-05-24T02:21:20.970Z",
+    "importedAt": "2024-05-24T02:21:20.970Z",
+    "numRecords": 6580977,
+    "totalLength": 106275091861,
+    "version": "3.0.1",
+    "comments": "Record generated by runqc-reports",
+    "tags": "ccs",
+    "md5": "b1e427a733653fddbad7c273996637f0",
+    "instrumentName": "Wilma",
+    "instrumentControlVersion": "13.0.0.212033",
+    "metadataContextId": "m84202_240522_135641_s1",
+    "wellSampleName": "1247014000119",
+    "wellName": "A01",
+    "bioSampleName": "1247014000119",
+    "cellIndex": 0,
+    "cellId": "EA094834",
+    "runName": "Run 240515",
+    "createdBy": "admin",
+    "jobId": -99,
+    "projectId": -99,
+    "isActive": true,
+    "numChildren": 0,
+    "numResources": 1,
+    "datasetType": "PacBio.DataSet.ConsensusReadSet"
+  }
+]
diff --git a/tests/services/pacbio_services/pacbio_metrics_service/test_pacbio_metrics_service.py b/tests/services/pacbio_services/pacbio_metrics_service/test_pacbio_metrics_service.py
@@ -1,17 +1,19 @@
 import math
 from pathlib import Path
-from typing import Any, Type
+from typing import Type
 
 import pytest
 from _pytest.fixtures import FixtureRequest
 
 from cg.constants.pacbio import CCSAttributeIDs, ControlAttributeIDs
 from cg.services.pacbio.metrics.metrics_parser import MetricsParser
 from cg.services.pacbio.metrics.models import (
+    BaseMetrics,
     ControlMetrics,
     HiFiMetrics,
     PolymeraseMetrics,
     ProductivityMetrics,
+    SmrtlinkDatasetsMetrics,
 )
 
 
@@ -22,39 +24,12 @@ def test_metrics_parser_initialisation(pac_bio_smrt_cell_dir: Path):
     # WHEN initialising the metrics parser
     parser = MetricsParser(smrt_cell_path=pac_bio_smrt_cell_dir)
 
-    # THEN assert that the parser is initialised with the expected attributes
+    # THEN the parser is initialised with the expected attributes
     assert isinstance(parser.hifi_metrics, HiFiMetrics)
     assert isinstance(parser.control_metrics, ControlMetrics)
     assert isinstance(parser.productivity_metrics, ProductivityMetrics)
     assert isinstance(parser.polymerase_metrics, PolymeraseMetrics)
-
-
-@pytest.mark.parametrize(
-    "report_file_path",
-    [
-        "pac_bio_control_report",
-        "pac_bio_css_report",
-        "pac_bio_loading_report",
-        "pac_bio_raw_data_report",
-    ],
-)
-def test_parse_attributes_from_json(
-    pac_bio_metrics_parser: MetricsParser,
-    report_file_path: str,
-    request: FixtureRequest,
-):
-    """Test the parsing of attributes from any PacBio report file."""
-    # GIVEN a PacBio report file and a PacBio metrics parser initialised from the SMRTcell path
-    report_file: Path = request.getfixturevalue(report_file_path)
-
-    # WHEN parsing the report file
-    attributes: list[dict[str, Any]] = pac_bio_metrics_parser._parse_report(report_file=report_file)
-
-    # THEN assert that the report attributes are parsed correctly
-    assert isinstance(attributes, list)
-    assert isinstance(attributes[0], dict)
-    assert "id" in attributes[0]
-    assert "value" in attributes[0]
+    assert isinstance(parser.smrtlink_datasets_metrics, SmrtlinkDatasetsMetrics)
 
 
 @pytest.mark.parametrize(
@@ -75,10 +50,10 @@ def test_parse_attributes_from_json(
     ],
     ids=["Control", "Hi-Fi", "Polymerase", "Productivity"],
 )
-def test_parse_attributes_to_model(
+def test_parse_report_to_model(
     pac_bio_metrics_parser: MetricsParser,
     report_file_path: str,
-    model: Type[ControlMetrics | HiFiMetrics | ProductivityMetrics],
+    model: Type[BaseMetrics],
     metrics_fixture: str,
     percent_fields: list[str],
     request: FixtureRequest,
@@ -90,27 +65,38 @@ def test_parse_attributes_to_model(
     report_file: Path = request.getfixturevalue(report_file_path)
 
     # GIVEN a metrics object with the expected parsed metrics
-    expected_metrics: ControlMetrics | HiFiMetrics | ProductivityMetrics = request.getfixturevalue(
-        metrics_fixture
-    )
+    expected_metrics: BaseMetrics = request.getfixturevalue(metrics_fixture)
 
     # WHEN parsing the attributes to a given metrics model
-    parsed_metrics: ControlMetrics | HiFiMetrics | ProductivityMetrics = (
-        pac_bio_metrics_parser.parse_attributes_to_model(
-            report_file=report_file,
-            data_model=model,
-        )
+    parsed_metrics: BaseMetrics = pac_bio_metrics_parser.parse_report_to_model(
+        report_file=report_file, data_model=model
     )
 
-    # THEN assert that the model attributes are the expected ones
+    # THEN the model attributes are the expected ones
     assert parsed_metrics == expected_metrics
 
-    # THEN assert that the percentage fields of the model are not taken as a fraction
+    # THEN the percentage fields of the model are not taken as a fraction
     metrics_dict: dict = parsed_metrics.dict(by_alias=True)
     for percent_field in percent_fields:
         assert metrics_dict.get(percent_field) > 1
 
 
+def test_parse_smrtlink_datasets_file(
+    pac_bio_metrics_parser: MetricsParser,
+    pac_bio_smrtlink_databases_metrics: SmrtlinkDatasetsMetrics,
+):
+    """Test that the SMRTlink datasets file is parsed into the expected metrics model."""
+    # GIVEN a metrics parser and the expected SMRTlink datasets metrics
+
+    # WHEN parsing the SMRTlink datasets file
+    parsed_metrics: SmrtlinkDatasetsMetrics = pac_bio_metrics_parser.parse_smrtlink_datasets_file()
+
+    # THEN the parsed metrics are the expected ones
+    assert parsed_metrics == pac_bio_smrtlink_databases_metrics
+
+
 def test_productivity_metrics_percentage_attributes(
     pac_bio_productivity_metrics: ProductivityMetrics,
 ):
@@ -122,7 +108,7 @@ def test_productivity_metrics_percentage_attributes(
     percentage_p_1: float = pac_bio_productivity_metrics.percentage_p_1
     percentage_p_2: float = pac_bio_productivity_metrics.percentage_p_2
 
-    # THEN assert that the percentage attributes are calculated correctly
+    # THEN the percentage attributes are calculated correctly
     assert math.isclose(percentage_p_0, 40, abs_tol=1e-9)
     assert math.isclose(percentage_p_1, 60, abs_tol=1e-9)
     assert math.isclose(percentage_p_2, 0, abs_tol=1e-9)