From 755c712ae05f11580703cb99eba20d64fcf3aa4d Mon Sep 17 00:00:00 2001
From: Sebastian Diaz <juan.sebastian.diaz.boada@scilifelab.se>
Date: Wed, 17 Jul 2024 12:45:49 +0200
Subject: [PATCH] Parse metrics from SMRTlink datasets file (#3436)(patch)

## Description
Closes https://github.com/Clinical-Genomics/add-new-tech/issues/63
Parses the last remaining PacBio metrics file, `smrtlink-datasets.json`. Its structure differs from that of the other reports: the desired metrics are not inside an `attributes` section, but directly in the single element of the top-level JSON list. The parsing of the other reports was refactored at the same time to accommodate this file and to make the parser more DRY.

### Added

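- Metrics model for the SMRTlink datasets report (`SmrtlinkDatasetsMetrics`)
- Method in the metrics parser that specifically parses the `smrtlink-datasets.json` file
- Constants for the names of the SMRTlink datasets metrics
- Fixture for the parsed SMRTlink datasets metrics, and a `smrtlink-datasets.json` fixture file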

### Changed

- Merged `_parse_report` and `parse_attributes_to_model` methods of the metrics parser into one, called `parse_report_to_model`.
- Updated tests of these functions

### Fixed

- Used a `TypeVar` from the `typing` module, bound to the parent class (`BaseModel` from Pydantic), to type-hint the different metrics classes without having to list each one.
---
 cg/constants/pacbio.py                        | 11 +++
 cg/services/pacbio/metrics/metrics_parser.py  | 39 +++++-----
 cg/services/pacbio/metrics/models.py          | 33 ++++++++-
 .../pacbio_fixtures/metrics_fixtures.py       | 16 +++++
 .../1_A01/statistics/smrtlink-datasets.json   | 33 +++++++++
 .../test_pacbio_metrics_service.py            | 72 ++++++++-----------
 6 files changed, 144 insertions(+), 60 deletions(-)
 create mode 100644 tests/fixtures/devices/pacbio/SMRTcells/r84202_20240522_133539/1_A01/statistics/smrtlink-datasets.json

diff --git a/cg/constants/pacbio.py b/cg/constants/pacbio.py
index 042d96b3ec..61f8580fa5 100644
--- a/cg/constants/pacbio.py
+++ b/cg/constants/pacbio.py
@@ -7,6 +7,7 @@ class PacBioDirsAndFiles:
     CONTROL_REPORT: str = "control.report.json"
     LOADING_REPORT: str = "loading.report.json"
     RAW_DATA_REPORT: str = "raw_data.report.json"
+    SMRTLINK_DATASETS_REPORT: str = "smrtlink-datasets.json"
 
 
 class CCSAttributeIDs:
@@ -38,3 +39,13 @@ class PolymeraseDataAttributeIDs:
     READ_LENGTH_N50: str = "raw_data_report.read_n50"
     MEAN_LONGEST_SUBREAD_LENGTH: str = "raw_data_report.insert_length"
     LONGEST_SUBREAD_LENGTH_N50: str = "raw_data_report.insert_n50"
+
+
+class SmrtLinkDatabasesIDs:
+    BIO_SAMPLE_NAME: str = "bioSampleName"
+    CELL_ID: str = "cellId"
+    CELL_INDEX: str = "cellIndex"
+    MOVIE_NAME: str = "metadataContextId"
+    PATH: str = "path"
+    WELL_NAME: str = "wellName"
+    WELL_SAMPLE_NAME: str = "wellSampleName"
diff --git a/cg/services/pacbio/metrics/metrics_parser.py b/cg/services/pacbio/metrics/metrics_parser.py
index 1b693e78a8..989dbcf9e7 100644
--- a/cg/services/pacbio/metrics/metrics_parser.py
+++ b/cg/services/pacbio/metrics/metrics_parser.py
@@ -5,10 +5,12 @@
 from cg.constants.pacbio import PacBioDirsAndFiles
 from cg.io.controller import ReadFile
 from cg.services.pacbio.metrics.models import (
+    BaseMetrics,
     ControlMetrics,
     HiFiMetrics,
     PolymeraseMetrics,
     ProductivityMetrics,
+    SmrtlinkDatasetsMetrics,
 )
 from cg.utils.files import get_file_in_directory
 
@@ -23,43 +25,48 @@ def __init__(self, smrt_cell_path: Path) -> None:
         self.base_calling_report_file: Path = get_file_in_directory(
             directory=self.report_dir, file_name=PacBioDirsAndFiles.BASECALLING_REPORT
         )
-        self.hifi_metrics: HiFiMetrics = self.parse_attributes_to_model(
+        self.hifi_metrics: HiFiMetrics = self.parse_report_to_model(
             report_file=self.base_calling_report_file, data_model=HiFiMetrics
         )
         # For control metrics
         self.control_report_file: Path = get_file_in_directory(
             directory=self.report_dir, file_name=PacBioDirsAndFiles.CONTROL_REPORT
         )
-        self.control_metrics: ControlMetrics = self.parse_attributes_to_model(
+        self.control_metrics: ControlMetrics = self.parse_report_to_model(
             report_file=self.control_report_file, data_model=ControlMetrics
         )
         # For productivity metrics
         self.loading_report_file: Path = get_file_in_directory(
             directory=self.report_dir, file_name=PacBioDirsAndFiles.LOADING_REPORT
         )
-        self.productivity_metrics: ProductivityMetrics = self.parse_attributes_to_model(
+        self.productivity_metrics: ProductivityMetrics = self.parse_report_to_model(
             report_file=self.loading_report_file, data_model=ProductivityMetrics
         )
         # For polymerase metrics
         self.raw_data_report_file: Path = get_file_in_directory(
             directory=self.report_dir, file_name=PacBioDirsAndFiles.RAW_DATA_REPORT
         )
-        self.polymerase_metrics: PolymeraseMetrics = self.parse_attributes_to_model(
+        self.polymerase_metrics: PolymeraseMetrics = self.parse_report_to_model(
             report_file=self.raw_data_report_file, data_model=PolymeraseMetrics
         )
+        # For SMRTlink datasets metrics
+        self.smrtlink_datasets_report_file: Path = get_file_in_directory(
+            directory=self.report_dir, file_name=PacBioDirsAndFiles.SMRTLINK_DATASETS_REPORT
+        )
+        self.smrtlink_datasets_metrics: SmrtlinkDatasetsMetrics = (
+            self.parse_smrtlink_datasets_file()
+        )
 
     @staticmethod
-    def _parse_report(report_file: Path) -> list[dict[str, Any]]:
-        """Parse the attribute element of a PacBio report file in JSON format."""
+    def parse_report_to_model(report_file: Path, data_model: Type[BaseMetrics]) -> BaseMetrics:
+        """Parse the metrics report to a data model."""
         parsed_json: dict = ReadFile.read_file[FileFormat.JSON](file_path=report_file)
-        return parsed_json.get("attributes")
-
-    def parse_attributes_to_model(
-        self,
-        report_file: Path,
-        data_model: Type[ControlMetrics | HiFiMetrics | PolymeraseMetrics | ProductivityMetrics],
-    ) -> ControlMetrics | HiFiMetrics | PolymeraseMetrics | ProductivityMetrics:
-        """Parse the attributes to a model."""
-        report_content: list[dict[str, Any]] = self._parse_report(report_file=report_file)
-        data: dict = {report_field["id"]: report_field["value"] for report_field in report_content}
+        metrics: list[dict[str, Any]] = parsed_json.get("attributes")
+        data: dict = {report_field["id"]: report_field["value"] for report_field in metrics}
         return data_model.model_validate(data, from_attributes=True)
+
+    def parse_smrtlink_datasets_file(self) -> SmrtlinkDatasetsMetrics:
+        """Parse the SMRTlink datasets report file."""
+        parsed_json: dict = ReadFile.read_file[FileFormat.JSON](self.smrtlink_datasets_report_file)
+        data: dict = parsed_json[0]
+        return SmrtlinkDatasetsMetrics.model_validate(data, from_attributes=True)
diff --git a/cg/services/pacbio/metrics/models.py b/cg/services/pacbio/metrics/models.py
index 1655466099..d82f2095f4 100644
--- a/cg/services/pacbio/metrics/models.py
+++ b/cg/services/pacbio/metrics/models.py
@@ -1,13 +1,19 @@
-from pydantic import BaseModel, Field, field_validator
+import re
+from typing import Any, TypeVar
+
+from pydantic import BaseModel, Field, field_validator, model_validator
 
 from cg.constants.pacbio import (
     CCSAttributeIDs,
     ControlAttributeIDs,
     LoadingAttributesIDs,
     PolymeraseDataAttributeIDs,
+    SmrtLinkDatabasesIDs,
 )
 from cg.utils.calculations import divide_by_thousand_with_one_decimal, fraction_to_percent
 
+BaseMetrics = TypeVar("BaseMetrics", bound=BaseModel)
+
 
 class HiFiMetrics(BaseModel):
     """Model for the HiFi metrics."""
@@ -101,3 +107,28 @@ class PolymeraseMetrics(BaseModel):
     _validate_longest_subread_length_n50 = field_validator(
         "longest_subread_length_n50", mode="before"
     )(divide_by_thousand_with_one_decimal)
+
+
+class SmrtlinkDatasetsMetrics(BaseModel):
+    """Model to parse metrics in the SMRTlink datasets report."""
+
+    device_internal_id: str = Field(..., alias=SmrtLinkDatabasesIDs.CELL_ID)
+    well: str = Field(..., alias=SmrtLinkDatabasesIDs.WELL_NAME)
+    well_sample_name: str = Field(..., alias=SmrtLinkDatabasesIDs.WELL_SAMPLE_NAME)
+    sample_internal_id: str = Field(..., alias=SmrtLinkDatabasesIDs.BIO_SAMPLE_NAME)
+    movie_name: str = Field(..., alias=SmrtLinkDatabasesIDs.MOVIE_NAME)
+    cell_index: int = Field(..., alias=SmrtLinkDatabasesIDs.CELL_INDEX)
+    path: str = Field(..., alias=SmrtLinkDatabasesIDs.PATH)
+    plate: int
+
+    @model_validator(mode="before")
+    @classmethod
+    def extract_plate(cls, data: Any):
+        if isinstance(data, dict):
+            path = data.get("path")
+            if path:
+                pattern = r"/([12])_[ABCD]01"
+                match = re.search(pattern, path)
+                if match:
+                    data["plate"] = match.group(1)
+        return data
diff --git a/tests/fixture_plugins/pacbio_fixtures/metrics_fixtures.py b/tests/fixture_plugins/pacbio_fixtures/metrics_fixtures.py
index 93d566c795..57face56d3 100644
--- a/tests/fixture_plugins/pacbio_fixtures/metrics_fixtures.py
+++ b/tests/fixture_plugins/pacbio_fixtures/metrics_fixtures.py
@@ -7,12 +7,14 @@
     ControlAttributeIDs,
     LoadingAttributesIDs,
     PolymeraseDataAttributeIDs,
+    SmrtLinkDatabasesIDs,
 )
 from cg.services.pacbio.metrics.models import (
     ControlMetrics,
     HiFiMetrics,
     PolymeraseMetrics,
     ProductivityMetrics,
+    SmrtlinkDatasetsMetrics,
 )
 
 
@@ -61,3 +63,17 @@ def pac_bio_polymerase_metrics() -> PolymeraseMetrics:
         PolymeraseDataAttributeIDs.LONGEST_SUBREAD_LENGTH_N50: 22250,
     }
     return PolymeraseMetrics.model_validate(data, from_attributes=True)
+
+
+@pytest.fixture
+def pac_bio_smrtlink_databases_metrics() -> SmrtlinkDatasetsMetrics:
+    data: dict[str, Any] = {
+        SmrtLinkDatabasesIDs.BIO_SAMPLE_NAME: "1247014000119",
+        SmrtLinkDatabasesIDs.CELL_ID: "EA094834",
+        SmrtLinkDatabasesIDs.CELL_INDEX: 0,
+        SmrtLinkDatabasesIDs.MOVIE_NAME: "m84202_240522_135641_s1",
+        SmrtLinkDatabasesIDs.PATH: "/srv/cg_data/pacbio/r84202_20240522_133539/1_A01/pb_formats/m84202_240522_135641_s1.hifi_reads.consensusreadset.xml",
+        SmrtLinkDatabasesIDs.WELL_NAME: "A01",
+        SmrtLinkDatabasesIDs.WELL_SAMPLE_NAME: "1247014000119",
+    }
+    return SmrtlinkDatasetsMetrics.model_validate(data, from_attributes=True)
diff --git a/tests/fixtures/devices/pacbio/SMRTcells/r84202_20240522_133539/1_A01/statistics/smrtlink-datasets.json b/tests/fixtures/devices/pacbio/SMRTcells/r84202_20240522_133539/1_A01/statistics/smrtlink-datasets.json
new file mode 100644
index 0000000000..21f62e3565
--- /dev/null
+++ b/tests/fixtures/devices/pacbio/SMRTcells/r84202_20240522_133539/1_A01/statistics/smrtlink-datasets.json
@@ -0,0 +1,33 @@
+[
+  {
+    "id": -1,
+    "uuid": "a662e714-45f2-4cf3-b8d2-c138e1deebd0",
+    "name": "1247014000119-Cell1",
+    "path": "/srv/cg_data/pacbio/r84202_20240522_133539/1_A01/pb_formats/m84202_240522_135641_s1.hifi_reads.consensusreadset.xml",
+    "createdAt": "2024-05-24T02:21:20.970Z",
+    "updatedAt": "2024-05-24T02:21:20.970Z",
+    "importedAt": "2024-05-24T02:21:20.970Z",
+    "numRecords": 6580977,
+    "totalLength": 106275091861,
+    "version": "3.0.1",
+    "comments": "Record generated by runqc-reports",
+    "tags": "ccs",
+    "md5": "b1e427a733653fddbad7c273996637f0",
+    "instrumentName": "Wilma",
+    "instrumentControlVersion": "13.0.0.212033",
+    "metadataContextId": "m84202_240522_135641_s1",
+    "wellSampleName": "1247014000119",
+    "wellName": "A01",
+    "bioSampleName": "1247014000119",
+    "cellIndex": 0,
+    "cellId": "EA094834",
+    "runName": "Run 240515",
+    "createdBy": "admin",
+    "jobId": -99,
+    "projectId": -99,
+    "isActive": true,
+    "numChildren": 0,
+    "numResources": 1,
+    "datasetType": "PacBio.DataSet.ConsensusReadSet"
+  }
+]
\ No newline at end of file
diff --git a/tests/services/pacbio_services/pacbio_metrics_service/test_pacbio_metrics_service.py b/tests/services/pacbio_services/pacbio_metrics_service/test_pacbio_metrics_service.py
index 463200272e..8f27933466 100644
--- a/tests/services/pacbio_services/pacbio_metrics_service/test_pacbio_metrics_service.py
+++ b/tests/services/pacbio_services/pacbio_metrics_service/test_pacbio_metrics_service.py
@@ -1,6 +1,6 @@
 import math
 from pathlib import Path
-from typing import Any, Type
+from typing import Type
 
 import pytest
 from _pytest.fixtures import FixtureRequest
@@ -8,10 +8,12 @@
 from cg.constants.pacbio import CCSAttributeIDs, ControlAttributeIDs
 from cg.services.pacbio.metrics.metrics_parser import MetricsParser
 from cg.services.pacbio.metrics.models import (
+    BaseMetrics,
     ControlMetrics,
     HiFiMetrics,
     PolymeraseMetrics,
     ProductivityMetrics,
+    SmrtlinkDatasetsMetrics,
 )
 
 
@@ -22,39 +24,12 @@ def test_metrics_parser_initialisation(pac_bio_smrt_cell_dir: Path):
     # WHEN initialising the metrics parser
     parser = MetricsParser(smrt_cell_path=pac_bio_smrt_cell_dir)
 
-    # THEN assert that the parser is initialised with the expected attributes
+    # THEN the parser is initialised with the expected attributes
     assert isinstance(parser.hifi_metrics, HiFiMetrics)
     assert isinstance(parser.control_metrics, ControlMetrics)
     assert isinstance(parser.productivity_metrics, ProductivityMetrics)
     assert isinstance(parser.polymerase_metrics, PolymeraseMetrics)
-
-
-@pytest.mark.parametrize(
-    "report_file_path",
-    [
-        "pac_bio_control_report",
-        "pac_bio_css_report",
-        "pac_bio_loading_report",
-        "pac_bio_raw_data_report",
-    ],
-)
-def test_parse_attributes_from_json(
-    pac_bio_metrics_parser: MetricsParser,
-    report_file_path: str,
-    request: FixtureRequest,
-):
-    """Test the parsing of attributes from any PacBio report file."""
-    # GIVEN a PacBio report file and a PacBio metrics parser initialised from the SMRTcell path
-    report_file: Path = request.getfixturevalue(report_file_path)
-
-    # WHEN parsing the report file
-    attributes: list[dict[str, Any]] = pac_bio_metrics_parser._parse_report(report_file=report_file)
-
-    # THEN assert that the report attributes are parsed correctly
-    assert isinstance(attributes, list)
-    assert isinstance(attributes[0], dict)
-    assert "id" in attributes[0]
-    assert "value" in attributes[0]
+    assert isinstance(parser.smrtlink_datasets_metrics, SmrtlinkDatasetsMetrics)
 
 
 @pytest.mark.parametrize(
@@ -75,10 +50,10 @@ def test_parse_attributes_from_json(
     ],
     ids=["Control", "Hi-Fi", "Polymerase", "Productivity"],
 )
-def test_parse_attributes_to_model(
+def test_parse_report_to_model(
     pac_bio_metrics_parser: MetricsParser,
     report_file_path: str,
-    model: Type[ControlMetrics | HiFiMetrics | ProductivityMetrics],
+    model: Type[BaseMetrics],
     metrics_fixture: str,
     percent_fields: list[str],
     request: FixtureRequest,
@@ -90,27 +65,38 @@ def test_parse_attributes_to_model(
     report_file: Path = request.getfixturevalue(report_file_path)
 
     # GIVEN a metrics object with the expected parsed metrics
-    expected_metrics: ControlMetrics | HiFiMetrics | ProductivityMetrics = request.getfixturevalue(
-        metrics_fixture
-    )
+    expected_metrics: BaseMetrics = request.getfixturevalue(metrics_fixture)
 
     # WHEN parsing the attributes to a given metrics model
-    parsed_metrics: ControlMetrics | HiFiMetrics | ProductivityMetrics = (
-        pac_bio_metrics_parser.parse_attributes_to_model(
-            report_file=report_file,
-            data_model=model,
-        )
+    parsed_metrics: BaseMetrics = pac_bio_metrics_parser.parse_report_to_model(
+        report_file=report_file, data_model=model
     )
 
-    # THEN assert that the model attributes are the expected ones
+    # THEN the model attributes are the expected ones
     assert parsed_metrics == expected_metrics
 
-    # THEN assert that the percentage fields of the model are not taken as a fraction
+    # THEN the percentage fields of the model are not taken as a fraction
     metrics_dict: dict = parsed_metrics.dict(by_alias=True)
     for percent_field in percent_fields:
         assert metrics_dict.get(percent_field) > 1
 
 
+def test_parse_smrtlink_datasets_file(
+    pac_bio_metrics_parser: MetricsParser,
+    pac_bio_smrtlink_databases_metrics: SmrtlinkDatasetsMetrics,
+):
+    """Test to parse the SMRTlink datasets file."""
+    # GIVEN a metrics parser
+
+    # WHEN parsing the SMRTlink datasets file
+    smrtlink_datasets_metrics: SmrtlinkDatasetsMetrics = (
+        pac_bio_metrics_parser.parse_smrtlink_datasets_file()
+    )
+
+    # THEN the parsed metrics are the expected ones
+    assert smrtlink_datasets_metrics == pac_bio_smrtlink_databases_metrics
+
+
 def test_productivity_metrics_percentage_attributes(
     pac_bio_productivity_metrics: ProductivityMetrics,
 ):
@@ -122,7 +108,7 @@ def test_productivity_metrics_percentage_attributes(
     percentage_p_1: float = pac_bio_productivity_metrics.percentage_p_1
     percentage_p_2: float = pac_bio_productivity_metrics.percentage_p_2
 
-    # THEN assert that the percentage attributes are calculated correctly
+    # THEN the percentage attributes are calculated correctly
     assert math.isclose(percentage_p_0, 40, abs_tol=1e-9)
     assert math.isclose(percentage_p_1, 60, abs_tol=1e-9)
     assert math.isclose(percentage_p_2, 0, abs_tol=1e-9)