From e3739c0c87e3ac07f14b2fdf930e70b835961d47 Mon Sep 17 00:00:00 2001 From: ChristianOertlin Date: Thu, 23 May 2024 11:19:47 +0200 Subject: [PATCH] refactor(demux post processing run parameters) (#3263) (patch) # Descripion Refactoring of the run parameters and extraction of flow cell model from run paramters. --- cg/constants/metrics.py | 2 + cg/models/demultiplex/run_parameters.py | 39 +++++++++++++---- .../bcl_convert_metrics_service.py | 23 ++++++++++ .../illumina_post_processing_service.py | 2 +- .../illumina_post_processing_service/utils.py | 13 ------ cg/store/crud/create.py | 7 +++ .../run_parameters_fixtures.py | 4 +- .../demultiplexing/test_run_parameters.py | 28 +++++++++++- .../conftest.py | 26 ----------- .../test_illumina_post_processing_utils.py | 43 ------------------- 10 files changed, 93 insertions(+), 94 deletions(-) delete mode 100644 tests/services/illumina_post_processing_service/test_illumina_post_processing_utils.py diff --git a/cg/constants/metrics.py b/cg/constants/metrics.py index 7a4b320a20..a85e443b26 100644 --- a/cg/constants/metrics.py +++ b/cg/constants/metrics.py @@ -10,6 +10,8 @@ class QualityMetricsColumnNames(StrEnum): SAMPLE_INTERNAL_ID: str = "SampleID" MEAN_QUALITY_SCORE_Q30: str = "Mean Quality Score (PF)" Q30_BASES_PERCENT: str = "% Q30" + YIELD: str = "Yield" + YIELD_Q30: str = "YieldQ30" class DemuxMetricsColumnNames(StrEnum): diff --git a/cg/models/demultiplex/run_parameters.py b/cg/models/demultiplex/run_parameters.py index 4bb9143e67..87c84f1bf0 100644 --- a/cg/models/demultiplex/run_parameters.py +++ b/cg/models/demultiplex/run_parameters.py @@ -3,7 +3,7 @@ import logging from abc import abstractmethod from pathlib import Path -from xml.etree import ElementTree +from xml.etree.ElementTree import Element, ElementTree from packaging.version import parse @@ -44,9 +44,7 @@ def _validate_instrument(self, node_name: str, node_value: str): Raises: RunParametersError if the node does not have the expected value.""" try: - application: ElementTree.Element | None = get_tree_node( - tree=self.tree, node_name=node_name - ) + application: Element | None = get_tree_node(tree=self.tree, node_name=node_name) except XMLError: raise RunParametersError( f"Could not find node {node_name} in the run parameters file. " @@ -129,6 +127,10 @@ def _get_index_settings(self) -> IndexSettings: return NOVASEQ_6000_POST_1_5_KITS_INDEX_SETTINGS return NO_REVERSE_COMPLEMENTS_INDEX_SETTINGS + @abstractmethod + def get_flow_cell_model(self): + pass + def __str__(self): return f"RunParameters(path={self.path}, sequencer={self.sequencer})" @@ -189,6 +191,10 @@ def get_read_2_cycles(self) -> int: node_name: str = RunParametersXMLNodes.READ_2_HISEQ return self._get_node_integer_value(node_name=node_name) + def get_flow_cell_model(self) -> None: + """Return None for run parameters associated with HiSeq sequencing.""" + return None + class RunParametersNovaSeq6000(RunParameters): """Specific class for parsing run parameters of NovaSeq6000 sequencing.""" @@ -210,7 +216,7 @@ def control_software_version(self) -> str: def reagent_kit_version(self) -> str: """Return the reagent kit version if existent, return 'unknown' otherwise.""" node_name: str = RunParametersXMLNodes.REAGENT_KIT_VERSION - xml_node: ElementTree.Element | None = self.tree.find(node_name) + xml_node: Element | None = self.tree.find(node_name) if xml_node is None: LOG.warning("Could not determine reagent kit version") LOG.info("Set reagent kit version to 'unknown'") @@ -242,6 +248,11 @@ def get_read_2_cycles(self) -> int: node_name: str = RunParametersXMLNodes.READ_2_NOVASEQ_6000 return self._get_node_integer_value(node_name=node_name) + def get_flow_cell_model(self) -> str: + """Return the flow cell model referred to as 'FlowCellMode' in the run parameters file.""" + node_name: str = RunParametersXMLNodes.FLOW_CELL_MODE + return self._get_node_string_value(node_name=node_name) + class RunParametersNovaSeqX(RunParameters): """Specific class for parsing run parameters of NovaSeqX sequencing.""" @@ -272,12 +283,10 @@ def sequencer(self) -> str: def _read_parser(self) -> dict[str, int]: """Return read and index cycle values parsed as a dictionary.""" cycle_mapping: dict[str, int] = {} - planned_reads_tree: ElementTree.Element = get_tree_node( + planned_reads_tree: Element = get_tree_node( tree=self.tree, node_name=RunParametersXMLNodes.PLANNED_READS_NOVASEQ_X ) - planned_reads: list[ElementTree.Element] = planned_reads_tree.findall( - RunParametersXMLNodes.INNER_READ - ) + planned_reads: list[Element] = planned_reads_tree.findall(RunParametersXMLNodes.INNER_READ) for read_elem in planned_reads: read_name: str = read_elem.get(RunParametersXMLNodes.READ_NAME) cycles: int = int(read_elem.get(RunParametersXMLNodes.CYCLES)) @@ -299,3 +308,15 @@ def get_read_1_cycles(self) -> int: def get_read_2_cycles(self) -> int: """Return the number of cycles in the second read.""" return self._read_parser.get(RunParametersXMLNodes.READ_2_NOVASEQ_X) + + def get_flow_cell_model(self) -> str: + """Return the flow cell model referred to as 'Mode' or 'Name' in the run parameters file.""" + consumable_infos: list[Element] = self.tree.findall(".//ConsumableInfo") + for consumable_info in consumable_infos: + type_element: Element | None = consumable_info.find("Type") + if type_element is not None and type_element.text == "FlowCell": + name_element: Element | None = consumable_info.find("Name") or consumable_info.find( + "Mode" + ) + if name_element is not None: + return name_element.text diff --git a/cg/services/bcl_convert_metrics_service/bcl_convert_metrics_service.py b/cg/services/bcl_convert_metrics_service/bcl_convert_metrics_service.py index ab3f186df3..11724d01fb 100644 --- a/cg/services/bcl_convert_metrics_service/bcl_convert_metrics_service.py +++ b/cg/services/bcl_convert_metrics_service/bcl_convert_metrics_service.py @@ -111,3 +111,26 @@ def create_sample_run_metrics( base_mean_quality_score=mean_quality_score, created_at=datetime.now(), ) + + def create_sample_sequencing_metrics_for_flow_cell( + self, + flow_cell_directory: Path, + run_metrics_id: int, + store: Store, + ) -> list[IlluminaSampleSequencingMetrics]: + """Parse the demultiplexing metrics data into the sequencing statistics model.""" + metrics_parser = MetricsParser(flow_cell_directory) + sample_internal_ids: list[str] = metrics_parser.get_sample_internal_ids() + sample_lane_sequencing_metrics: list[IlluminaSampleSequencingMetrics] = [] + + for sample_internal_id in sample_internal_ids: + for lane in metrics_parser.get_lanes_for_sample(sample_internal_id=sample_internal_id): + metrics: IlluminaSampleSequencingMetrics = self.create_sample_run_metrics( + sample_internal_id=sample_internal_id, + lane=lane, + metrics_parser=metrics_parser, + run_metrics_id=run_metrics_id, + store=store, + ) + sample_lane_sequencing_metrics.append(metrics) + return sample_lane_sequencing_metrics diff --git a/cg/services/illumina_post_processing_service/illumina_post_processing_service.py b/cg/services/illumina_post_processing_service/illumina_post_processing_service.py index 3b02987437..8f49724ac3 100644 --- a/cg/services/illumina_post_processing_service/illumina_post_processing_service.py +++ b/cg/services/illumina_post_processing_service/illumina_post_processing_service.py @@ -36,7 +36,7 @@ def store_illumina_flow_cell( Create flow cell from the parsed and validated flow cell data. And add the samples on the flow cell to the model. """ - model: str | None = get_flow_cell_model_from_run_parameters(flow_cell.run_parameters_path) + model: str | None = flow_cell.run_parameters.get_flow_cell_model() new_flow_cell = IlluminaFlowCell( internal_id=flow_cell.id, type=DeviceType.ILLUMINA, model=model ) diff --git a/cg/services/illumina_post_processing_service/utils.py b/cg/services/illumina_post_processing_service/utils.py index 2b2874dd7a..f070fdbca0 100644 --- a/cg/services/illumina_post_processing_service/utils.py +++ b/cg/services/illumina_post_processing_service/utils.py @@ -10,16 +10,3 @@ def create_delivery_file_in_flow_cell_directory(flow_cell_directory: Path) -> None: Path(flow_cell_directory, DemultiplexingDirsAndFiles.DELIVERY).touch() - - -def get_flow_cell_model_from_run_parameters(run_parameters_path: Path) -> str | None: - """Return the model of the flow cell.""" - xml_tree: ElementTree = read_xml(run_parameters_path) - node: Element | None = None - for node_name in [RunParametersXMLNodes.MODE, RunParametersXMLNodes.FLOW_CELL_MODE]: - try: - node: Element = get_tree_node(xml_tree, node_name) - return node.text - except XMLError: - continue - return node diff --git a/cg/store/crud/create.py b/cg/store/crud/create.py index addc51a051..b8920f5fd9 100644 --- a/cg/store/crud/create.py +++ b/cg/store/crud/create.py @@ -31,6 +31,7 @@ User, order_case, IlluminaFlowCell, + IlluminaSequencingRun, ) LOG = logging.getLogger(__name__) @@ -425,3 +426,9 @@ def add_illumina_flow_cell(self, flow_cell: IlluminaFlowCell) -> IlluminaFlowCel session.add(flow_cell) LOG.debug(f"Flow cell added to status db: {flow_cell.id}.") return flow_cell + + def add_illumina_sequencing_metrics( + self, sequencing_metrics: IlluminaSequencingRun + ) -> IlluminaSequencingRun: + """Add a new Illumina flow cell to the status database as a pending transaction.""" + pass diff --git a/tests/fixture_plugins/demultiplex_fixtures/run_parameters_fixtures.py b/tests/fixture_plugins/demultiplex_fixtures/run_parameters_fixtures.py index 32b92c8b00..f5694f15a4 100644 --- a/tests/fixture_plugins/demultiplex_fixtures/run_parameters_fixtures.py +++ b/tests/fixture_plugins/demultiplex_fixtures/run_parameters_fixtures.py @@ -26,7 +26,9 @@ def run_parameters_novaseq_6000_different_index( @pytest.fixture(scope="function") -def run_parameters_novaseq_x_different_index(run_parameters_dir: Path) -> RunParametersNovaSeqX: +def run_parameters_novaseq_x_different_index( + run_parameters_dir: Path, +) -> RunParametersNovaSeqX: """Return a NovaSeqX RunParameters object with different index cycles.""" path = Path(run_parameters_dir, "RunParameters_novaseq_X_different_index_cycles.xml") return RunParametersNovaSeqX(run_parameters_path=path) diff --git a/tests/models/demultiplexing/test_run_parameters.py b/tests/models/demultiplexing/test_run_parameters.py index dad052215b..d99bbf072d 100644 --- a/tests/models/demultiplexing/test_run_parameters.py +++ b/tests/models/demultiplexing/test_run_parameters.py @@ -51,7 +51,11 @@ def test_run_parameters_parent_class_fails( "run_parameters_path, constructor, sequencer", [ ("hiseq_x_single_index_run_parameters_path", RunParametersHiSeq, Sequencers.HISEQX), - ("hiseq_2500_dual_index_run_parameters_path", RunParametersHiSeq, Sequencers.HISEQGA), + ( + "hiseq_2500_dual_index_run_parameters_path", + RunParametersHiSeq, + Sequencers.HISEQGA, + ), ( "novaseq_6000_run_parameters_pre_1_5_kits_path", RunParametersNovaSeq6000, @@ -301,3 +305,25 @@ def test_get_index_settings( settings: IndexSettings = flow_cell.run_parameters.index_settings # THEN the correct index settings are returned assert settings == correct_settings + + +@pytest.mark.parametrize( + "run_parameters_fixture, expected_result", + [ + ("hiseq_x_single_index_run_parameters", None), + ("hiseq_2500_dual_index_run_parameters", None), + ("novaseq_6000_run_parameters_pre_1_5_kits", "S4"), + ("novaseq_6000_run_parameters_post_1_5_kits", "S1"), + ("novaseq_x_run_parameters", "10B"), + ], +) +def test_get_flow_cell_mode( + run_parameters_fixture: str, expected_result: str | None, request: FixtureRequest +): + """Test that the correct flow cell mode is returned for each RunParameters object.""" + # GIVEN a RunParameters object + run_parameters: RunParameters = request.getfixturevalue(run_parameters_fixture) + # WHEN getting the flow cell mode + result: str | None = run_parameters.get_flow_cell_model() + # THEN the correct flow cell mode is returned + assert result == expected_result diff --git a/tests/services/illumina_post_processing_service/conftest.py b/tests/services/illumina_post_processing_service/conftest.py index f530692a81..1c27c857de 100644 --- a/tests/services/illumina_post_processing_service/conftest.py +++ b/tests/services/illumina_post_processing_service/conftest.py @@ -3,29 +3,3 @@ from pathlib import Path import pytest - - -@pytest.fixture -def run_paramters_with_flow_cell_mode_node_name() -> Path: - return Path( - "tests", - "fixtures", - "apps", - "demultiplexing", - "flow_cells", - "230912_A00187_1009_AHK33MDRX3", - "RunParameters.xml", - ) - - -@pytest.fixture -def run_parameters_without_model() -> Path: - return Path( - "tests", - "fixtures", - "apps", - "demultiplexing", - "flow_cells", - "170517_ST-E00266_0210_BHJCFFALXX", - "runParameters.xml", - ) diff --git a/tests/services/illumina_post_processing_service/test_illumina_post_processing_utils.py b/tests/services/illumina_post_processing_service/test_illumina_post_processing_utils.py deleted file mode 100644 index b4e6ed7cc7..0000000000 --- a/tests/services/illumina_post_processing_service/test_illumina_post_processing_utils.py +++ /dev/null @@ -1,43 +0,0 @@ -"""Test utils of the illumina post processing service.""" - -from pathlib import Path - -from cg.services.illumina_post_processing_service.utils import ( - get_flow_cell_model_from_run_parameters, -) - - -def test_get_flow_cell_model_from_run_parameters_with_mode_node_name( - novaseq_x_run_parameters_path: Path, -): - # GIVEN a run parameters file from a NovaSeq X run - - # WHEN getting the flow cell model from the run parameters file - flow_cell_model = get_flow_cell_model_from_run_parameters(novaseq_x_run_parameters_path) - - # THEN the flow cell model should be returned - assert flow_cell_model == "10B" - - -def test_get_flow_cel_model_from_run_parameters_with_flow_cell_mode_node_name( - run_paramters_with_flow_cell_mode_node_name: Path, -): - # GIVEN a run parameters file with a FlowCellMode node - - # WHEN getting the flow cell model from the run parameters file - flow_cell_model = get_flow_cell_model_from_run_parameters( - run_paramters_with_flow_cell_mode_node_name - ) - - # THEN the flow cell model should be returned - assert flow_cell_model == "S1" - - -def test_get_flow_cell_model_from_run_paramters_without_model(run_parameters_without_model: Path): - # GIVEN a run parameters file without a model - - # WHEN getting the flow cell model from the run parameters file - flow_cell_model = get_flow_cell_model_from_run_parameters(run_parameters_without_model) - - # THEN the flow cell model should be None - assert flow_cell_model is None