feat - add HiSeq RunParameters file parser (#2653)(MINOR)

Closes #2651. Creates a child class of `RunParameters` that reads files from HiSeq sequencers, both 2500 and X. The purpose is to determine the reads and index reads of the sequencing run.

### Added
- Class `RunParametersHiSeq`, implementing the abstract methods of its parent.
- Constants to parse the elements from the XML file.
- `XMLError` exception.
- Test for the new class.

### Changed
- Moved the function `node_not_found` out of the `RunParameters` class and renamed it to `cg/io/xml.py:validate_node_exists`.
- Replaced the `RunParametersError` exception with `XMLError` in the validation of the nodes.

### Fixed
- Removed unused sample sheets in fixtures.
Clinical-Genomics · Nov 30, 2023 · 8524a64 · 8524a64
1 parent 43c3a40
commit 8524a64
Show file tree

Hide file tree

Showing 29 changed files with 1,133 additions and 319 deletions.
diff --git a/cg/apps/demultiplex/sample_sheet/create.py b/cg/apps/demultiplex/sample_sheet/create.py
@@ -17,7 +17,7 @@ def get_sample_sheet_creator(
     lims_samples: list[FlowCellSample],
     force: bool,
 ) -> SampleSheetCreator:
-    """Returns an initialised sample sheet creator according to the software used for demultiplexing."""
+    """Returns an initialised sample sheet creator according to the demultiplexing software."""
     if flow_cell.bcl_converter == BclConverter.BCL2FASTQ:
         return SampleSheetCreatorBcl2Fastq(
             flow_cell=flow_cell, lims_samples=lims_samples, force=force
@@ -31,7 +31,7 @@ def create_sample_sheet(
     force: bool = False,
 ) -> list[list[str]]:
     """Create a sample sheet for a flow cell."""
-    sample_sheet_creator = get_sample_sheet_creator(
+    sample_sheet_creator: SampleSheetCreator = get_sample_sheet_creator(
         flow_cell=flow_cell,
         lims_samples=lims_samples,
         force=force,

diff --git a/cg/apps/demultiplex/sample_sheet/index.py b/cg/apps/demultiplex/sample_sheet/index.py
@@ -58,7 +58,7 @@ def get_valid_indexes(dual_indexes_only: bool = True) -> list[Index]:
 
 def get_reagent_kit_version(reagent_kit_version: str) -> str:
     """Derives the reagent kit version from the run parameters."""
-    LOG.info(f"Converting reagent kit parameter {reagent_kit_version} to version")
+    LOG.debug(f"Converting reagent kit parameter {reagent_kit_version} to version")
     if reagent_kit_version not in REAGENT_KIT_PARAMETER_TO_VERSION:
         raise SyntaxError(f"Unknown reagent kit version {reagent_kit_version}")
 
@@ -69,22 +69,22 @@ def get_index_pair(sample: FlowCellSample) -> tuple[str, str]:
     """Returns a sample index separated into index 1 and index 2."""
     if is_dual_index(sample.index):
         index_1, index_2 = sample.index.split("-")
-        return index_1.strip(), index_2.strip()
-    return sample.index, sample.index2
+        return index_1.strip().replace("NNNNNNNNN", ""), index_2.strip()
+    return sample.index.replace("NNNNNNNNN", ""), sample.index2
 
 
 def is_reverse_complement_needed(run_parameters: RunParameters) -> bool:
     """Return True if the second index requires reverse complement.
 
     If the run used the new NovaSeq control software version (NEW_CONTROL_SOFTWARE_VERSION)
     and the new reagent kit version (NEW_REAGENT_KIT_VERSION), then it requires reverse complement.
-    If the run is NovaSeqX, does not require reverse complement.
+    If the run is NovaSeqX, HiSeqX or HiSeq2500, does not require reverse complement.
     """
-    if run_parameters.sequencer == Sequencers.NOVASEQX:
+    if run_parameters.sequencer != Sequencers.NOVASEQ:
         return False
     control_software_version: str = run_parameters.control_software_version
     reagent_kit_version: str = run_parameters.reagent_kit_version
-    LOG.info("Check if run is reverse complement")
+    LOG.debug("Check if run is reverse complement")
     if version.parse(version=control_software_version) < version.parse(
         version=NEW_CONTROL_SOFTWARE_VERSION
     ):
@@ -98,7 +98,7 @@ def is_reverse_complement_needed(run_parameters: RunParameters) -> bool:
             f"Reagent kit version {reagent_kit_version} does not does not need reverse complement"
         )
         return False
-    LOG.info("Run is reverse complement")
+    LOG.debug("Run is reverse complement")
     return True
 
 
@@ -221,11 +221,17 @@ def update_indexes_for_samples(
     samples: list[FlowCellSampleBCLConvert | FlowCellSampleBcl2Fastq],
     index_cycles: int,
     is_reverse_complement: bool,
+    sequencer: str,
 ) -> None:
     """Updates the values to the fields index1 and index 2 of samples."""
     for sample in samples:
-        pad_and_reverse_complement_sample_indexes(
-            sample=sample,
-            index_cycles=index_cycles,
-            is_reverse_complement=is_reverse_complement,
-        )
+        if sequencer != Sequencers.NOVASEQ:
+            index1, index2 = get_index_pair(sample=sample)
+            sample.index = index1
+            sample.index2 = index2
+        else:
+            pad_and_reverse_complement_sample_indexes(
+                sample=sample,
+                index_cycles=index_cycles,
+                is_reverse_complement=is_reverse_complement,
+            )
diff --git a/cg/apps/demultiplex/sample_sheet/sample_sheet_creator.py b/cg/apps/demultiplex/sample_sheet/sample_sheet_creator.py
@@ -91,7 +91,6 @@ def convert_sample_to_header_dict(
         data_column_names: list[str],
     ) -> list[str]:
         """Convert a lims sample object to a list that corresponds to the sample sheet headers."""
-        LOG.debug(f"Use sample sheet header {data_column_names}")
         sample_dict = sample.model_dump(by_alias=True)
         return [str(sample_dict[column]) for column in data_column_names]
 
@@ -106,14 +105,16 @@ def get_data_section_header_and_columns(self) -> list[list[str]] | None:
     def create_sample_sheet_content(self) -> list[list[str]]:
         """Create sample sheet content with samples."""
         LOG.info("Creating sample sheet content")
+        complete_data_section: list[list[str]] = self.get_data_section_header_and_columns()
         sample_sheet_content: list[list[str]] = (
-            self.get_additional_sections_sample_sheet() + self.get_data_section_header_and_columns()
+            self.get_additional_sections_sample_sheet() + complete_data_section
         )
+        LOG.debug(f"Use sample sheet header {complete_data_section[1]}")
         for sample in self.lims_samples:
             sample_sheet_content.append(
                 self.convert_sample_to_header_dict(
                     sample=sample,
-                    data_column_names=self.get_data_section_header_and_columns()[1],
+                    data_column_names=complete_data_section[1],
                 )
             )
         return sample_sheet_content
@@ -129,6 +130,7 @@ def process_samples_for_sample_sheet(self) -> None:
                 samples=samples_in_lane,
                 index_cycles=self.run_parameters.index_length,
                 is_reverse_complement=self.is_reverse_complement,
+                sequencer=self.run_parameters.sequencer,
             )
             self.update_barcode_mismatch_values_for_samples(samples_in_lane)
 
@@ -157,7 +159,7 @@ def update_barcode_mismatch_values_for_samples(self, *args) -> None:
 
     def add_override_cycles_to_samples(self) -> None:
         """Return None for flow cells to be demultiplexed with Bcl2fastq."""
-        LOG.debug("No adding of override cycles for Bcl2fastq flow cell")
+        LOG.debug("Skipping adding of override cycles for Bcl2fastq flow cell")
 
     def get_additional_sections_sample_sheet(self) -> list[list[str]]:
         """Return all sections of the sample sheet that are not the data section."""
@@ -201,19 +203,22 @@ def update_barcode_mismatch_values_for_samples(
 
     def add_override_cycles_to_samples(self) -> None:
         """Add override cycles attribute to samples."""
-        flow_cell_index_len: int = self.run_parameters.index_length
         read1_cycles: str = f"Y{self.run_parameters.get_read_1_cycles()};"
         read2_cycles: str = f"Y{self.run_parameters.get_read_2_cycles()}"
+        length_index1: int = self.run_parameters.get_index_1_cycles()
+        length_index2: int = self.run_parameters.get_index_2_cycles()
         for sample in self.lims_samples:
-            index1_cycles: str = f"I{self.run_parameters.get_index_1_cycles()};"
-            index2_cycles: str = f"I{self.run_parameters.get_index_2_cycles()};"
-            sample_index_len: int = len(get_index_pair(sample)[0])
-            if sample_index_len < flow_cell_index_len:
-                index1_cycles = f"I{sample_index_len}N{flow_cell_index_len - sample_index_len};"
+            index1_cycles: str = f"I{length_index1};"
+            index2_cycles: str = f"I{length_index2};"
+            sample_index1_len: int = len(get_index_pair(sample)[0])
+            sample_index2_len: int = len(get_index_pair(sample)[1])
+            if sample_index1_len < length_index1:
+                index1_cycles = f"I{sample_index1_len}N{length_index1 - sample_index1_len};"
+            if sample_index2_len < length_index2:
                 index2_cycles = (
-                    f"I{sample_index_len}N{flow_cell_index_len - sample_index_len};"
+                    f"I{sample_index2_len}N{length_index2 - sample_index2_len};"
                     if self.is_reverse_complement
-                    else f"N{flow_cell_index_len - sample_index_len}I{sample_index_len};"
+                    else f"N{length_index2 - sample_index2_len}I{sample_index2_len};"
                 )
             sample.override_cycles = read1_cycles + index1_cycles + index2_cycles + read2_cycles
 
@@ -232,21 +237,21 @@ def get_additional_sections_sample_sheet(self) -> list[list[str]]:
             SampleSheetBCLConvertSections.Header.INDEX_ORIENTATION_FORWARD.value,
         ]
         reads_section: list[list[str]] = [
-            [SampleSheetBCLConvertSections.Reads.HEADER.value],
+            [SampleSheetBCLConvertSections.Reads.HEADER],
             [
-                SampleSheetBCLConvertSections.Reads.READ_CYCLES_1.value,
+                SampleSheetBCLConvertSections.Reads.READ_CYCLES_1,
                 self.run_parameters.get_read_1_cycles(),
             ],
             [
-                SampleSheetBCLConvertSections.Reads.READ_CYCLES_2.value,
+                SampleSheetBCLConvertSections.Reads.READ_CYCLES_2,
                 self.run_parameters.get_read_2_cycles(),
             ],
             [
-                SampleSheetBCLConvertSections.Reads.INDEX_CYCLES_1.value,
+                SampleSheetBCLConvertSections.Reads.INDEX_CYCLES_1,
                 self.run_parameters.get_index_1_cycles(),
             ],
             [
-                SampleSheetBCLConvertSections.Reads.INDEX_CYCLES_2.value,
+                SampleSheetBCLConvertSections.Reads.INDEX_CYCLES_2,
                 self.run_parameters.get_index_2_cycles(),
             ],
         ]

diff --git a/cg/cli/demultiplex/sample_sheet.py b/cg/cli/demultiplex/sample_sheet.py
@@ -7,18 +7,14 @@
 
 from cg.apps.demultiplex.sample_sheet.create import create_sample_sheet
 from cg.apps.demultiplex.sample_sheet.models import FlowCellSample
-from cg.apps.demultiplex.sample_sheet.read_sample_sheet import (
-    get_sample_sheet_from_file,
-)
+from cg.apps.demultiplex.sample_sheet.read_sample_sheet import get_sample_sheet_from_file
 from cg.apps.housekeeper.hk import HousekeeperAPI
 from cg.apps.lims.sample_sheet import get_flow_cell_samples
 from cg.constants.constants import DRY_RUN, FileFormat
 from cg.constants.demultiplexing import OPTION_BCL_CONVERTER
 from cg.exc import FlowCellError, HousekeeperFileMissingError
 from cg.io.controller import WriteFile, WriteStream
-from cg.meta.demultiplex.housekeeper_storage_functions import (
-    add_sample_sheet_path_to_housekeeper,
-)
+from cg.meta.demultiplex.housekeeper_storage_functions import add_sample_sheet_path_to_housekeeper
 from cg.models.cg_config import CGConfig
 from cg.models.flow_cell.flow_cell import FlowCellDirectoryData
 

diff --git a/cg/constants/demultiplexing.py b/cg/constants/demultiplexing.py
@@ -22,10 +22,11 @@ class DemultiplexingDirsAndFiles(StrEnum):
     DELIVERY: str = "delivery.txt"
     DEMUX_STARTED: str = "demuxstarted.txt"
     DEMUX_COMPLETE: str = "demuxcomplete.txt"
-    Hiseq_X_COPY_COMPLETE: str = "copycomplete.txt"
-    Hiseq_X_TILE_DIR: str = "l1t11"
+    HISEQ_X_COPY_COMPLETE: str = "copycomplete.txt"
+    HISEQ_X_TILE_DIR: str = "l1t11"
     RTACOMPLETE: str = "RTAComplete.txt"
-    RUN_PARAMETERS: str = "RunParameters.xml"
+    RUN_PARAMETERS_PASCAL_CASE: str = "RunParameters.xml"
+    RUN_PARAMETERS_CAMEL_CASE: str = "runParameters.xml"
     SAMPLE_SHEET_FILE_NAME: str = "SampleSheet.csv"
     UNALIGNED_DIR_NAME: str = "Unaligned"
     BCL2FASTQ_TILE_DIR_PATTERN: str = r"l\dt\d{2}"
@@ -46,23 +47,31 @@ class RunParametersXMLNodes(StrEnum):
 
     # Node names
     APPLICATION: str = ".Application"
+    APPLICATION_NAME: str = ".//ApplicationName"
     APPLICATION_VERSION: str = ".ApplicationVersion"
     CYCLES: str = "Cycles"
+    INDEX_1_HISEQ: str = ".//IndexRead1"
+    INDEX_2_HISEQ: str = ".//IndexRead2"
     INDEX_1_NOVASEQ_6000: str = "./IndexRead1NumberOfCycles"
     INDEX_2_NOVASEQ_6000: str = "./IndexRead2NumberOfCycles"
     INDEX_1_NOVASEQ_X: str = "Index1"
     INDEX_2_NOVASEQ_X: str = "Index2"
     INNER_READ: str = ".//Read"
     INSTRUMENT_TYPE: str = ".InstrumentType"
-    PLANNED_READS: str = "./PlannedReads"
+    PLANNED_READS_HISEQ: str = ".//Reads"
+    PLANNED_READS_NOVASEQ_X: str = "./PlannedReads"
+    READ_1_HISEQ: str = ".//Read1"
+    READ_2_HISEQ: str = ".//Read2"
     READ_1_NOVASEQ_6000: str = "./Read1NumberOfCycles"
     READ_2_NOVASEQ_6000: str = "./Read2NumberOfCycles"
     READ_1_NOVASEQ_X: str = "Read1"
     READ_2_NOVASEQ_X: str = "Read2"
     READ_NAME: str = "ReadName"
     REAGENT_KIT_VERSION: str = "./RfidsInfo/SbsConsumableVersion"
+    SEQUENCER_ID: str = ".//ScannerID"
 
     # Node Values
+    HISEQ_APPLICATION: str = "HiSeq Control Software"
     NOVASEQ_6000_APPLICATION: str = "NovaSeq Control Software"
     NOVASEQ_X_INSTRUMENT: str = "NovaSeqXPlus"
     UNKNOWN_REAGENT_KIT_VERSION: str = "unknown"

diff --git a/cg/constants/sequencing.py b/cg/constants/sequencing.py
@@ -13,7 +13,7 @@ class Sequencers(StrEnum):
     OTHER: str = "other"
 
 
-sequencer_types = {
+SEQUENCER_TYPES = {
     "D00134": Sequencers.HISEQGA,
     "D00410": Sequencers.HISEQGA,
     "D00415": Sequencers.HISEQGA,

diff --git a/cg/exc.py b/cg/exc.py
@@ -228,3 +228,7 @@ class MetricsQCError(CgError):
 
 class MissingMetrics(CgError):
     """Exception raised when mandatory metrics are missing."""
+
+
+class XMLError(CgError):
+    """Exception raised when something is wrong with the content of an XML file."""
diff --git a/cg/io/xml.py b/cg/io/xml.py
@@ -1,19 +1,40 @@
 """Module for reading and writing xml files."""
-
-import xml.etree.ElementTree as ET
+import logging
 from pathlib import Path
+from xml.etree.ElementTree import Element, ElementTree, parse
 
 from cg.constants import FileExtensions
+from cg.exc import XMLError
 from cg.io.validate_path import validate_file_suffix
 
+LOG = logging.getLogger(__name__)
+
 
-def read_xml(file_path: Path) -> ET.ElementTree:
+def read_xml(file_path: Path) -> ElementTree:
     """Read content in a xml file to an ElementTree."""
     validate_file_suffix(path_to_validate=file_path, target_suffix=FileExtensions.XML)
-    tree = ET.parse(file_path)
+    tree = parse(file_path)
     return tree
 
 
-def write_xml(tree: ET.ElementTree, file_path: Path) -> None:
+def write_xml(tree: ElementTree, file_path: Path) -> None:
     """Write content to a xml file."""
     tree.write(file_path, encoding="utf-8", xml_declaration=True)
+
+
+def validate_node_exists(node: Element | None, name: str) -> None:
+    """Validates if the given node is not None.
+    Raises:
+        XMLError: If the node is None
+    """
+    if node is None:
+        message = f"Could not find node with name {name} in XML tree"
+        LOG.warning(message)
+        raise XMLError(message)
+
+
+def get_tree_node(tree: ElementTree, node_name: str) -> Element:
+    """Return the node of a tree given its name if it exists."""
+    xml_node: Element = tree.find(node_name)
+    validate_node_exists(node=xml_node, name=node_name)
+    return xml_node