Skip to content

Commit

Permalink
feat - add HiSeq RunParameters file parser (#2653)(MINOR)
Browse files Browse the repository at this point in the history
Closes #2651. Creates a child class of `RunParameters` that reads run parameters files from HiSeq sequencers, both 2500 and X. The purpose is to determine the reads and index reads of the sequencing run.

### Added

- Class `RunParametersHiSeq`, implementing abstract methods from parent.
- Constants to parse the elements from the XML file
- XMLError exception
- Test for new class

### Changed

- Moved and renamed function `node_not_found` in `RunParameters` class to `cg/io/xml.py:validate_node_exists`
- Replaced the `RunParametersError` exception with `XMLError` in the validation of the nodes.

### Fixed

- Removed unused sample sheets in fixtures
  • Loading branch information
diitaz93 authored Nov 30, 2023
1 parent 43c3a40 commit 8524a64
Show file tree
Hide file tree
Showing 29 changed files with 1,133 additions and 319 deletions.
4 changes: 2 additions & 2 deletions cg/apps/demultiplex/sample_sheet/create.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def get_sample_sheet_creator(
lims_samples: list[FlowCellSample],
force: bool,
) -> SampleSheetCreator:
"""Returns an initialised sample sheet creator according to the software used for demultiplexing."""
"""Returns an initialised sample sheet creator according to the demultiplexing software."""
if flow_cell.bcl_converter == BclConverter.BCL2FASTQ:
return SampleSheetCreatorBcl2Fastq(
flow_cell=flow_cell, lims_samples=lims_samples, force=force
Expand All @@ -31,7 +31,7 @@ def create_sample_sheet(
force: bool = False,
) -> list[list[str]]:
"""Create a sample sheet for a flow cell."""
sample_sheet_creator = get_sample_sheet_creator(
sample_sheet_creator: SampleSheetCreator = get_sample_sheet_creator(
flow_cell=flow_cell,
lims_samples=lims_samples,
force=force,
Expand Down
30 changes: 18 additions & 12 deletions cg/apps/demultiplex/sample_sheet/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def get_valid_indexes(dual_indexes_only: bool = True) -> list[Index]:

def get_reagent_kit_version(reagent_kit_version: str) -> str:
"""Derives the reagent kit version from the run parameters."""
LOG.info(f"Converting reagent kit parameter {reagent_kit_version} to version")
LOG.debug(f"Converting reagent kit parameter {reagent_kit_version} to version")
if reagent_kit_version not in REAGENT_KIT_PARAMETER_TO_VERSION:
raise SyntaxError(f"Unknown reagent kit version {reagent_kit_version}")

Expand All @@ -69,22 +69,22 @@ def get_index_pair(sample: FlowCellSample) -> tuple[str, str]:
"""Returns a sample index separated into index 1 and index 2."""
if is_dual_index(sample.index):
index_1, index_2 = sample.index.split("-")
return index_1.strip(), index_2.strip()
return sample.index, sample.index2
return index_1.strip().replace("NNNNNNNNN", ""), index_2.strip()
return sample.index.replace("NNNNNNNNN", ""), sample.index2


def is_reverse_complement_needed(run_parameters: RunParameters) -> bool:
"""Return True if the second index requires reverse complement.
If the run used the new NovaSeq control software version (NEW_CONTROL_SOFTWARE_VERSION)
and the new reagent kit version (NEW_REAGENT_KIT_VERSION), then it requires reverse complement.
If the run is NovaSeqX, does not require reverse complement.
If the run is NovaSeqX, HiSeqX or HiSeq2500, does not require reverse complement.
"""
if run_parameters.sequencer == Sequencers.NOVASEQX:
if run_parameters.sequencer != Sequencers.NOVASEQ:
return False
control_software_version: str = run_parameters.control_software_version
reagent_kit_version: str = run_parameters.reagent_kit_version
LOG.info("Check if run is reverse complement")
LOG.debug("Check if run is reverse complement")
if version.parse(version=control_software_version) < version.parse(
version=NEW_CONTROL_SOFTWARE_VERSION
):
Expand All @@ -98,7 +98,7 @@ def is_reverse_complement_needed(run_parameters: RunParameters) -> bool:
f"Reagent kit version {reagent_kit_version} does not does not need reverse complement"
)
return False
LOG.info("Run is reverse complement")
LOG.debug("Run is reverse complement")
return True


Expand Down Expand Up @@ -221,11 +221,17 @@ def update_indexes_for_samples(
samples: list[FlowCellSampleBCLConvert | FlowCellSampleBcl2Fastq],
index_cycles: int,
is_reverse_complement: bool,
sequencer: str,
) -> None:
"""Updates the values to the fields index1 and index 2 of samples."""
for sample in samples:
pad_and_reverse_complement_sample_indexes(
sample=sample,
index_cycles=index_cycles,
is_reverse_complement=is_reverse_complement,
)
if sequencer != Sequencers.NOVASEQ:
index1, index2 = get_index_pair(sample=sample)
sample.index = index1
sample.index2 = index2
else:
pad_and_reverse_complement_sample_indexes(
sample=sample,
index_cycles=index_cycles,
is_reverse_complement=is_reverse_complement,
)
39 changes: 22 additions & 17 deletions cg/apps/demultiplex/sample_sheet/sample_sheet_creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,6 @@ def convert_sample_to_header_dict(
data_column_names: list[str],
) -> list[str]:
"""Convert a lims sample object to a list that corresponds to the sample sheet headers."""
LOG.debug(f"Use sample sheet header {data_column_names}")
sample_dict = sample.model_dump(by_alias=True)
return [str(sample_dict[column]) for column in data_column_names]

Expand All @@ -106,14 +105,16 @@ def get_data_section_header_and_columns(self) -> list[list[str]] | None:
def create_sample_sheet_content(self) -> list[list[str]]:
"""Create sample sheet content with samples."""
LOG.info("Creating sample sheet content")
complete_data_section: list[list[str]] = self.get_data_section_header_and_columns()
sample_sheet_content: list[list[str]] = (
self.get_additional_sections_sample_sheet() + self.get_data_section_header_and_columns()
self.get_additional_sections_sample_sheet() + complete_data_section
)
LOG.debug(f"Use sample sheet header {complete_data_section[1]}")
for sample in self.lims_samples:
sample_sheet_content.append(
self.convert_sample_to_header_dict(
sample=sample,
data_column_names=self.get_data_section_header_and_columns()[1],
data_column_names=complete_data_section[1],
)
)
return sample_sheet_content
Expand All @@ -129,6 +130,7 @@ def process_samples_for_sample_sheet(self) -> None:
samples=samples_in_lane,
index_cycles=self.run_parameters.index_length,
is_reverse_complement=self.is_reverse_complement,
sequencer=self.run_parameters.sequencer,
)
self.update_barcode_mismatch_values_for_samples(samples_in_lane)

Expand Down Expand Up @@ -157,7 +159,7 @@ def update_barcode_mismatch_values_for_samples(self, *args) -> None:

def add_override_cycles_to_samples(self) -> None:
"""Return None for flow cells to be demultiplexed with Bcl2fastq."""
LOG.debug("No adding of override cycles for Bcl2fastq flow cell")
LOG.debug("Skipping adding of override cycles for Bcl2fastq flow cell")

def get_additional_sections_sample_sheet(self) -> list[list[str]]:
"""Return all sections of the sample sheet that are not the data section."""
Expand Down Expand Up @@ -201,19 +203,22 @@ def update_barcode_mismatch_values_for_samples(

def add_override_cycles_to_samples(self) -> None:
"""Add override cycles attribute to samples."""
flow_cell_index_len: int = self.run_parameters.index_length
read1_cycles: str = f"Y{self.run_parameters.get_read_1_cycles()};"
read2_cycles: str = f"Y{self.run_parameters.get_read_2_cycles()}"
length_index1: int = self.run_parameters.get_index_1_cycles()
length_index2: int = self.run_parameters.get_index_2_cycles()
for sample in self.lims_samples:
index1_cycles: str = f"I{self.run_parameters.get_index_1_cycles()};"
index2_cycles: str = f"I{self.run_parameters.get_index_2_cycles()};"
sample_index_len: int = len(get_index_pair(sample)[0])
if sample_index_len < flow_cell_index_len:
index1_cycles = f"I{sample_index_len}N{flow_cell_index_len - sample_index_len};"
index1_cycles: str = f"I{length_index1};"
index2_cycles: str = f"I{length_index2};"
sample_index1_len: int = len(get_index_pair(sample)[0])
sample_index2_len: int = len(get_index_pair(sample)[1])
if sample_index1_len < length_index1:
index1_cycles = f"I{sample_index1_len}N{length_index1 - sample_index1_len};"
if sample_index2_len < length_index2:
index2_cycles = (
f"I{sample_index_len}N{flow_cell_index_len - sample_index_len};"
f"I{sample_index2_len}N{length_index2 - sample_index2_len};"
if self.is_reverse_complement
else f"N{flow_cell_index_len - sample_index_len}I{sample_index_len};"
else f"N{length_index2 - sample_index2_len}I{sample_index2_len};"
)
sample.override_cycles = read1_cycles + index1_cycles + index2_cycles + read2_cycles

Expand All @@ -232,21 +237,21 @@ def get_additional_sections_sample_sheet(self) -> list[list[str]]:
SampleSheetBCLConvertSections.Header.INDEX_ORIENTATION_FORWARD.value,
]
reads_section: list[list[str]] = [
[SampleSheetBCLConvertSections.Reads.HEADER.value],
[SampleSheetBCLConvertSections.Reads.HEADER],
[
SampleSheetBCLConvertSections.Reads.READ_CYCLES_1.value,
SampleSheetBCLConvertSections.Reads.READ_CYCLES_1,
self.run_parameters.get_read_1_cycles(),
],
[
SampleSheetBCLConvertSections.Reads.READ_CYCLES_2.value,
SampleSheetBCLConvertSections.Reads.READ_CYCLES_2,
self.run_parameters.get_read_2_cycles(),
],
[
SampleSheetBCLConvertSections.Reads.INDEX_CYCLES_1.value,
SampleSheetBCLConvertSections.Reads.INDEX_CYCLES_1,
self.run_parameters.get_index_1_cycles(),
],
[
SampleSheetBCLConvertSections.Reads.INDEX_CYCLES_2.value,
SampleSheetBCLConvertSections.Reads.INDEX_CYCLES_2,
self.run_parameters.get_index_2_cycles(),
],
]
Expand Down
8 changes: 2 additions & 6 deletions cg/cli/demultiplex/sample_sheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,14 @@

from cg.apps.demultiplex.sample_sheet.create import create_sample_sheet
from cg.apps.demultiplex.sample_sheet.models import FlowCellSample
from cg.apps.demultiplex.sample_sheet.read_sample_sheet import (
get_sample_sheet_from_file,
)
from cg.apps.demultiplex.sample_sheet.read_sample_sheet import get_sample_sheet_from_file
from cg.apps.housekeeper.hk import HousekeeperAPI
from cg.apps.lims.sample_sheet import get_flow_cell_samples
from cg.constants.constants import DRY_RUN, FileFormat
from cg.constants.demultiplexing import OPTION_BCL_CONVERTER
from cg.exc import FlowCellError, HousekeeperFileMissingError
from cg.io.controller import WriteFile, WriteStream
from cg.meta.demultiplex.housekeeper_storage_functions import (
add_sample_sheet_path_to_housekeeper,
)
from cg.meta.demultiplex.housekeeper_storage_functions import add_sample_sheet_path_to_housekeeper
from cg.models.cg_config import CGConfig
from cg.models.flow_cell.flow_cell import FlowCellDirectoryData

Expand Down
17 changes: 13 additions & 4 deletions cg/constants/demultiplexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,11 @@ class DemultiplexingDirsAndFiles(StrEnum):
DELIVERY: str = "delivery.txt"
DEMUX_STARTED: str = "demuxstarted.txt"
DEMUX_COMPLETE: str = "demuxcomplete.txt"
Hiseq_X_COPY_COMPLETE: str = "copycomplete.txt"
Hiseq_X_TILE_DIR: str = "l1t11"
HISEQ_X_COPY_COMPLETE: str = "copycomplete.txt"
HISEQ_X_TILE_DIR: str = "l1t11"
RTACOMPLETE: str = "RTAComplete.txt"
RUN_PARAMETERS: str = "RunParameters.xml"
RUN_PARAMETERS_PASCAL_CASE: str = "RunParameters.xml"
RUN_PARAMETERS_CAMEL_CASE: str = "runParameters.xml"
SAMPLE_SHEET_FILE_NAME: str = "SampleSheet.csv"
UNALIGNED_DIR_NAME: str = "Unaligned"
BCL2FASTQ_TILE_DIR_PATTERN: str = r"l\dt\d{2}"
Expand All @@ -46,23 +47,31 @@ class RunParametersXMLNodes(StrEnum):

# Node names
APPLICATION: str = ".Application"
APPLICATION_NAME: str = ".//ApplicationName"
APPLICATION_VERSION: str = ".ApplicationVersion"
CYCLES: str = "Cycles"
INDEX_1_HISEQ: str = ".//IndexRead1"
INDEX_2_HISEQ: str = ".//IndexRead2"
INDEX_1_NOVASEQ_6000: str = "./IndexRead1NumberOfCycles"
INDEX_2_NOVASEQ_6000: str = "./IndexRead2NumberOfCycles"
INDEX_1_NOVASEQ_X: str = "Index1"
INDEX_2_NOVASEQ_X: str = "Index2"
INNER_READ: str = ".//Read"
INSTRUMENT_TYPE: str = ".InstrumentType"
PLANNED_READS: str = "./PlannedReads"
PLANNED_READS_HISEQ: str = ".//Reads"
PLANNED_READS_NOVASEQ_X: str = "./PlannedReads"
READ_1_HISEQ: str = ".//Read1"
READ_2_HISEQ: str = ".//Read2"
READ_1_NOVASEQ_6000: str = "./Read1NumberOfCycles"
READ_2_NOVASEQ_6000: str = "./Read2NumberOfCycles"
READ_1_NOVASEQ_X: str = "Read1"
READ_2_NOVASEQ_X: str = "Read2"
READ_NAME: str = "ReadName"
REAGENT_KIT_VERSION: str = "./RfidsInfo/SbsConsumableVersion"
SEQUENCER_ID: str = ".//ScannerID"

# Node Values
HISEQ_APPLICATION: str = "HiSeq Control Software"
NOVASEQ_6000_APPLICATION: str = "NovaSeq Control Software"
NOVASEQ_X_INSTRUMENT: str = "NovaSeqXPlus"
UNKNOWN_REAGENT_KIT_VERSION: str = "unknown"
Expand Down
2 changes: 1 addition & 1 deletion cg/constants/sequencing.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ class Sequencers(StrEnum):
OTHER: str = "other"


sequencer_types = {
SEQUENCER_TYPES = {
"D00134": Sequencers.HISEQGA,
"D00410": Sequencers.HISEQGA,
"D00415": Sequencers.HISEQGA,
Expand Down
4 changes: 4 additions & 0 deletions cg/exc.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,3 +228,7 @@ class MetricsQCError(CgError):

class MissingMetrics(CgError):
"""Exception raised when mandatory metrics are missing."""


class XMLError(CgError):
"""Exception raised when something is wrong with the content of an XML file."""
31 changes: 26 additions & 5 deletions cg/io/xml.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,40 @@
"""Module for reading and writing xml files."""

import xml.etree.ElementTree as ET
import logging
from pathlib import Path
from xml.etree.ElementTree import Element, ElementTree, parse

from cg.constants import FileExtensions
from cg.exc import XMLError
from cg.io.validate_path import validate_file_suffix

LOG = logging.getLogger(__name__)


def read_xml(file_path: Path) -> ET.ElementTree:
def read_xml(file_path: Path) -> ElementTree:
"""Read content in a xml file to an ElementTree."""
validate_file_suffix(path_to_validate=file_path, target_suffix=FileExtensions.XML)
tree = ET.parse(file_path)
tree = parse(file_path)
return tree


def write_xml(tree: ET.ElementTree, file_path: Path) -> None:
def write_xml(tree: ElementTree, file_path: Path) -> None:
"""Write content to a xml file."""
tree.write(file_path, encoding="utf-8", xml_declaration=True)


def validate_node_exists(node: Element | None, name: str) -> None:
"""Validates if the given node is not None.
Raises:
XMLError: If the node is None
"""
if node is None:
message = f"Could not find node with name {name} in XML tree"
LOG.warning(message)
raise XMLError(message)


def get_tree_node(tree: ElementTree, node_name: str) -> Element:
"""Return the node of a tree given its name if it exists."""
xml_node: Element = tree.find(node_name)
validate_node_exists(node=xml_node, name=node_name)
return xml_node
Loading

0 comments on commit 8524a64

Please sign in to comment.