Skip to content

Commit

Permalink
Re-route flow to illumina device tables. (#3349)(major)
Browse files Browse the repository at this point in the history
# Description

Feature branch to re-route the post-processing database flow from the `Flowcell` -> `SampleLaneSequencingMetrics` model paradigm to `IlluminaFlowCell` -> `IlluminaSequencingRun` -> `IlluminaSampleSequencingMetrics`. 
A feature branch is used so that all changes can be collected and tested together. Because this alters core logic and requires a database migration, the changes cannot be rolled out step by step.

PR in servers: Clinical-Genomics/servers#1394
Data migration script: Clinical-Genomics/add-new-tech#7

## Deploy
```shell
bash /home/proj/production/servers/resources/hasta.scilifelab.se/update-tool-stage.sh -e S_cg -t cg -b develop-illumina-devices -a
```

## CLI command changes
Commands that need to be updated in servers

- [x] `cg clean flow-cells` -> `cg clean illumina-runs` (requires servers update, PR: <placeholder>)
- [x] `cg backup encrypt-flow-cells` -> `cg backup encrypt-illumina-runs` (requires servers update, PR: <placeholder>)
- [x] `cg backup flow-cells` -> `cg backup illumina-runs` (requires servers update, PR: <placeholder>)
- [x] `cg backup fetch-flow-cell` -> `cg backup fetch-illumina-run` (no update needed)
- [x] `cg demultiplex finish flow-cell` -> `cg demultiplex finish illumina-run` (no update needed in servers)
- [x] `cg get flow-cell` -> `cg get sequencing-run` (Does not require a PR in servers)
- [x] `cg set flowcell` -> `cg set sequencing-run` (Does not require a PR in servers)
- [x] `cg store flow-cell` -> `cg store illumina-run` (Does not require a PR in servers)
- [x] `cg store demultiplexed-flow-cell` -> `cg store demultiplexed-run` (Does not require a PR in servers)
- [x] `cg decompress flow-cell` -> `cg decompress illumina-run` (Does not require a PR in servers)
  • Loading branch information
ChrOertlin authored Jul 10, 2024
1 parent e29050b commit e65ae73
Show file tree
Hide file tree
Showing 166 changed files with 5,676 additions and 8,939 deletions.
40 changes: 21 additions & 19 deletions cg/apps/demultiplex/sample_sheet/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,14 @@

import click

from cg.apps.demultiplex.sample_sheet.read_sample_sheet import get_flow_cell_samples_from_content
from cg.apps.demultiplex.sample_sheet.sample_models import FlowCellSample
from cg.apps.demultiplex.sample_sheet.read_sample_sheet import get_samples_from_content
from cg.apps.demultiplex.sample_sheet.sample_models import IlluminaSampleIndexSetting
from cg.apps.demultiplex.sample_sheet.sample_sheet_creator import SampleSheetCreator
from cg.apps.demultiplex.sample_sheet.sample_sheet_validator import SampleSheetValidator
from cg.apps.demultiplex.sample_sheet.utils import (
delete_sample_sheet_from_housekeeper,
add_and_include_sample_sheet_path_to_housekeeper,
)
from cg.apps.housekeeper.hk import HousekeeperAPI
from cg.apps.lims import LimsAPI
from cg.apps.lims.sample_sheet import get_flow_cell_samples
Expand All @@ -22,17 +26,13 @@
SampleSheetFormatError,
)
from cg.io.controller import ReadFile, WriteFile, WriteStream
from cg.meta.demultiplex.housekeeper_storage_functions import (
add_and_include_sample_sheet_path_to_housekeeper,
delete_sample_sheet_from_housekeeper,
)
from cg.models.run_devices.illumina_run_directory_data import IlluminaRunDirectoryData
from cg.utils.files import get_directories_in_path, link_or_overwrite_file

LOG = logging.getLogger(__name__)


class SampleSheetAPI:
class IlluminaSampleSheetService:
"""Sample Sheet API class."""

def __init__(self, flow_cell_dir: str, hk_api: HousekeeperAPI, lims_api: LimsAPI) -> None:
Expand Down Expand Up @@ -115,19 +115,19 @@ def _replace_sample_header(sample_sheet_content: list[list[str]]) -> list[list[s

def translate_sample_sheet(self, flow_cell_name: str) -> None:
"""Translate a Bcl2Fastq sample sheet to a BCLConvert sample sheet."""
flow_cell: IlluminaRunDirectoryData = self._get_flow_cell(flow_cell_name)
if not self._are_necessary_files_in_flow_cell(flow_cell):
run_directory_data: IlluminaRunDirectoryData = self._get_flow_cell(flow_cell_name)
if not self._are_necessary_files_in_flow_cell(run_directory_data):
raise MissingFilesError("Missing necessary files in run directory for translation")
original_content: list[list[str]] = ReadFile.get_content_from_file(
file_format=FileFormat.CSV, file_path=flow_cell.sample_sheet_path
file_format=FileFormat.CSV, file_path=run_directory_data.sample_sheet_path
)
content_with_fixed_header: list[list[str]] = self._replace_sample_header(original_content)

flow_cell_samples: list[FlowCellSample] = get_flow_cell_samples_from_content(
samples: list[IlluminaSampleIndexSetting] = get_samples_from_content(
sample_sheet_content=content_with_fixed_header
)
bcl_convert_creator = SampleSheetCreator(
flow_cell=flow_cell, lims_samples=flow_cell_samples
run_directory_data=run_directory_data, samples=samples
)
new_content = bcl_convert_creator.construct_sample_sheet()
self.validator.validate_sample_sheet_from_content(new_content)
Expand All @@ -141,15 +141,17 @@ def translate_sample_sheet(self, flow_cell_name: str) -> None:
WriteFile.write_file_from_content(
content=new_content,
file_format=FileFormat.CSV,
file_path=flow_cell.sample_sheet_path,
file_path=run_directory_data.sample_sheet_path,
)

def _use_sample_sheet_from_housekeeper(self, flow_cell: IlluminaRunDirectoryData) -> None:
def _use_sample_sheet_from_housekeeper(
self, run_directory_data: IlluminaRunDirectoryData
) -> None:
"""
Copy the sample sheet from Housekeeper to the flow cell directory if it exists and is valid.
"""
sample_sheet_path: Path = self.hk_api.get_sample_sheet_path(flow_cell.id)
flow_cell.set_sample_sheet_path_hk(sample_sheet_path)
sample_sheet_path: Path = self.hk_api.get_sample_sheet_path(run_directory_data.id)
run_directory_data.set_sample_sheet_path_hk(sample_sheet_path)
self.validate_sample_sheet(sample_sheet_path)

if self.dry_run:
Expand All @@ -159,7 +161,7 @@ def _use_sample_sheet_from_housekeeper(self, flow_cell: IlluminaRunDirectoryData
)
return
LOG.info("Sample sheet from Housekeeper is valid. Copying it to sequencing run directory")
link_or_overwrite_file(src=sample_sheet_path, dst=flow_cell.sample_sheet_path)
link_or_overwrite_file(src=sample_sheet_path, dst=run_directory_data.sample_sheet_path)

def _use_flow_cell_sample_sheet(self, flow_cell: IlluminaRunDirectoryData) -> None:
"""Use the sample sheet from the flow cell directory if it is valid."""
Expand All @@ -186,7 +188,7 @@ def _get_sample_sheet_content(self, flow_cell: IlluminaRunDirectoryData) -> list
Raises:
LimsDataError: If no samples are found in LIMS for the flow cell.
"""
lims_samples: list[FlowCellSample] = list(
lims_samples: list[IlluminaSampleIndexSetting] = list(
get_flow_cell_samples(
lims=self.lims_api,
flow_cell_id=flow_cell.id,
Expand All @@ -196,7 +198,7 @@ def _get_sample_sheet_content(self, flow_cell: IlluminaRunDirectoryData) -> list
message: str = f"Could not find any samples in LIMS for {flow_cell.id}"
LOG.warning(message)
raise LimsDataError(message)
creator = SampleSheetCreator(flow_cell=flow_cell, lims_samples=lims_samples)
creator = SampleSheetCreator(run_directory_data=flow_cell, samples=lims_samples)
LOG.info(
f"Constructing sample sheet for the {flow_cell.sequencer_type} flow cell {flow_cell.id}"
)
Expand Down
22 changes: 11 additions & 11 deletions cg/apps/demultiplex/sample_sheet/read_sample_sheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,14 @@

from pydantic import TypeAdapter

from cg.apps.demultiplex.sample_sheet.sample_models import FlowCellSample
from cg.apps.demultiplex.sample_sheet.sample_models import IlluminaSampleIndexSetting
from cg.constants.demultiplexing import SampleSheetBcl2FastqSections, SampleSheetBCLConvertSections
from cg.exc import SampleSheetContentError, SampleSheetFormatError

LOG = logging.getLogger(__name__)


def validate_samples_are_unique(samples: list[FlowCellSample]) -> None:
def validate_samples_are_unique(samples: list[IlluminaSampleIndexSetting]) -> None:
"""Validate that each sample only exists once."""
sample_ids: set = set()
for sample in samples:
Expand All @@ -21,9 +21,9 @@ def validate_samples_are_unique(samples: list[FlowCellSample]) -> None:
sample_ids.add(sample_id)


def validate_samples_unique_per_lane(samples: list[FlowCellSample]) -> None:
def validate_samples_unique_per_lane(samples: list[IlluminaSampleIndexSetting]) -> None:
"""Validate that each sample only exists once per lane in a sample sheet."""
sample_by_lane: dict[int, list[FlowCellSample]] = get_samples_by_lane(samples)
sample_by_lane: dict[int, list[IlluminaSampleIndexSetting]] = get_samples_by_lane(samples)
for lane, lane_samples in sample_by_lane.items():
LOG.debug(f"Validate that samples are unique in lane: {lane}")
validate_samples_are_unique(samples=lane_samples)
Expand Down Expand Up @@ -59,28 +59,28 @@ def get_raw_samples_from_content(sample_sheet_content: list[list[str]]) -> list[


def get_samples_by_lane(
samples: list[FlowCellSample],
) -> dict[int, list[FlowCellSample]]:
samples: list[IlluminaSampleIndexSetting],
) -> dict[int, list[IlluminaSampleIndexSetting]]:
"""Group and return samples by lane."""
LOG.debug("Order samples by lane")
sample_by_lane: dict[int, list[FlowCellSample]] = {}
sample_by_lane: dict[int, list[IlluminaSampleIndexSetting]] = {}
for sample in samples:
if sample.lane not in sample_by_lane:
sample_by_lane[sample.lane] = []
sample_by_lane[sample.lane].append(sample)
return sample_by_lane


def get_flow_cell_samples_from_content(
def get_samples_from_content(
sample_sheet_content: list[list[str]],
) -> list[FlowCellSample]:
) -> list[IlluminaSampleIndexSetting]:
"""
Return the samples in a sample sheet as a list of FlowCellSample objects.
Return the samples in a sample sheet as a list of IlluminaSampleIndexSetting objects.
Raises:
ValidationError: if the samples do not have the correct attributes based on their model.
"""
raw_samples: list[dict[str, str]] = get_raw_samples_from_content(
sample_sheet_content=sample_sheet_content
)
adapter = TypeAdapter(list[FlowCellSample])
adapter = TypeAdapter(list[IlluminaSampleIndexSetting])
return adapter.validate_python(raw_samples)
12 changes: 7 additions & 5 deletions cg/apps/demultiplex/sample_sheet/sample_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@
LOG = logging.getLogger(__name__)


class FlowCellSample(BaseModel):
"""Class that represents a flow cell sample."""
class IlluminaSampleIndexSetting(BaseModel):
"""Class that represents index settings for a sample on an Illumina run."""

lane: int = Field(..., alias=SampleSheetBCLConvertSections.Data.LANE)
sample_id: SampleId = Field(..., alias=SampleSheetBCLConvertSections.Data.SAMPLE_INTERNAL_ID)
Expand Down Expand Up @@ -80,7 +80,9 @@ def update_override_cycles(self, run_parameters: RunParameters) -> None:
)
self.override_cycles = read1_cycles + index1_cycles + index2_cycles + read2_cycles

def _update_barcode_mismatches_1(self, samples_to_compare: list["FlowCellSample"]) -> None:
def _update_barcode_mismatches_1(
self, samples_to_compare: list["IlluminaSampleIndexSetting"]
) -> None:
"""Assign zero to barcode_mismatches_1 if the hamming distance between self.index
and the index1 of any sample in the lane is below the minimum threshold."""
for sample in samples_to_compare:
Expand All @@ -96,7 +98,7 @@ def _update_barcode_mismatches_1(self, samples_to_compare: list["FlowCellSample"

def _update_barcode_mismatches_2(
self,
samples_to_compare: list["FlowCellSample"],
samples_to_compare: list["IlluminaSampleIndexSetting"],
is_reverse_complement: bool,
) -> None:
"""Assign zero to barcode_mismatches_2 if the hamming distance between self.index2
Expand Down Expand Up @@ -130,7 +132,7 @@ def process_indexes(self, run_parameters: RunParameters):

def update_barcode_mismatches(
self,
samples_to_compare: list["FlowCellSample"],
samples_to_compare: list["IlluminaSampleIndexSetting"],
is_run_single_index: bool,
is_reverse_complement: bool,
) -> None:
Expand Down
32 changes: 16 additions & 16 deletions cg/apps/demultiplex/sample_sheet/sample_sheet_creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import logging

from cg.apps.demultiplex.sample_sheet.read_sample_sheet import get_samples_by_lane
from cg.apps.demultiplex.sample_sheet.sample_models import FlowCellSample
from cg.apps.demultiplex.sample_sheet.sample_models import IlluminaSampleIndexSetting
from cg.constants.demultiplexing import IndexSettings, SampleSheetBCLConvertSections
from cg.models.demultiplex.run_parameters import RunParameters
from cg.models.run_devices.illumina_run_directory_data import IlluminaRunDirectoryData
Expand All @@ -12,22 +12,22 @@


class SampleSheetCreator:
"""Base class for sample sheet creation."""
"""Base class for sample sheet creation for an Illumina run."""

def __init__(
self,
flow_cell: IlluminaRunDirectoryData,
lims_samples: list[FlowCellSample],
run_directory_data: IlluminaRunDirectoryData,
samples: list[IlluminaSampleIndexSetting],
):
self.flow_cell: IlluminaRunDirectoryData = flow_cell
self.flow_cell_id: str = flow_cell.id
self.lims_samples: list[FlowCellSample] = lims_samples
self.run_parameters: RunParameters = flow_cell.run_parameters
self.run_directory_data: IlluminaRunDirectoryData = run_directory_data
self.flow_cell_id: str = run_directory_data.id
self.samples: list[IlluminaSampleIndexSetting] = samples
self.run_parameters: RunParameters = run_directory_data.run_parameters
self.index_settings: IndexSettings = self.run_parameters.index_settings

def convert_sample_to_header_dict(
self,
sample: FlowCellSample,
sample: IlluminaSampleIndexSetting,
data_column_names: list[str],
) -> list[str]:
"""Convert a lims sample object to a list that corresponds to the sample sheet headers."""
Expand All @@ -49,7 +49,7 @@ def get_additional_sections_sample_sheet(self) -> list[list[str]]:
[
SampleSheetBCLConvertSections.Header.INSTRUMENT_PLATFORM_TITLE.value,
SampleSheetBCLConvertSections.Header.instrument_platform_sequencer().get(
self.flow_cell.sequencer_type
self.run_directory_data.sequencer_type
),
],
SampleSheetBCLConvertSections.Header.index_orientation_forward(),
Expand Down Expand Up @@ -102,7 +102,7 @@ def create_sample_sheet_content(self) -> list[list[str]]:
sample_sheet_content: list[list[str]] = (
self.get_additional_sections_sample_sheet() + complete_data_section
)
for sample in self.lims_samples:
for sample in self.samples:
sample_sheet_content.append(
self.convert_sample_to_header_dict(
sample=sample,
Expand All @@ -113,15 +113,15 @@ def create_sample_sheet_content(self) -> list[list[str]]:

def process_samples_for_sample_sheet(self) -> None:
"""Remove unwanted samples and adapt remaining samples."""
for lims_sample in self.lims_samples:
lims_sample.process_indexes(run_parameters=self.run_parameters)
for sample in self.samples:
sample.process_indexes(run_parameters=self.run_parameters)
is_reverse_complement: bool = (
self.index_settings.are_i5_override_cycles_reverse_complemented
)
for lane, samples_in_lane in get_samples_by_lane(self.lims_samples).items():
for lane, samples_in_lane in get_samples_by_lane(self.samples).items():
LOG.info(f"Updating barcode mismatch values for samples in lane {lane}")
for lims_sample in samples_in_lane:
lims_sample.update_barcode_mismatches(
for sample in samples_in_lane:
sample.update_barcode_mismatches(
samples_to_compare=samples_in_lane,
is_run_single_index=self.run_parameters.is_single_index,
is_reverse_complement=is_reverse_complement,
Expand Down
10 changes: 5 additions & 5 deletions cg/apps/demultiplex/sample_sheet/sample_sheet_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,25 +3,25 @@

from pydantic import BaseModel

from cg.apps.demultiplex.sample_sheet.sample_models import FlowCellSample
from cg.apps.demultiplex.sample_sheet.sample_models import IlluminaSampleIndexSetting

LOG = logging.getLogger(__name__)


class SampleSheet(BaseModel):
samples: list[FlowCellSample]
samples: list[IlluminaSampleIndexSetting]

def get_non_pooled_lanes_and_samples(self) -> list[tuple[int, str]]:
"""Return tuples of non-pooled lane and sample ids."""
non_pooled_lane_sample_id_pairs: list[tuple[int, str]] = []
non_pooled_samples: list[FlowCellSample] = self.get_non_pooled_samples()
non_pooled_samples: list[IlluminaSampleIndexSetting] = self.get_non_pooled_samples()
for sample in non_pooled_samples:
non_pooled_lane_sample_id_pairs.append((sample.lane, sample.sample_id))
return non_pooled_lane_sample_id_pairs

def get_non_pooled_samples(self) -> list[FlowCellSample]:
def get_non_pooled_samples(self) -> list[IlluminaSampleIndexSetting]:
"""Return samples that are sequenced solo in their lane."""
lane_samples: dict[int, list[FlowCellSample]] = defaultdict(list)
lane_samples: dict[int, list[IlluminaSampleIndexSetting]] = defaultdict(list)
for sample in self.samples:
lane_samples[sample.lane].append(sample)
return [samples[0] for samples in lane_samples.values() if len(samples) == 1]
Expand Down
8 changes: 4 additions & 4 deletions cg/apps/demultiplex/sample_sheet/sample_sheet_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@

from cg.apps.demultiplex.sample_sheet.override_cycles_validator import OverrideCyclesValidator
from cg.apps.demultiplex.sample_sheet.read_sample_sheet import (
get_flow_cell_samples_from_content,
get_samples_from_content,
get_raw_samples_from_content,
validate_samples_unique_per_lane,
)
from cg.apps.demultiplex.sample_sheet.sample_models import FlowCellSample
from cg.apps.demultiplex.sample_sheet.sample_models import IlluminaSampleIndexSetting
from cg.apps.demultiplex.sample_sheet.sample_sheet_models import SampleSheet
from cg.constants.constants import FileFormat
from cg.constants.demultiplexing import NAME_TO_INDEX_SETTINGS, SampleSheetBCLConvertSections
Expand Down Expand Up @@ -115,7 +115,7 @@ def _validate_samples(self) -> None:
"""
LOG.debug("Validating samples")
try:
validated_samples: list[FlowCellSample] = get_flow_cell_samples_from_content(
validated_samples: list[IlluminaSampleIndexSetting] = get_samples_from_content(
sample_sheet_content=self.content
)
except ValidationError as error:
Expand Down Expand Up @@ -181,5 +181,5 @@ def get_sample_sheet_object_from_file(self, file_path: Path) -> SampleSheet:
SampleSheetError: If the sample sheet is not valid.
"""
self.validate_sample_sheet_from_file(file_path)
samples: list[FlowCellSample] = get_flow_cell_samples_from_content(self.content)
samples: list[IlluminaSampleIndexSetting] = get_samples_from_content(self.content)
return SampleSheet(samples=samples)
Loading

0 comments on commit e65ae73

Please sign in to comment.