Skip to content

Commit

Permalink
refactor(sequencing metrics parser) (#3241) (patch)
Browse files Browse the repository at this point in the history
# Description

Refactoring of sequencing metrics parser
  • Loading branch information
ChrOertlin authored May 20, 2024
1 parent 75afb17 commit 71217a1
Show file tree
Hide file tree
Showing 19 changed files with 137 additions and 103 deletions.
76 changes: 0 additions & 76 deletions cg/apps/sequencing_metrics_parser/api.py

This file was deleted.

6 changes: 4 additions & 2 deletions cg/meta/demultiplex/demux_post_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,10 @@ class DemuxPostProcessingAPI:

def __init__(self, config: CGConfig) -> None:
self.config: CGConfig = config
self.flow_cells_dir: Path = Path(config.illumina_flow_cells_directory)
self.demultiplexed_runs_dir: Path = Path(config.illumina_demultiplexed_runs_directory)
self.flow_cells_dir: Path = Path(config.run_instruments.illumina.flow_cell_runs_dir)
self.demultiplexed_runs_dir: Path = Path(
config.run_instruments.illumina.demultiplexed_runs_dir
)
self.status_db: Store = config.status_db
self.hk_api: HousekeeperAPI = config.housekeeper_api
self.dry_run: bool = False
Expand Down
16 changes: 10 additions & 6 deletions cg/meta/demultiplex/status_db_storage_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,15 @@
import datetime
import logging

from cg.apps.sequencing_metrics_parser.api import (
create_sample_lane_sequencing_metrics_for_flow_cell,
create_undetermined_non_pooled_metrics,
)

from cg.constants import FlowCellStatus
from cg.meta.demultiplex.combine_sequencing_metrics import combine_mapped_metrics_with_undetermined
from cg.meta.demultiplex.utils import get_q30_threshold
from cg.models.flow_cell.flow_cell import FlowCellDirectoryData
from cg.services.bcl_convert_metrics_service.bcl_convert_metrics_service import (
BCLConvertMetricsService,
)

from cg.store.models import Flowcell, Sample, SampleLaneSequencingMetrics
from cg.store.store import Store

Expand Down Expand Up @@ -43,11 +44,14 @@ def store_flow_cell_data_in_status_db(


def store_sequencing_metrics_in_status_db(flow_cell: FlowCellDirectoryData, store: Store) -> None:
metrics_service = BCLConvertMetricsService()
mapped_metrics: list[SampleLaneSequencingMetrics] = (
create_sample_lane_sequencing_metrics_for_flow_cell(flow_cell_directory=flow_cell.path)
metrics_service.create_sample_lane_sequencing_metrics_for_flow_cell(
flow_cell_directory=flow_cell.path
)
)
undetermined_metrics: list[SampleLaneSequencingMetrics] = (
create_undetermined_non_pooled_metrics(flow_cell)
metrics_service.create_undetermined_non_pooled_metrics(flow_cell)
)

combined_metrics = combine_mapped_metrics_with_undetermined(
Expand Down
4 changes: 3 additions & 1 deletion cg/meta/demultiplex/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,9 @@ def validate_flow_cell_has_fastq_files(flow_cell: FlowCellDirectoryData) -> None
if fastq_files:
LOG.debug(f"Flow cell {flow_cell.id} has at least one sample with fastq files")
return
raise MissingFilesError(f"No fastq files were found for any sample in flow cell {flow_cell.id}")
raise MissingFilesError(
f"No fastq files were found for any sample in flow cell {flow_cell.id} path: {flow_cell.path}"
)


def is_flow_cell_ready_for_postprocessing(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
from datetime import datetime
from pathlib import Path


from cg.constants.demultiplexing import UNDETERMINED
from cg.models.flow_cell.flow_cell import FlowCellDirectoryData
from cg.services.bcl_convert_metrics_service.parser import MetricsParser
from cg.store.models import SampleLaneSequencingMetrics
from cg.utils.flow_cell import get_flow_cell_id


class BCLConvertMetricsService:
    """Create sample lane sequencing metrics from the BCLConvert metrics of a flow cell."""

    def create_sample_lane_sequencing_metrics_for_flow_cell(
        self,
        flow_cell_directory: Path,
    ) -> list[SampleLaneSequencingMetrics]:
        """Return one sequencing metrics entry for every sample and lane the parser reports."""
        parser = MetricsParser(flow_cell_directory)
        # Samples are visited in the order returned by the parser; lanes are looked up per sample.
        return [
            self._create_bcl_convert_sequencing_metrics(
                sample_internal_id=sample_id, lane=lane, metrics_parser=parser
            )
            for sample_id in parser.get_sample_internal_ids()
            for lane in parser.get_lanes_for_sample(sample_internal_id=sample_id)
        ]

    def create_undetermined_non_pooled_metrics(
        self,
        flow_cell: FlowCellDirectoryData,
    ) -> list[SampleLaneSequencingMetrics]:
        """Return metrics for the undetermined reads of each non-pooled lane that has any.

        Each returned entry is attributed to the sample that the sample sheet pairs with the lane.
        """
        lanes_with_samples: list[tuple[int, str]] = (
            flow_cell.sample_sheet.get_non_pooled_lanes_and_samples()
        )
        parser = MetricsParser(flow_cell.path)

        collected: list[SampleLaneSequencingMetrics] = []
        for lane, sample_id in lanes_with_samples:
            if parser.has_undetermined_reads_in_lane(lane):
                # The undetermined reads are looked up under the sample id "Undetermined",
                # which is how BclConvert tags them within a lane.
                entry: SampleLaneSequencingMetrics = self._create_bcl_convert_sequencing_metrics(
                    sample_internal_id=UNDETERMINED, lane=lane, metrics_parser=parser
                )
                # Re-assign the entry to the sample that occupies the lane.
                entry.sample_internal_id = sample_id
                collected.append(entry)
        return collected

    @staticmethod
    def _create_bcl_convert_sequencing_metrics(
        sample_internal_id: str, lane: int, metrics_parser: MetricsParser
    ) -> SampleLaneSequencingMetrics:
        """Assemble the sequencing metrics entry of a single sample in a single lane."""
        demultiplex_dir_name: str = metrics_parser.bcl_convert_demultiplex_dir.name
        flow_cell_name: str = get_flow_cell_id(demultiplex_dir_name)

        # Every parser lookup below is keyed on the same sample and lane.
        selector: dict = {"sample_internal_id": sample_internal_id, "lane": lane}
        reads: int = metrics_parser.calculate_total_reads_for_sample_in_lane(**selector)
        q30_percent: float = metrics_parser.get_q30_bases_percent_for_sample_in_lane(**selector)
        mean_quality: float = metrics_parser.get_mean_quality_score_for_sample_in_lane(**selector)

        return SampleLaneSequencingMetrics(
            sample_internal_id=sample_internal_id,
            flow_cell_name=flow_cell_name,
            flow_cell_lane_number=lane,
            sample_total_reads_in_lane=reads,
            sample_base_percentage_passing_q30=q30_percent,
            sample_base_mean_quality_score=mean_quality,
            created_at=datetime.now(),
        )
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from typing import Callable

from cg.apps.demultiplex.sample_sheet.validators import is_valid_sample_internal_id
from cg.apps.sequencing_metrics_parser.models import DemuxMetrics, SequencingQualityMetrics

from cg.constants.constants import SCALE_TO_READ_PAIRS, FileFormat
from cg.constants.demultiplexing import UNDETERMINED
from cg.constants.metrics import (
Expand All @@ -14,6 +14,7 @@
QUALITY_METRICS_FILE_NAME,
)
from cg.io.controller import ReadFile
from cg.services.bcl_convert_metrics_service.models import DemuxMetrics, SequencingQualityMetrics
from cg.utils.files import get_file_in_directory

LOG = logging.getLogger(__name__)
Expand Down Expand Up @@ -44,8 +45,9 @@ def __init__(
metrics_model=DemuxMetrics,
)

@staticmethod
def parse_metrics_file(
self, metrics_file_path, metrics_model: Callable
metrics_file_path, metrics_model: Callable
) -> list[SequencingQualityMetrics | DemuxMetrics]:
"""Parse specified metrics file."""
LOG.info(f"Parsing BCLConvert metrics file: {metrics_file_path}")
Expand Down Expand Up @@ -75,8 +77,8 @@ def get_lanes_for_sample(self, sample_internal_id: str) -> list[int]:
lanes_for_sample.append(sample_demux_metric.lane)
return lanes_for_sample

@staticmethod
def get_metrics_for_sample_and_lane(
self,
metrics: list[SequencingQualityMetrics | DemuxMetrics],
sample_internal_id: str,
lane: int,
Expand Down
8 changes: 8 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,9 @@
TaxprofilerSampleSheetEntry,
)
from cg.models.tomte.tomte import TomteParameters, TomteSampleSheetHeaders
from cg.services.bcl_convert_metrics_service.bcl_convert_metrics_service import (
BCLConvertMetricsService,
)
from cg.store.database import create_all_tables, drop_all_tables, initialize_database
from cg.store.models import Bed, BedVersion, Case, Customer, Order, Organism, Sample
from cg.store.store import Store
Expand Down Expand Up @@ -3955,3 +3958,8 @@ def fastq_file_meta_raw(flow_cell_name: str) -> dict:
"flow_cell_id": flow_cell_name,
"undetermined": None,
}


@pytest.fixture()
def bcl_convert_metrics_service() -> BCLConvertMetricsService:
    """Return a BCLConvert metrics service instance."""
    service = BCLConvertMetricsService()
    return service
File renamed without changes.
Empty file.
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,24 @@

import pytest

from cg.apps.sequencing_metrics_parser.models import DemuxMetrics, SequencingQualityMetrics
from cg.apps.sequencing_metrics_parser.parser import MetricsParser

from cg.constants.metrics import DemuxMetricsColumnNames, QualityMetricsColumnNames
from cg.services.bcl_convert_metrics_service.bcl_convert_metrics_service import (
BCLConvertMetricsService,
)
from cg.services.bcl_convert_metrics_service.models import DemuxMetrics, SequencingQualityMetrics
from cg.services.bcl_convert_metrics_service.parser import MetricsParser


@pytest.fixture(scope="session")
def bcl_convert_metrics_dir_path() -> Path:
"""Return a path to a BCLConvert metrics directory."""
return Path(
"tests", "fixtures", "apps", "sequencing_metrics_parser", "230622_A00621_0864_AHY7FFDRX2"
"tests",
"fixtures",
"services",
"bcl_convert_metrics_service",
"230622_A00621_0864_AHY7FFDRX2",
)


Expand Down
Empty file.
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@

import pytest

from cg.apps.sequencing_metrics_parser.models import DemuxMetrics, SequencingQualityMetrics
from cg.apps.sequencing_metrics_parser.parser import MetricsParser
from cg.services.bcl_convert_metrics_service.models import SequencingQualityMetrics, DemuxMetrics
from cg.services.bcl_convert_metrics_service.parser import MetricsParser


def test_parse_metrics(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,25 +2,26 @@

from pathlib import Path

from cg.apps.sequencing_metrics_parser.api import (
create_sample_lane_sequencing_metrics_for_flow_cell,
create_undetermined_non_pooled_metrics,
)
from cg.apps.sequencing_metrics_parser.parser import MetricsParser
from cg.models.flow_cell.flow_cell import FlowCellDirectoryData
from cg.services.bcl_convert_metrics_service.bcl_convert_metrics_service import (
BCLConvertMetricsService,
)

from cg.services.bcl_convert_metrics_service.parser import MetricsParser
from cg.store.models import SampleLaneSequencingMetrics


def test_create_sample_lane_sequencing_metrics_for_flow_cell(
bcl_convert_metrics_dir_path: Path,
parsed_bcl_convert_metrics: MetricsParser,
bcl_convert_metrics_service: BCLConvertMetricsService,
):
"""Test to create sequencing statistics from bcl convert metrics."""
# GIVEN a parsed bcl convert metrics file

# WHEN creating sequencing statistics from bcl convert metrics
sequencing_statistics_list: list[SampleLaneSequencingMetrics] = (
create_sample_lane_sequencing_metrics_for_flow_cell(
bcl_convert_metrics_service.create_sample_lane_sequencing_metrics_for_flow_cell(
flow_cell_directory=bcl_convert_metrics_dir_path,
)
)
Expand All @@ -38,13 +39,16 @@ def test_create_sample_lane_sequencing_metrics_for_flow_cell(

def test_create_undetermined_non_pooled_metrics(
hiseq_x_single_index_demultiplexed_flow_cell_with_sample_sheet: FlowCellDirectoryData,
bcl_convert_metrics_service: BCLConvertMetricsService,
):
"""Test creating undetermined sequencing statistics from demultiplex metrics."""
# GIVEN a directory with a demultiplexed flow cell with undetermined reads

# WHEN creating undetermined sequencing statistics from bcl convert metrics
metrics: list[SampleLaneSequencingMetrics] = create_undetermined_non_pooled_metrics(
flow_cell=hiseq_x_single_index_demultiplexed_flow_cell_with_sample_sheet
metrics: list[SampleLaneSequencingMetrics] = (
bcl_convert_metrics_service.create_undetermined_non_pooled_metrics(
flow_cell=hiseq_x_single_index_demultiplexed_flow_cell_with_sample_sheet
)
)

# THEN metrics are created for the undetermined reads
Expand All @@ -54,6 +58,7 @@ def test_create_undetermined_non_pooled_metrics(

def test_create_undetermined_non_pooled_metrics_for_existing_lane_without_undetermined_reads(
bcl_convert_metrics_dir_path: Path,
bcl_convert_metrics_service: BCLConvertMetricsService,
):
"""
Test creating undetermined sequencing statistics from demultiplex metrics without undetermined
Expand All @@ -66,8 +71,8 @@ def test_create_undetermined_non_pooled_metrics_for_existing_lane_without_undete
flow_cell.set_sample_sheet_path_hk(hk_path=sample_sheet_path)

# WHEN creating undetermined sequencing statistics specifying a lane without undetermined reads
metrics: list[SampleLaneSequencingMetrics] = create_undetermined_non_pooled_metrics(
flow_cell=flow_cell
metrics: list[SampleLaneSequencingMetrics] = (
bcl_convert_metrics_service.create_undetermined_non_pooled_metrics(flow_cell=flow_cell)
)

# THEN an empty list is returned
Expand Down

0 comments on commit 71217a1

Please sign in to comment.