Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add raredisease pedigree check #3687

Merged
merged 13 commits into from
Sep 11, 2024
6 changes: 5 additions & 1 deletion cg/constants/nf_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,16 @@ class NfTowerStatus(StrEnum):
# Metric conditions for Raredisease samples. Each key is a metric name and each value holds
# the comparison to apply ("norm") together with the value to compare against ("threshold").
RAREDISEASE_METRIC_CONDITIONS: dict[str, dict[str, Any]] = {
    "percent_duplicates": {"norm": "lt", "threshold": 20},
    "PCT_PF_UQ_READS_ALIGNED": {"norm": "gt", "threshold": 0.95},
    # Listed once only: a repeated key in a dict display silently overrides the earlier entry.
    "MEDIAN_TARGET_COVERAGE": {"norm": "gt", "threshold": 25},
    "PCT_TARGET_BASES_10X": {"norm": "gt", "threshold": 0.95},
    "PCT_EXC_ADAPTER": {"norm": "lt", "threshold": 0.0005},
    # The threshold is left empty here; it is expected to be filled in per sample at runtime.
    "predicted_sex_sex_check": {"norm": "eq", "threshold": None},
}

# Metric condition for the pedigree check of a sample pair.
# NOTE: the threshold is the string "False", not the boolean False.
RAREDISEASE_PARENT_PEDDY_METRIC_CONDITION: dict[str, dict[str, Any]] = {
    "parent_error_ped_check": {"norm": "eq", "threshold": "False"},
}

RNAFUSION_METRIC_CONDITIONS: dict[str, dict[str, Any]] = {
"uniquely_mapped_percent": {"norm": "gt", "threshold": 60},
"PCT_MRNA_BASES": {"norm": "gt", "threshold": 80},
Expand Down
35 changes: 21 additions & 14 deletions cg/meta/workflow/nf_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -631,11 +631,10 @@ def get_workflow_metrics(self, metric_id: str) -> dict:
def get_multiqc_search_patterns(self, case_id: str) -> dict:
    """Return search patterns for MultiQC.

    Each key is a search pattern and each value corresponds to the metric ID to set in the
    metrics deliverables file. Multiple search patterns can be added. Ideally, used patterns
    should be sample ids, e.g. {sample_id_1: sample_id_1, sample_id_2: sample_id_2}.
    """
    sample_ids: Iterator[str] = self.status_db.get_sample_ids_by_case_id(case_id=case_id)
    # Every sample id is used both as the search pattern and as the metric id.
    return {sample_id: sample_id for sample_id in sample_ids}
peterpru marked this conversation as resolved.
Show resolved Hide resolved

@staticmethod
def get_deduplicated_metrics(metrics: list[MetricsBase]) -> list[MetricsBase]:
Expand All @@ -649,11 +648,13 @@ def get_deduplicated_metrics(metrics: list[MetricsBase]) -> list[MetricsBase]:
deduplicated_metrics.append(metric)
return deduplicated_metrics

def get_multiqc_data_json(self, case_id: str) -> MultiqcDataJson:
    """Read the MultiQC JSON file of a case and return its content as a MultiqcDataJson."""
    multiqc_json_path = self.get_multiqc_json_path(case_id=case_id)
    multiqc_content = read_json(file_path=multiqc_json_path)
    return MultiqcDataJson(**multiqc_content)

def get_multiqc_json_metrics(self, case_id: str) -> list[MetricsBase]:
"""Return a list of the metrics specified in a MultiQC json file."""
multiqc_json = MultiqcDataJson(
**read_json(file_path=self.get_multiqc_json_path(case_id=case_id))
)
multiqc_json: MultiqcDataJson = self.get_multiqc_data_json(case_id=case_id)
metrics = []
for search_pattern, metric_id in self.get_multiqc_search_patterns(case_id=case_id).items():
metrics_for_pattern: list[MetricsBase] = (
Expand All @@ -668,6 +669,14 @@ def get_multiqc_json_metrics(self, case_id: str) -> list[MetricsBase]:
metrics = self.get_deduplicated_metrics(metrics=metrics)
return metrics

@staticmethod
def _is_pattern_found(pattern: str, text: str, exact_match: bool) -> bool:
    """Return whether the pattern is found in the text.

    With exact_match set, the pattern has to be equal to the text. Otherwise it is enough
    that the pattern is contained in the text.
    """
    return pattern == text if exact_match else pattern in text

def get_metrics_from_multiqc_json_with_pattern(
self,
search_pattern: str,
Expand All @@ -678,13 +687,11 @@ def get_metrics_from_multiqc_json_with_pattern(
"""Parse a MultiqcDataJson and returns a list of metrics."""
metrics: list[MetricsBase] = []
for section in multiqc_json.report_general_stats_data:
for section_name, section_values in section.items():
if exact_match:
is_pattern_found: bool = search_pattern == section_name
else:
is_pattern_found: bool = search_pattern in section_name
if is_pattern_found:
for metric_name, metric_value in section_values.items():
for subsection, metrics_dict in section.items():
if self._is_pattern_found(
pattern=search_pattern, text=subsection, exact_match=exact_match
):
for metric_name, metric_value in metrics_dict.items():
metric: MetricsBase = self.get_multiqc_metric(
metric_name=metric_name, metric_value=metric_value, metric_id=metric_id
)
Expand Down
54 changes: 52 additions & 2 deletions cg/meta/workflow/raredisease.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Module for Raredisease Analysis API."""

import logging
from itertools import permutations
from pathlib import Path
from typing import Any

Expand All @@ -20,11 +21,13 @@
RAREDISEASE_COVERAGE_INTERVAL_TYPE,
RAREDISEASE_COVERAGE_THRESHOLD,
RAREDISEASE_METRIC_CONDITIONS,
RAREDISEASE_PARENT_PEDDY_METRIC_CONDITION,
)
from cg.constants.scout import RAREDISEASE_CASE_TAGS
from cg.constants.subject import PlinkPhenotypeStatus, PlinkSex
from cg.meta.workflow.nf_analysis import NfAnalysisAPI
from cg.models.cg_config import CGConfig
from cg.models.deliverables.metric_deliverables import MetricsBase, MultiqcDataJson
from cg.models.raredisease.raredisease import (
RarediseaseParameters,
RarediseaseSampleSheetEntry,
Expand Down Expand Up @@ -149,11 +152,58 @@ def get_managed_variants(self) -> list[str]:
return self._get_managed_variants(genome_build=GenePanelGenomeBuild.hg19)

def get_workflow_metrics(self, sample_id: str) -> dict:
    """Return Raredisease workflow metric conditions for a sample or a sample pair.

    An id containing "-" is treated as a concatenated sample pair and gets the parent
    pedigree check condition. Any other id gets the regular sample conditions, with the
    expected sex taken from the sample.
    """
    from copy import deepcopy

    sample: Sample = self.status_db.get_sample_by_internal_id(internal_id=sample_id)
    if "-" in sample_id:
        return deepcopy(RAREDISEASE_PARENT_PEDDY_METRIC_CONDITION)
    # Deep copy: the sex threshold is written into a nested dict, and a shallow copy would
    # leak that write into the shared module-level constant.
    metric_conditions: dict[str, dict[str, Any]] = deepcopy(RAREDISEASE_METRIC_CONDITIONS)
    self.set_order_sex_for_sample(sample, metric_conditions)
    return metric_conditions

def _get_sample_pair_patterns(self, case_id: str) -> list[str]:
    """Return all ordered pairs of the case's sample ids as MultiQC search patterns.

    Each pattern has the form "<sample id>-<sample id>".
    """
    case_sample_ids = tuple(self.status_db.get_sample_ids_by_case_id(case_id=case_id))
    patterns: list[str] = []
    for first_sample, second_sample in permutations(case_sample_ids, 2):
        patterns.append(f"{first_sample}-{second_sample}")
    return patterns

def get_parent_error_ped_check_metric(
    self, pair_sample_ids: str, multiqc_raw_data: dict[str, dict]
) -> MetricsBase | None:
    """Return the parsed metric for pedigree error given a concatenated pair of sample ids.

    The pair is looked up in the "multiqc_peddy" section of the MultiQC raw data. None is
    returned when the pair has no entry (or an empty one) in that section.

    Raises:
        KeyError: if the raw data has no "multiqc_peddy" section, or the pair entry lacks
            the "parent_error_ped_check" value.
    """
    metric_name: str = "parent_error_ped_check"
    peddy_metrics: dict[str, dict] = multiqc_raw_data["multiqc_peddy"]
    sample_pair_metrics: dict | None = peddy_metrics.get(pair_sample_ids)
    if not sample_pair_metrics:
        return None
    return self.get_multiqc_metric(
        metric_name=metric_name,
        metric_value=sample_pair_metrics[metric_name],
        metric_id=pair_sample_ids,
    )

def get_multiqc_json_metrics(self, case_id: str) -> list[MetricsBase]:
    """Return the deduplicated metrics of a case parsed from its MultiQC json file.

    Metrics are first collected for every MultiQC search pattern of the case, then the
    parent pedigree error metric is added for every sample pair that has one.
    """
    multiqc_json: MultiqcDataJson = self.get_multiqc_data_json(case_id=case_id)
    collected_metrics: list[MetricsBase] = []
    search_patterns = self.get_multiqc_search_patterns(case_id)
    for pattern, metric_id in search_patterns.items():
        collected_metrics.extend(
            self.get_metrics_from_multiqc_json_with_pattern(
                search_pattern=pattern,
                multiqc_json=multiqc_json,
                metric_id=metric_id,
                exact_match=self.is_multiqc_pattern_search_exact,
            )
        )
    for pair_pattern in self._get_sample_pair_patterns(case_id):
        pedigree_metric = self.get_parent_error_ped_check_metric(
            pair_sample_ids=pair_pattern, multiqc_raw_data=multiqc_json.report_saved_raw_data
        )
        # Pairs without a pedigree metric are skipped.
        if pedigree_metric:
            collected_metrics.append(pedigree_metric)
    return self.get_deduplicated_metrics(metrics=collected_metrics)

@staticmethod
def set_order_sex_for_sample(sample: Sample, metric_conditions: dict) -> None:
metric_conditions["predicted_sex_sex_check"]["threshold"] = sample.sex
Expand Down
1 change: 1 addition & 0 deletions cg/models/deliverables/metric_deliverables.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,3 +166,4 @@ class MultiqcDataJson(BaseModel):

report_general_stats_data: list[dict] | None
report_data_sources: dict | None
report_saved_raw_data: dict[str, dict] | None
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ metrics:
value: 24.546493
- condition:
norm: gt
threshold: 26.0
threshold: 25.0
header: null
id: ADM1
input: multiqc_data.json
Expand Down
Loading