From b77891925a147b2779a69a8aa4e3291eccda17d0 Mon Sep 17 00:00:00 2001
From: Matthew Solomonson
Date: Tue, 19 Sep 2023 13:43:34 -0400
Subject: [PATCH] Add input validation for step 1 output

---
Reviewer notes (text between the three-dash separator and the first diff
header is not part of the commit message):

 * The line breaks and Python indentation of this patch were reconstructed
   from a copy that had been collapsed onto three lines. The rebuilt hunks
   agree with the hunk headers and with the diffstat below (134 added lines
   in the new file; 23 insertions and 6 deletions in test_inputs.py).
 * prepare_variants_step1.py imports InSilicoPredictors twice, and imports
   Dict and TranscriptConsequence without referencing them.
 * BinDetails and MetricsDetail declare the same four fields
   (bin_edges, bin_freq, n_smaller, n_larger).
 * The new file mixes builtin generics (list[...], set[...]) with
   typing.List / typing.Set.
 * The comment on MetricValue.value is an open question ("Is it OK if some
   are none?") that should be resolved or turned into a TODO.
 * test_inputs.py assigns step1_task at module level, but every test in the
   visible hunks calls get_task("prepare_gnomad_v4_exome_variants") again.
 * The tests build list comprehensions only for the validation side effect;
   a plain for loop would state the intent more directly.
 * After this change the visible hunks no longer use typing.List in
   test_inputs.py; confirm against the full file whether the import is
   still needed.

 .../gnomad_v4/types/prepare_variants_step1.py | 134 ++++++++++++++++++
 data-pipeline/tests/v4/test_inputs.py         |  29 +++-
 2 files changed, 157 insertions(+), 6 deletions(-)
 create mode 100644 data-pipeline/src/data_pipeline/datasets/gnomad_v4/types/prepare_variants_step1.py

diff --git a/data-pipeline/src/data_pipeline/datasets/gnomad_v4/types/prepare_variants_step1.py b/data-pipeline/src/data_pipeline/datasets/gnomad_v4/types/prepare_variants_step1.py
new file mode 100644
index 000000000..2c3284c12
--- /dev/null
+++ b/data-pipeline/src/data_pipeline/datasets/gnomad_v4/types/prepare_variants_step1.py
@@ -0,0 +1,134 @@
+import attr
+from typing import List, Set, Dict, Union
+
+
+from data_pipeline.datasets.gnomad_v4.types.initial_variant import (
+    InSilicoPredictors,
+    Vep,
+    TranscriptConsequence,
+    Rf,
+    InSilicoPredictors,
+    Grpmax,
+    Locus,
+)
+
+
+@attr.define
+class Population:
+    id: str
+    ac: int
+    an: int
+    hemizygote_count: int
+    homozygote_count: int
+
+
+@attr.define
+class Freq:
+    ac: int
+    ac_raw: int
+    an: int
+    hemizygote_count: int
+    homozygote_count: int
+    populations: list[Population]
+
+
+@attr.define
+class FreqBySubset:
+    all: Freq
+    non_ukb: Freq
+
+
+@attr.define
+class FAF:
+    popmax: float
+    popmax_population: str
+
+
+@attr.define
+class BinDetails:
+    bin_edges: list[float]
+    bin_freq: list[int]
+    n_smaller: int
+    n_larger: int
+
+
+@attr.define
+class MetricsDetail:
+    bin_edges: list[float]
+    bin_freq: list[int]
+    n_smaller: int
+    n_larger: int
+
+
+@attr.define
+class MetricValue:
+    metric: str
+    value: Union[float, None]  # Is it OK if some are none?
+
+
+@attr.define
+class AlleleBalanceQualityMetrics:
+    alt_adj: MetricsDetail
+    alt_raw: MetricsDetail
+
+
+@attr.define
+class GenotypeDepthQualityMetrics:
+    all_adj: MetricsDetail
+    all_raw: MetricsDetail
+    alt_adj: MetricsDetail
+    alt_raw: MetricsDetail
+
+
+@attr.define
+class GenotypeQualityQualityMetrics:
+    all_adj: MetricsDetail
+    all_raw: MetricsDetail
+    alt_adj: MetricsDetail
+    alt_raw: MetricsDetail
+
+
+@attr.define
+class QualityMetrics:
+    allele_balance: AlleleBalanceQualityMetrics
+    genotype_depth: GenotypeDepthQualityMetrics
+    genotype_quality: GenotypeQualityQualityMetrics
+    site_quality_metrics: List[MetricValue]
+
+
+@attr.define
+class AgeDistributions:
+    het: List[BinDetails]
+    hom: List[BinDetails]
+
+
+@attr.define
+class Gnomad:
+    freq: FreqBySubset
+    faf95: FAF
+    faf99: FAF
+    age_distribution: AgeDistributions
+    filters: set[str]
+    quality_metrics: QualityMetrics
+
+
+@attr.define
+class ColocatedVariants:
+    all: List[str]
+    non_ukb: List[str]
+
+
+@attr.define
+class Variant:
+    locus: Locus
+    alleles: list[str]
+    grpmax: List[Grpmax]
+    rsids: Union[Set[str], None]
+    vep: Union[Vep, None]
+    rf: Rf
+    in_silico_predictors: InSilicoPredictors
+    variant_id: str
+    colocated_variants: ColocatedVariants
+    gnomad: Gnomad
+    subsets: set[str]
+    flags: set[str]
diff --git a/data-pipeline/tests/v4/test_inputs.py b/data-pipeline/tests/v4/test_inputs.py
index c58890cd2..e0f40fd25 100644
--- a/data-pipeline/tests/v4/test_inputs.py
+++ b/data-pipeline/tests/v4/test_inputs.py
@@ -1,11 +1,18 @@
 from typing import List
-from cattrs import structure, transform_error
-from data_pipeline.datasets.gnomad_v4.types.initial_variant import InitialVariant
+from cattrs import structure, transform_error, structure_attrs_fromdict
 import hail as hl
 import json
 from loguru import logger
 
+from data_pipeline.pipelines.gnomad_v4_variants import (
+    pipeline as gnomad_v4_variant_pipeline,
+)
+
 from data_pipeline.datasets.gnomad_v4.types.initial_globals import Globals
+from data_pipeline.datasets.gnomad_v4.types.initial_variant import InitialVariant
+from data_pipeline.datasets.gnomad_v4.types.prepare_variants_step1 import Variant as Step1Variant
+
+step1_task = gnomad_v4_variant_pipeline.get_task("prepare_gnomad_v4_exome_variants")
 
 
 def ht_to_json(ht: hl.Table, field: str = "row"):
@@ -25,14 +32,24 @@ def ht_to_json(ht: hl.Table, field: str = "row"):
 
 
 def test_globals_input_validation():
-    ht = hl.read_table("./data/mock_v4_release.ht")
+    input_path = gnomad_v4_variant_pipeline.get_task("prepare_gnomad_v4_exome_variants").get_inputs()["input_path"]
+    ht = hl.read_table(input_path)
     result = ht_to_json(ht, "globals")[0]
     logger.info(result)
     structure(result, Globals)
 
 
-def test_variant_input_validation():
-    ht = hl.read_table("./data/mock_v4_release.ht")
+def test_validate_variant_input():
+    input_path = gnomad_v4_variant_pipeline.get_task("prepare_gnomad_v4_exome_variants").get_inputs()["input_path"]
+    ht = hl.read_table(input_path)
+    # ht = ht.sample(0.1, seed=1234)
+    result = ht_to_json(ht)
+    [structure_attrs_fromdict(variant, InitialVariant) for variant in result]
+
+
+def test_validate_step1_output():
+    output_path = gnomad_v4_variant_pipeline.get_task("prepare_gnomad_v4_exome_variants").get_output_path()
+    ht = hl.read_table(output_path)
     # ht = ht.sample(0.1, seed=1234)
     result = ht_to_json(ht)
-    structure(result, List[InitialVariant])
+    [structure_attrs_fromdict(variant, Step1Variant) for variant in result]