Skip to content

Commit

Permalink
Add input validation for step 1 output
Browse files Browse the repository at this point in the history
  • Loading branch information
mattsolo1 committed Sep 19, 2023
1 parent fc6e346 commit b778919
Show file tree
Hide file tree
Showing 2 changed files with 157 additions and 6 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
import attr
from typing import List, Set, Dict, Union


from data_pipeline.datasets.gnomad_v4.types.initial_variant import (
InSilicoPredictors,
Vep,
TranscriptConsequence,
Rf,
InSilicoPredictors,
Grpmax,
Locus,
)


@attr.define
class Population:
    """Counts for a single population entry inside a ``Freq`` record.

    NOTE(review): ``ac`` / ``an`` presumably stand for allele count and
    allele number -- confirm against the table schema.
    """

    id: str
    ac: int
    an: int
    hemizygote_count: int
    homozygote_count: int


@attr.define
class Freq:
    """Overall counts plus the per-population breakdown.

    NOTE(review): ``ac_raw`` is presumably the count before any
    filtering/adjustment -- confirm against the table schema.
    """

    ac: int
    ac_raw: int
    an: int
    hemizygote_count: int
    homozygote_count: int
    # One entry per population.
    populations: list[Population]


@attr.define
class FreqBySubset:
    """One ``Freq`` record per subset: ``all`` and ``non_ukb``."""

    all: Freq
    non_ukb: Freq


@attr.define
class FAF:
    """A popmax value together with the population it was observed in.

    Used for the ``faf95`` and ``faf99`` fields of ``Gnomad``.
    """

    popmax: float
    popmax_population: str


@attr.define
class BinDetails:
    """Binned distribution used by ``AgeDistributions``.

    NOTE(review): ``n_smaller`` / ``n_larger`` presumably count the
    observations falling below / above the binned range -- confirm.
    """

    bin_edges: list[float]
    bin_freq: list[int]
    n_smaller: int
    n_larger: int


@attr.define
class MetricsDetail:
    """Binned distribution used by the quality-metric classes.

    Field-for-field identical to ``BinDetails``, but kept as its own type.
    """

    bin_edges: list[float]
    bin_freq: list[int]
    n_smaller: int
    n_larger: int


@attr.define
class MetricValue:
    """A named metric and its (possibly missing) numeric value."""

    metric: str
    # TODO(review): confirm it is acceptable for some values to be None.
    value: Union[float, None]


@attr.define
class AlleleBalanceQualityMetrics:
    """Allele-balance distributions; only the ``alt_*`` variants exist here."""

    alt_adj: MetricsDetail
    alt_raw: MetricsDetail


@attr.define
class GenotypeDepthQualityMetrics:
    """Genotype-depth distributions in ``all``/``alt`` x ``adj``/``raw`` variants."""

    all_adj: MetricsDetail
    all_raw: MetricsDetail
    alt_adj: MetricsDetail
    alt_raw: MetricsDetail


@attr.define
class GenotypeQualityQualityMetrics:
    """Genotype-quality distributions in ``all``/``alt`` x ``adj``/``raw`` variants."""

    all_adj: MetricsDetail
    all_raw: MetricsDetail
    alt_adj: MetricsDetail
    alt_raw: MetricsDetail


@attr.define
class QualityMetrics:
    """Groups the three binned metric families and the site-level metrics."""

    allele_balance: AlleleBalanceQualityMetrics
    genotype_depth: GenotypeDepthQualityMetrics
    genotype_quality: GenotypeQualityQualityMetrics
    # Flat list of (metric name, value) records.
    site_quality_metrics: list[MetricValue]


@attr.define
class AgeDistributions:
    """Binned distributions kept separately under ``het`` and ``hom``.

    NOTE(review): presumably heterozygous / homozygous carriers -- confirm.
    """

    het: list[BinDetails]
    hom: list[BinDetails]


@attr.define
class Gnomad:
    """Contents of the ``gnomad`` field of ``Variant``.

    Bundles frequencies, the two FAF records, age distributions, filters
    and quality metrics.
    """

    freq: FreqBySubset
    faf95: FAF
    faf99: FAF
    age_distribution: AgeDistributions
    filters: set[str]
    quality_metrics: QualityMetrics


@attr.define
class ColocatedVariants:
    """Lists of strings per subset (``all`` and ``non_ukb``).

    NOTE(review): the strings are presumably variant IDs -- confirm.
    """

    all: list[str]
    non_ukb: list[str]


@attr.define
class Variant:
    """Top-level variant record.

    Reuses ``Locus``, ``Grpmax``, ``Vep``, ``Rf`` and ``InSilicoPredictors``
    from the ``initial_variant`` types and adds the fields declared below.
    """

    locus: Locus
    alleles: list[str]
    grpmax: list[Grpmax]
    # rsids and vep may be absent (None).
    rsids: Union[set[str], None]
    vep: Union[Vep, None]
    rf: Rf
    in_silico_predictors: InSilicoPredictors
    variant_id: str
    colocated_variants: ColocatedVariants
    gnomad: Gnomad
    subsets: set[str]
    flags: set[str]
flags: set[str]
29 changes: 23 additions & 6 deletions data-pipeline/tests/v4/test_inputs.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,18 @@
from typing import List
from cattrs import structure, transform_error
from data_pipeline.datasets.gnomad_v4.types.initial_variant import InitialVariant
from cattrs import structure, transform_error, structure_attrs_fromdict
import hail as hl
import json
from loguru import logger

from data_pipeline.pipelines.gnomad_v4_variants import (
pipeline as gnomad_v4_variant_pipeline,
)

from data_pipeline.datasets.gnomad_v4.types.initial_globals import Globals
from data_pipeline.datasets.gnomad_v4.types.initial_variant import InitialVariant
from data_pipeline.datasets.gnomad_v4.types.prepare_variants_step1 import Variant as Step1Variant

# Handle to the "prepare_gnomad_v4_exome_variants" task of the gnomAD v4 variant pipeline.
step1_task = gnomad_v4_variant_pipeline.get_task("prepare_gnomad_v4_exome_variants")


def ht_to_json(ht: hl.Table, field: str = "row"):
Expand All @@ -25,14 +32,24 @@ def ht_to_json(ht: hl.Table, field: str = "row"):


def test_globals_input_validation():
    """Check that the input table's globals structure into ``Globals``.

    The table path is taken from the inputs declared by the
    ``prepare_gnomad_v4_exome_variants`` task, so the test reads the same
    table the pipeline does.
    """
    task = gnomad_v4_variant_pipeline.get_task("prepare_gnomad_v4_exome_variants")
    input_path = task.get_inputs()["input_path"]
    ht = hl.read_table(input_path)
    # Only the first element returned for the "globals" field is validated.
    result = ht_to_json(ht, "globals")[0]
    logger.info(result)
    # cattrs raises when the record does not match the Globals schema.
    structure(result, Globals)


def test_validate_variant_input():
    """Check that every row of the input table structures into ``InitialVariant``.

    The table path is taken from the inputs declared by the
    ``prepare_gnomad_v4_exome_variants`` task.
    """
    task = gnomad_v4_variant_pipeline.get_task("prepare_gnomad_v4_exome_variants")
    input_path = task.get_inputs()["input_path"]
    ht = hl.read_table(input_path)
    # ht = ht.sample(0.1, seed=1234)
    result = ht_to_json(ht)
    # Structure row by row; cattrs raises on the first row that does not
    # match the schema. A plain loop is used because only the side effect
    # (validation) matters, not the structured objects.
    for variant in result:
        structure_attrs_fromdict(variant, InitialVariant)


def test_validate_step1_output():
    """Check that every row of the step 1 output structures into ``Step1Variant``.

    The table path is the output path reported by the
    ``prepare_gnomad_v4_exome_variants`` task.
    """
    task = gnomad_v4_variant_pipeline.get_task("prepare_gnomad_v4_exome_variants")
    output_path = task.get_output_path()
    ht = hl.read_table(output_path)
    # ht = ht.sample(0.1, seed=1234)
    result = ht_to_json(ht)
    # The output is validated against the step 1 schema only; checking it
    # against the input schema (InitialVariant) does not belong here.
    for variant in result:
        structure_attrs_fromdict(variant, Step1Variant)

0 comments on commit b778919

Please sign in to comment.