diff --git a/data-pipeline/schemas/gnomad_v4_variants/prepare_gnomad_v4_exome_variants/input_path/mock_v4_release.ht.schema b/data-pipeline/schemas/gnomad_v4_variants/prepare_gnomad_v4_exome_variants/input_path/mock_v4_release.ht.schema new file mode 100644 index 000000000..cb85362bc --- /dev/null +++ b/data-pipeline/schemas/gnomad_v4_variants/prepare_gnomad_v4_exome_variants/input_path/mock_v4_release.ht.schema @@ -0,0 +1,338 @@ +---------------------------------------- +Global fields: + 'freq_meta': array> + 'freq_index_dict': dict + 'faf_meta': array> + 'faf_index_dict': dict + 'freq_sample_count': array + 'filtering_model': struct { + model_name: str, + score_name: str, + feature_medians: dict, + variants_by_strata: dict, + features_importance: dict, + features: array, + test_results: array, + rf_snv_cutoff: struct { + bin: float64, + min_score: float64 + }, + rf_indel_cutoff: struct { + bin: float64, + min_score: float64 + }, + inbreeding_cutoff: float64, + model_id: str + } + 'tool_versions': struct { + dbsnp_version: str, + cadd_version: str, + revel_version: str, + splicaai_version: str, + primateai_version: str, + pangolin_version: str, + vrs_version: str + } + 'vep_globals': struct { + vep_version: str, + vep_csq_header: str, + vep_help: str, + vep_config: str + } + 'age_distribution': struct { + bin_edges: array, + bin_freq: array, + n_smaller: int32, + n_larger: int32 + } + 'age_index_dict': dict + 'age_meta': array> + 'grpmax_index_dict': dict + 'grpmax_meta': array> + 'README': dict + 'gnomad_qc_repo': str + 'gnomad_methods_repo': str +---------------------------------------- +Row fields: + 'locus': locus + 'alleles': array + 'freq': array + 'grpmax': array + 'faf': array + 'a_index': int32 + 'was_split': bool + 'rsid': set + 'filters': set + 'info': struct { + QUALapprox: int64, + SB: array, + MQ: float64, + MQRankSum: float64, + VarDP: int32, + AS_ReadPosRankSum: float64, + AS_pab_max: float64, + AS_QD: float32, + AS_MQ: float64, + QD: float32, + AS_MQRankSum: float64, + FS: float64, + AS_FS: float64, + ReadPosRankSum: float64, + AS_QUALapprox: int64, + AS_SB_TABLE: array, + AS_VarDP: int32, + AS_SOR: float64, + SOR: float64, + singleton: bool, + transmitted_singleton: bool, + omni: bool, + mills: bool, + monoallelic: bool, + AS_VQSLOD: float64, + InbreedingCoeff: float64, + vrs: struct { + VRS_Allele_IDs: array, + VRS_Starts: array, + VRS_Ends: array, + VRS_States: array + } + } + 'vep': struct { + allele_string: str, + end: int32, + id: str, + input: str, + intergenic_consequences: array, + impact: str, + variant_allele: str + }>, + most_severe_consequence: str, + motif_feature_consequences: array, + high_inf_pos: str, + impact: str, + motif_feature_id: str, + motif_name: str, + motif_pos: int32, + motif_score_change: float64, + transcription_factors: array, + strand: int32, + variant_allele: str + }>, + regulatory_feature_consequences: array, + impact: str, + regulatory_feature_id: str, + variant_allele: str + }>, + seq_region_name: str, + start: int32, + strand: int32, + transcript_consequences: array, + distance: int32, + domains: array, + exon: str, + flags: str, + gene_id: str, + gene_pheno: int32, + gene_symbol: str, + gene_symbol_source: str, + hgnc_id: str, + hgvsc: str, + hgvsp: str, + hgvs_offset: int32, + impact: str, + intron: str, + lof: str, + lof_flags: str, + lof_filter: str, + lof_info: str, + mane_select: str, + mane_plus_clinical: str, + mirna: array, + polyphen_prediction: str, + polyphen_score: float64, + protein_end: int32, + protein_start: int32, + protein_id: str, + sift_prediction: str, + sift_score: float64, + source: str, + strand: int32, + transcript_id: str, + tsl: int32, + uniprot_isoform: array, + variant_allele: str + }>, + variant_class: str + } + 'rf': struct { + rf_positive_label: bool, + rf_negative_label: bool, + rf_label: str, + rf_train: bool, + rf_tp_probability: float64 + } + 'region_flag': struct { + lcr: bool, + segdup: bool, + non_par: bool + } + 'allele_info': struct { + variant_type: str, + allele_type: str, + n_alt_alleles: int32, + was_mixed: bool + } + 'histograms': struct { + qual_hists: struct { + gq_hist_all: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + dp_hist_all: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + gq_hist_alt: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + dp_hist_alt: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + ab_hist_alt: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + } + }, + raw_qual_hists: struct { + gq_hist_all: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + dp_hist_all: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + gq_hist_alt: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + dp_hist_alt: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + ab_hist_alt: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + } + }, + age_hists: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + age_hist_ht: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + } + }> + } + 'in_silico_predictors': struct { + cadd: struct { + phred: float32, + raw_score: float32, + has_duplicate: bool + }, + revel: struct { + revel_score: float64, + has_duplicate: bool + }, + splice_ai: struct { + splice_ai_score: float32, + splice_consequence: str, + has_duplicate: bool + }, + pangolin: struct { + pangolin_score: float64 + } + } +---------------------------------------- +Key: ['locus', 'alleles'] +---------------------------------------- \ No newline at end of file diff --git a/data-pipeline/schemas/gnomad_v4_variants/prepare_gnomad_v4_exome_variants/output/gnomad_v4_exome_variants_base.ht.schema b/data-pipeline/schemas/gnomad_v4_variants/prepare_gnomad_v4_exome_variants/output/gnomad_v4_exome_variants_base.ht.schema new file mode 100644 index 000000000..89fba5a59 --- /dev/null +++ b/data-pipeline/schemas/gnomad_v4_variants/prepare_gnomad_v4_exome_variants/output/gnomad_v4_exome_variants_base.ht.schema @@ -0,0 +1,334 @@ +---------------------------------------- +Global fields: + 'freq_meta': array> + 'freq_index_dict': dict + 'faf_meta': array> + 'faf_index_dict': dict + 'freq_sample_count': array + 'filtering_model': struct { + model_name: str, + score_name: str, + feature_medians: dict, + variants_by_strata: dict, + features_importance: dict, + features: array, + test_results: array, + rf_snv_cutoff: struct { + bin: float64, + min_score: float64 + }, + rf_indel_cutoff: struct { + bin: float64, + min_score: float64 + }, + inbreeding_cutoff: float64, + model_id: str + } + 'tool_versions': struct { + dbsnp_version: str, + cadd_version: str, + revel_version: str, + splicaai_version: str, + primateai_version: str, + pangolin_version: str, + vrs_version: str + } + 'vep_globals': struct { + vep_version: str, + vep_csq_header: str, + vep_help: str, + vep_config: str + } + 'age_distribution': struct { + bin_edges: array, + bin_freq: array, + n_smaller: int32, + n_larger: int32 + } + 'age_index_dict': dict + 'age_meta': array> + 'grpmax_index_dict': dict + 'grpmax_meta': array> + 'README': dict + 'gnomad_qc_repo': str + 'gnomad_methods_repo': str +---------------------------------------- +Row fields: + 'locus': locus + 'alleles': array + 'grpmax': array + 'rsids': set + 'vep': struct { + allele_string: str, + end: int32, + id: str, + input: str, + intergenic_consequences: array, + impact: str, + variant_allele: str + }>, + most_severe_consequence: str, + motif_feature_consequences: array, + high_inf_pos: str, + impact: str, + motif_feature_id: str, + motif_name: str, + motif_pos: int32, + motif_score_change: float64, + transcription_factors: array, + strand: int32, + variant_allele: str + }>, + regulatory_feature_consequences: array, + impact: str, + regulatory_feature_id: str, + variant_allele: str + }>, + seq_region_name: str, + start: int32, + strand: int32, + transcript_consequences: array, + distance: int32, + domains: array, + exon: str, + flags: str, + gene_id: str, + gene_pheno: int32, + gene_symbol: str, + gene_symbol_source: str, + hgnc_id: str, + hgvsc: str, + hgvsp: str, + hgvs_offset: int32, + impact: str, + intron: str, + lof: str, + lof_flags: str, + lof_filter: str, + lof_info: str, + mane_select: str, + mane_plus_clinical: str, + mirna: array, + polyphen_prediction: str, + polyphen_score: float64, + protein_end: int32, + protein_start: int32, + protein_id: str, + sift_prediction: str, + sift_score: float64, + source: str, + strand: int32, + transcript_id: str, + tsl: int32, + uniprot_isoform: array, + variant_allele: str + }>, + variant_class: str + } + 'rf': struct { + rf_positive_label: bool, + rf_negative_label: bool, + rf_label: str, + rf_train: bool, + rf_tp_probability: float64 + } + 'in_silico_predictors': struct { + cadd: struct { + phred: float32, + raw_score: float32, + has_duplicate: bool + }, + revel: struct { + revel_score: float64, + has_duplicate: bool + }, + splice_ai: struct { + splice_ai_score: float32, + splice_consequence: str, + has_duplicate: bool + }, + pangolin: struct { + pangolin_score: float64 + } + } + 'variant_id': str + 'colocated_variants': struct { + all: array, + non_ukb: array + } + 'gnomad': struct { + freq: struct { + all: struct { + ac: int32, + ac_raw: int32, + an: int32, + hemizygote_count: int32, + homozygote_count: int32, + populations: array + }, + non_ukb: struct { + ac: int32, + ac_raw: int32, + an: int32, + hemizygote_count: int32, + homozygote_count: int32, + populations: array + } + }, + faf95: struct { + popmax: float64, + popmax_population: str + }, + faf99: struct { + popmax: float64, + popmax_population: str + }, + age_distribution: struct { + het: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }>, + hom: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }> + }, + filters: set, + quality_metrics: struct { + allele_balance: struct { + alt_adj: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + alt_raw: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + } + }, + genotype_depth: struct { + all_adj: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + all_raw: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + alt_adj: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + alt_raw: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + } + }, + genotype_quality: struct { + all_adj: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + all_raw: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + alt_adj: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + alt_raw: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + } + }, + site_quality_metrics: array + } + } + 'subsets': set + 'flags': set +---------------------------------------- +Key: ['locus', 'alleles'] +---------------------------------------- \ No newline at end of file diff --git a/data-pipeline/src/data_pipeline/datasets/gnomad_v4/gnomad_v4_variants.py b/data-pipeline/src/data_pipeline/datasets/gnomad_v4/gnomad_v4_variants.py index cbd421034..2b8314b8e 100644 --- a/data-pipeline/src/data_pipeline/datasets/gnomad_v4/gnomad_v4_variants.py +++ b/data-pipeline/src/data_pipeline/datasets/gnomad_v4/gnomad_v4_variants.py @@ -15,8 +15,8 @@ def freq_index_key(subset=None, pop=None, sex=None, raw=False): return "-".join(parts) -def prepare_gnomad_v4_variants(path): - ds = hl.read_table(path) +def prepare_gnomad_v4_variants(input_path): + ds = hl.read_table(input_path) g = hl.eval(ds.globals) subsets = set(m.get("subset", None) for m in g.freq_meta) @@ -84,7 +84,7 @@ def subset_filter(subset): ds = ds.annotate(in_autosome_or_par=ds.locus.in_autosome_or_par()) ds = ds.annotate( - exome=hl.struct( + gnomad=hl.struct( freq=hl.struct( **{ subset @@ -124,15 +124,15 @@ def subset_filter(subset): # If a variant is not present in a subset, do not store population frequencies for that subset ds = ds.annotate( - exome=ds.exome.annotate( - freq=ds.exome.freq.annotate( + gnomad=ds.gnomad.annotate( + freq=ds.gnomad.freq.annotate( **{ subset - or "all": ds.exome.freq[subset or "all"].annotate( + or "all": ds.gnomad.freq[subset or "all"].annotate( populations=hl.if_else( - ds.exome.freq[subset or "all"].ac_raw == 0, - hl.empty_array(ds.exome.freq[subset or "all"].populations.dtype.element_type), - ds.exome.freq[subset or "all"].populations, + ds.gnomad.freq[subset or "all"].ac_raw == 0, + hl.empty_array(ds.gnomad.freq[subset or "all"].populations.dtype.element_type), + ds.gnomad.freq[subset or "all"].populations, ) ) for subset in subsets @@ -149,7 +149,7 @@ def subset_filter(subset): ds = ds.annotate( subsets=hl.set( - hl.array([(subset, ds.exome.freq[subset].ac_raw > 0) for subset in subsets if subset is not None]) + hl.array([(subset, ds.gnomad.freq[subset].ac_raw > 0) for subset in subsets if subset is not None]) .filter(lambda t: t[1]) .map(lambda t: t[0]) ) @@ -163,7 +163,7 @@ def subset_filter(subset): # Get popmax FAFs ds = ds.annotate( - exome=ds.exome.annotate( + gnomad=ds.gnomad.annotate( faf95=hl.rbind( hl.sorted( hl.array( @@ -196,7 +196,7 @@ def subset_filter(subset): hl.struct(popmax=hl.null(hl.tfloat), popmax_population=hl.null(hl.tstr)), ), ), - ) + ), ) ds = ds.drop("faf") @@ -206,7 +206,7 @@ def subset_filter(subset): #################### ds = ds.annotate( - exome=ds.exome.annotate( + gnomad=ds.gnomad.annotate( age_distribution=hl.struct( het=ds.histograms.age_hists.age_hist_ht, hom=ds.histograms.age_hists.age_hist_hom ) @@ -218,7 +218,7 @@ def subset_filter(subset): ################### ds = ds.annotate( - exome=ds.exome.annotate( + gnomad=ds.gnomad.annotate( filters=ds.filters, quality_metrics=hl.struct( allele_balance=hl.struct( diff --git a/data-pipeline/src/data_pipeline/datasets/gnomad_v4/types/initial_variant.py b/data-pipeline/src/data_pipeline/datasets/gnomad_v4/types/initial_variant.py index 0e4e36978..2d810ef5a 100644 --- a/data-pipeline/src/data_pipeline/datasets/gnomad_v4/types/initial_variant.py +++ b/data-pipeline/src/data_pipeline/datasets/gnomad_v4/types/initial_variant.py @@ -130,69 +130,69 @@ class Domain: @attr.define class TranscriptConsequence: - allele_num: int - amino_acids: str - appris: str - biotype: str + allele_num: Union[int, None] + amino_acids: Union[str, None] + appris: Union[str, None] + biotype: Union[str, None] canonical: Union[int, None] - ccds: str + ccds: Union[str, None] cdna_start: Union[int, None] cdna_end: Union[int, None] cds_end: Union[int, None] cds_start: Union[int, None] - codons: str + codons: Union[str, None] consequence_terms: List[str] - distance: int + distance: Union[int, None] domains: Union[List[Domain], None] - exon: str - flags: str - gene_id: str + exon: Union[str, None] + flags: Union[str, None] + gene_id: Union[str, None] gene_pheno: Union[int, None] - gene_symbol: str - gene_symbol_source: str - hgnc_id: str - hgvsc: str - hgvsp: str + gene_symbol: Union[str, None] + gene_symbol_source: Union[str, None] + hgnc_id: Union[str, None] + hgvsc: Union[str, None] + hgvsp: Union[str, None] hgvs_offset: Union[int, None] - impact: str - intron: str - lof: str - lof_flags: str - lof_filter: str - lof_info: str - mane_select: str - mane_plus_clinical: str + impact: Union[str, None] + intron: Union[str, None] + lof: Union[str, None] + lof_flags: Union[str, None] + lof_filter: Union[str, None] + lof_info: Union[str, None] + mane_select: Union[str, None] + mane_plus_clinical: Union[str, None] mirna: Union[List[str], None] - polyphen_prediction: str + polyphen_prediction: Union[str, None] polyphen_score: Union[float, None] protein_end: Union[int, None] protein_start: Union[int, None] - protein_id: str + protein_id: Union[str, None] sift_prediction: Union[str, None] sift_score: Union[float, None] - source: str - strand: int - transcript_id: str + source: Union[str, None] + strand: Union[int, None] + transcript_id: Union[str, None] tsl: Union[int, None] uniprot_isoform: Union[List[str], None] - variant_allele: str + variant_allele: Union[str, None] @attr.define class Vep: - allele_string: str - end: int - id: str - input: str + allele_string: Union[str, None] + end: Union[int, None] + id: Union[str, None] + input: Union[str, None] intergenic_consequences: Union[List[Consequence], None] - most_severe_consequence: str + most_severe_consequence: Union[str, None] motif_feature_consequences: Union[List[Consequence], None] regulatory_feature_consequences: Union[List[Consequence], None] - seq_region_name: str - start: int - strand: int + seq_region_name: Union[str, None] + start: Union[int, None] + strand: Union[int, None] transcript_consequences: List[TranscriptConsequence] - variant_class: str + variant_class: Union[str, None] @attr.define diff --git a/data-pipeline/src/data_pipeline/pipelines/gnomad_v4_variants.py b/data-pipeline/src/data_pipeline/pipelines/gnomad_v4_variants.py index 0e53ea055..6da147d37 100644 --- a/data-pipeline/src/data_pipeline/pipelines/gnomad_v4_variants.py +++ b/data-pipeline/src/data_pipeline/pipelines/gnomad_v4_variants.py @@ -1,51 +1,64 @@ from data_pipeline.pipeline import Pipeline, run_pipeline -from data_pipeline.data_types.variant import annotate_variants, annotate_transcript_consequences - +from data_pipeline.config import config from data_pipeline.datasets.gnomad_v4.gnomad_v4_variants import prepare_gnomad_v4_variants -from data_pipeline.pipelines.gnomad_v4_coverage import pipeline as coverage_pipeline -from data_pipeline.pipelines.genes import pipeline as genes_pipeline +# from data_pipeline.data_types.variant import annotate_variants, annotate_transcript_consequences +# from data_pipeline.pipelines.gnomad_v4_coverage import pipeline as coverage_pipeline +# from data_pipeline.pipelines.genes import pipeline as genes_pipeline -pipeline = Pipeline() -pipeline.add_task( - "prepare_gnomad_v4_variants", - prepare_gnomad_v4_variants, - "/gnomad_v4/gnomad_v4_variants_base.ht", - {"path": "gs://gnomad-matt-data-pipeline/external_sources/2023-09-07-exome-variants-v4-mock/mock_v4_release.ht"}, -) +pipeline = Pipeline(name="gnomad_v4_variants") pipeline.add_task( - "annotate_gnomad_v4_variants", - annotate_variants, - "/gnomad_v4/gnomad_v4_variants_annotated_1.ht", - { - "variants_path": pipeline.get_task("prepare_gnomad_v4_variants"), - # We need to subset regions chr1:10030:10150 - "exome_coverage_path": coverage_pipeline.get_output("exome_coverage"), - "genome_coverage_path": coverage_pipeline.get_output("genome_coverage"), - # "caids_path": "gs://gnomad-browser-data-pipeline/caids/gnomad_v4_caids.ht", + name="prepare_gnomad_v4_exome_variants", + task_function=prepare_gnomad_v4_variants, + output_path="/gnomad_v4/gnomad_v4_exome_variants_base.ht", + inputs={ + "input_path": "external_datasets/mock_v4_release.ht", }, + # params={"sequencing_type": "exome"}, ) -pipeline.add_task( - "annotate_gnomad_v4_transcript_consequences", - annotate_transcript_consequences, - "/gnomad_v4/gnomad_v4_variants_annotated_2.ht", - { - "variants_path": pipeline.get_task("annotate_gnomad_v4_variants"), - "transcripts_path": genes_pipeline.get_output("base_transcripts_grch38"), - "mane_transcripts_path": genes_pipeline.get_output("mane_select_transcripts"), - }, -) +# pipeline.add_task( +# "prepare_gnomad_v4_genome_variants", +# prepare_gnomad_v4_variants, +# "/gnomad_v4/gnomad_v4_genome_variants_base.ht", +# { +# "path": "gs://gnomad-matt-data-pipeline/external_sources/2023-09-07-exome-variants-v4-mock/mock_v4_release.ht", +# "type": "genome", +# }, +# ) + +# pipeline.add_task( +# "annotate_gnomad_v4_variants", +# annotate_variants, +# "/gnomad_v4/gnomad_v4_variants_annotated_1.ht", +# { +# "variants_path": pipeline.get_task("prepare_gnomad_v4_exome_variants"), +# "exome_coverage_path": coverage_pipeline.get_output("exome_coverage"), +# "genome_coverage_path": coverage_pipeline.get_output("genome_coverage"), +# # "caids_path": "gs://gnomad-browser-data-pipeline/caids/gnomad_v4_caids.ht", +# }, +# ) + +# pipeline.add_task( +# "annotate_gnomad_v4_transcript_consequences", +# annotate_transcript_consequences, +# "/gnomad_v4/gnomad_v4_variants_annotated_2.ht", +# { +# "variants_path": pipeline.get_task("annotate_gnomad_v4_variants"), +# "transcripts_path": genes_pipeline.get_output("base_transcripts_grch38"), +# "mane_transcripts_path": genes_pipeline.get_output("mane_select_transcripts"), +# }, +# ) ############################################### # Outputs ############################################### -pipeline.set_outputs({"variants": "annotate_gnomad_v4_transcript_consequences"}) +# pipeline.set_outputs({"variants": "annotate_gnomad_v4_transcript_consequences"}) ############################################### # Run