diff --git a/Snakefile b/Snakefile index e088368..f1f01f0 100644 --- a/Snakefile +++ b/Snakefile @@ -131,6 +131,7 @@ rule all: f"work/annos/grch38/features/cons/{DV.ucsc_cons_38}/ucsc_conservation.tsv", f"work/annos/grch38/features/ensembl/{DV.ensembl_38}/ensembl_genes.bed.gz", f"work/annos/grch38/features/refseq/{DV.refseq_38}/refseq_genes.bed.gz", + # f"work/annos/grch38/features/clingen_dosage/{DV.today}/clingen_dosage_sensitivity_regions.bed.gz", # # == output directory =================================================================== # @@ -161,6 +162,9 @@ rule all: # ----- conservation f"output/full/annonars/cons-grch37-{DV.ucsc_cons_37}+{PV.annonars}/rocksdb/IDENTITY", f"output/full/annonars/cons-grch38-{DV.ucsc_cons_38}+{PV.annonars}/rocksdb/IDENTITY", + # ----- features + f"output/full/annonars/clingen-dosage-grch37/{DV.today}/clingen_region_curation_list.bed.gz", + f"output/full/annonars/clingen-dosage-grch38/{DV.today}/clingen_region_curation_list.bed.gz", # ----- genes f"output/full/annonars/genes-{DV.acmg_sf}+{DV.gnomad_constraints}+{DV.dbnsfp}+{DV.hpo}+{DV.orphapacket}+{DV.today}+{PV.annonars}/rocksdb/IDENTITY", # -- worker data @@ -339,6 +343,7 @@ include: "rules/work/genes/shet.smk" include: "rules/work/reference/human.smk" # Features (position and not variant specific). 
include: "rules/work/annos/features/cons.smk" +include: "rules/work/annos/features/clingen_dosage.smk" include: "rules/work/annos/features/ensembl.smk" include: "rules/work/annos/features/refseq.smk" include: "rules/work/annos/features/tads.smk" @@ -363,7 +368,7 @@ include: "rules/work/annos/strucvars/clinvar.smk" include: "rules/output/mehari/freqs.smk" # ---- viguno include: "rules/output/viguno/hpo.smk" -# ------ annonars +# ---- annonars include: "rules/output/annonars/cadd.smk" include: "rules/output/annonars/cons.smk" include: "rules/output/annonars/dbnsfp.smk" @@ -374,6 +379,8 @@ include: "rules/output/annonars/gnomad_genomes.smk" include: "rules/output/annonars/gnomad_mtdna.smk" include: "rules/output/annonars/helix.smk" include: "rules/output/annonars/genes.smk" +# ------ features +include: "rules/output/annonars/clingen_dosage.smk" # ---- worker include: "rules/output/worker/patho_mms.smk" include: "rules/output/worker/clinvar.smk" diff --git a/download_urls.yml b/download_urls.yml index 541e3cc..a200799 100644 --- a/download_urls.yml +++ b/download_urls.yml @@ -1,3 +1,6 @@ +- url: ftp://ftp.clinicalgenome.org/ClinGen_region_curation_list_GRCh37.tsv +- url: ftp://ftp.clinicalgenome.org/ClinGen_region_curation_list_GRCh38.tsv + - url: https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt excerpt_strategy: strategy: no-excerpt diff --git a/excerpt-data/796c631dc892eda6/ClinGen_region_curation_list_GRCh38.tsv b/excerpt-data/796c631dc892eda6/ClinGen_region_curation_list_GRCh38.tsv new file mode 100644 index 0000000..f24e604 --- /dev/null +++ b/excerpt-data/796c631dc892eda6/ClinGen_region_curation_list_GRCh38.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f31a9b57fa53dc4158bd91a5c998cbaed38fef695d5f146d73bc9b5ca2787412 +size 18834 diff --git a/excerpt-data/796c631dc892eda6/url.txt b/excerpt-data/796c631dc892eda6/url.txt new file mode 100644 index 0000000..07f853b --- 
/dev/null +++ b/excerpt-data/796c631dc892eda6/url.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74891f7b36b2b5162805ce07c2e356cbe4a8f3339c60a53fe3b7625a13ffbae5 +size 69 diff --git a/excerpt-data/e424cce724cdc500/ClinGen_region_curation_list_GRCh37.tsv b/excerpt-data/e424cce724cdc500/ClinGen_region_curation_list_GRCh37.tsv new file mode 100644 index 0000000..62fb83b --- /dev/null +++ b/excerpt-data/e424cce724cdc500/ClinGen_region_curation_list_GRCh37.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ba58ad6c8581290680a14c053d7d710eca26eea2e65e784ba3725ca9f2fa9e0 +size 18876 diff --git a/excerpt-data/e424cce724cdc500/url.txt b/excerpt-data/e424cce724cdc500/url.txt new file mode 100644 index 0000000..08a3bf7 --- /dev/null +++ b/excerpt-data/e424cce724cdc500/url.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f333e333e4b40baa09669aaee6c340107415fd83ada5d3076b89eb7452e1fcb2 +size 69 diff --git a/rules/output/annonars/clingen_dosage.smk b/rules/output/annonars/clingen_dosage.smk new file mode 100644 index 0000000..70c0951 --- /dev/null +++ b/rules/output/annonars/clingen_dosage.smk @@ -0,0 +1,48 @@ +## Output rules related to ClinGen dosage sensitivity regions. 
+
+
+rule annos_features_clingen_dosage_download_to_bed:  # -- convert ClinGen dosage sensitivity TSV to bgzip-ed, tabix-indexed BED
+    input:
+        tsv="work/download/annos/{genome_release}/features/clingen_dosage/{date}/clingen_region_curation_list.tsv",
+    output:
+        bed="output/full/annonars/clingen-dosage-{genome_release}/{date}/clingen_region_curation_list.bed.gz",
+        bed_md5="output/full/annonars/clingen-dosage-{genome_release}/{date}/clingen_region_curation_list.bed.gz.md5",
+        bed_tbi="output/full/annonars/clingen-dosage-{genome_release}/{date}/clingen_region_curation_list.bed.gz.tbi",
+        bed_tbi_md5="output/full/annonars/clingen-dosage-{genome_release}/{date}/clingen_region_curation_list.bed.gz.tbi.md5",
+        spec_yaml="output/full/annonars/clingen-dosage-{genome_release}/{date}/clingen_region_curation_list.spec.yaml",
+    shell:
+        r"""
+        if [[ "{wildcards.genome_release}" == "grch37" ]]; then
+            chr_prefix=  # grch37 rows are written without a chr prefix
+        else
+            chr_prefix=chr
+        fi
+        # Drop the first 7 lines (presumably the file header), then convert, sort and compress.
+        tail -n +8 {input.tsv} \
+        | awk -v chr_prefix="$chr_prefix" -F $'\t' 'BEGIN {{ OFS=FS }}
+            {{
+                if ($4 == "tbd") {{
+                    next;  # skip, unmatched region
+                }}
+
+                # $4 is chr:start-end; print contig, start - 1, end, then the full original row
+                split($4, a, /[:-]/);
+                sub(/^chr/, "", a[1]);
+                print chr_prefix a[1], a[2] - 1, a[3], $0;
+            }}' \
+        | LC_ALL=C sort -k1,1V -k2,2n \
+        | bgzip -c \
+        > {output.bed}
+        tabix -f -p bed {output.bed}
+
+        md5sum {output.bed} > {output.bed_md5}
+        md5sum {output.bed_tbi} > {output.bed_tbi_md5}
+
+        varfish-db-downloader tpl \
+            --template rules/output/annonars/clingen_dosage.spec.yaml \
+            --value today={wildcards.date} \
+            --value genome_release={wildcards.genome_release} \
+            --value version={wildcards.date} \
+            --value v_downloader={PV.downloader} \
+        > {output.spec_yaml}
+        """
diff --git a/rules/output/annonars/clingen_dosage.spec.yaml b/rules/output/annonars/clingen_dosage.spec.yaml
new file mode 100644
index 0000000..482d1c6
--- /dev/null
+++ b/rules/output/annonars/clingen_dosage.spec.yaml
@@ -0,0 +1,13 @@
+dc.identifier: annonars/features/clingen-dosage:{{ version }}-{{ genome_release }}
+dc.title: ClinGen Dosage Sensitivity
+dc.creator: NCBI ClinGen Team
+dc.contributor:
+  - VarFish Developer Teams
+dc.format: application/x-bed
+dc.date: {{ today }}
+x-version: {{ today }}
+x-genome-release: {{ genome_release }}
+dc.description: |
+  BED file with ClinGen dosage sensitivity regions.
+dc.source:
+  - https://search.clinicalgenome.org/kb/downloads#section_dosage
diff --git a/rules/work/annos/features/clingen_dosage.smk b/rules/work/annos/features/clingen_dosage.smk
new file mode 100644
index 0000000..6e2addd
--- /dev/null
+++ b/rules/work/annos/features/clingen_dosage.smk
@@ -0,0 +1,26 @@
+## Work rules related to ClinGen dosage sensitivity regions.
+
+
+rule annos_features_clingen_dosage_download:  # -- download the ClinGen region curation list TSV (dosage sensitivity) for one genome release
+    output:  # both paths are parameterized by the {genome_release} and {date} wildcards
+        tsv="work/download/annos/{genome_release}/features/clingen_dosage/{date}/clingen_region_curation_list.tsv",  # file written by wget below
+        tsv_md5="work/download/annos/{genome_release}/features/clingen_dosage/{date}/clingen_region_curation_list.tsv.md5",  # md5sum output for the TSV
+    shell:  # exits 1 unless {date} equals today (YYYYMMDD) or FORCE_TODAY renders as "True"; any release other than grch37 fetches the GRCh38 file
+        r"""
+        if [[ "$(date +%Y%m%d)" != "{wildcards.date}" ]] && [[ "{FORCE_TODAY}" != "True" ]]; then
+            >&2 echo "{wildcards.date} is not today"
+            exit 1
+        fi
+
+        if [[ "{wildcards.genome_release}" == "grch37" ]]; then
+            URL_RELEASE=GRCh37
+        else
+            URL_RELEASE=GRCh38
+        fi
+
+        wget --no-check-certificate \
+            -O {output.tsv} \
+            ftp://ftp.clinicalgenome.org/ClinGen_region_curation_list_${{URL_RELEASE}}.tsv
+
+        md5sum {output.tsv} > {output.tsv_md5}
+        """