From 0797f2dcfd3e0d639c6c0648b65ac326df8c81d7 Mon Sep 17 00:00:00 2001 From: Manuel Holtgrewe Date: Wed, 22 Nov 2023 08:45:17 +0100 Subject: [PATCH] feat: updating annonars genes with ClinGen dosage & DOMINO (#65) --- Snakefile | 13 ++--- bundled-data/domino/get.sh | 3 ++ .../domino/score_all_final_19.02.19.txt | 3 ++ download_urls.yml | 5 +- environment.yml | 2 +- excerpt-data/0a27656c7f2ba08a/all | 3 -- excerpt-data/0a27656c7f2ba08a/url.txt | 3 -- .../ClinGen_gene_curation_list_GRCh37.tsv | 3 ++ excerpt-data/0e7eb7069eb4d354/url.txt | 3 ++ .../ClinGen_gene_curation_list_GRCh38.tsv | 3 ++ excerpt-data/cb35d21dd121ab4b/url.txt | 3 ++ rules/output/annonars/clingen_dosage.smk | 48 ------------------- .../output/annonars/clingen_dosage.spec.yaml | 13 ----- rules/output/annonars/genes.smk | 15 +++--- rules/output/annonars/genes.spec.yaml | 7 ++- rules/work/annos/features/clingen_dosage.smk | 26 ---------- rules/work/genes/clingen.smk | 15 +++--- rules/work/genes/domino.smk | 16 +++++++ 18 files changed, 62 insertions(+), 122 deletions(-) create mode 100644 bundled-data/domino/get.sh create mode 100644 bundled-data/domino/score_all_final_19.02.19.txt delete mode 100644 excerpt-data/0a27656c7f2ba08a/all delete mode 100644 excerpt-data/0a27656c7f2ba08a/url.txt create mode 100644 excerpt-data/0e7eb7069eb4d354/ClinGen_gene_curation_list_GRCh37.tsv create mode 100644 excerpt-data/0e7eb7069eb4d354/url.txt create mode 100644 excerpt-data/cb35d21dd121ab4b/ClinGen_gene_curation_list_GRCh38.tsv create mode 100644 excerpt-data/cb35d21dd121ab4b/url.txt delete mode 100644 rules/output/annonars/clingen_dosage.smk delete mode 100644 rules/output/annonars/clingen_dosage.spec.yaml delete mode 100644 rules/work/annos/features/clingen_dosage.smk create mode 100644 rules/work/genes/domino.smk diff --git a/Snakefile b/Snakefile index df6b033..8f250d4 100644 --- a/Snakefile +++ b/Snakefile @@ -85,7 +85,6 @@ rule all: # == work directory ===================================================================== # # genes - f"work/download/genes/clingen/{DV.clingen_gene}/clingen.csv", f"work/download/genes/rcnv/2022/Collins_rCNV_2022.dosage_sensitivity_scores.tsv.gz", f"work/download/genes/orphapacket/{DV.orphapacket}/orphapacket.tar.gz", f"work/genes/dbnsfp/{DV.dbnsfp}/genes.tsv.gz", @@ -99,6 +98,9 @@ rule all: f"work/genes/orphapacket/{DV.orphapacket}+{DV.today}/orpha_diseases.tsv", "work/genes/rcnv/2022/rcnv_collins_2022.tsv", "work/genes/shet/2019/shet_weghorn_2019.tsv", + f"work/genes/clingen/{DV.today}/ClinGen_gene_curation_list_GRCh37.tsv", + f"work/genes/clingen/{DV.today}/ClinGen_gene_curation_list_GRCh38.tsv", + "work/genes/domino/20190219/domino.tsv", # reference-specific annotations # -- background/population sequence variants and annotations thereof # ---- GRCh37 @@ -132,7 +134,6 @@ rule all: f"work/annos/grch38/features/cons/{DV.ucsc_cons_38}/ucsc_conservation.tsv", f"work/annos/grch38/features/ensembl/{DV.ensembl_38}/ensembl_genes.bed.gz", f"work/annos/grch38/features/refseq/{DV.refseq_38}/refseq_genes.bed.gz", - # f"work/annos/grch38/features/clingen_dosage/{DV.today}/clingen_dosage_sensitivity_regions.bed.gz", # # == output directory =================================================================== # @@ -163,9 +164,6 @@ rule all: # ----- conservation f"output/full/annonars/cons-grch37-{DV.ucsc_cons_37}+{PV.annonars}/rocksdb/IDENTITY", f"output/full/annonars/cons-grch38-{DV.ucsc_cons_38}+{PV.annonars}/rocksdb/IDENTITY", - # ----- features - f"output/full/annonars/clingen-dosage-grch37/{DV.today}/clingen_region_curation_list.bed.gz", - f"output/full/annonars/clingen-dosage-grch38/{DV.today}/clingen_region_curation_list.bed.gz", # ----- genes f"output/full/annonars/genes-{DV.acmg_sf}+{DV.gnomad_constraints}+{DV.dbnsfp}+{DV.hpo}+{DV.orphapacket}+{DV.today}+{PV.annonars}/rocksdb/IDENTITY", # -- worker data @@ -341,11 +339,12 @@ include: "rules/work/genes/omim.smk" include: "rules/work/genes/orphapacket.smk" include: "rules/work/genes/rcnv.smk" include: "rules/work/genes/shet.smk" +include: "rules/work/genes/domino.smk" +include: "rules/work/genes/clingen.smk" # Reference sequence--related rules. include: "rules/work/reference/human.smk" # Features (position and not variant specific). include: "rules/work/annos/features/cons.smk" -include: "rules/work/annos/features/clingen_dosage.smk" include: "rules/work/annos/features/ensembl.smk" include: "rules/work/annos/features/refseq.smk" include: "rules/work/annos/features/tads.smk" @@ -381,8 +380,6 @@ include: "rules/output/annonars/gnomad_genomes.smk" include: "rules/output/annonars/gnomad_mtdna.smk" include: "rules/output/annonars/helix.smk" include: "rules/output/annonars/genes.smk" -# ------ features -include: "rules/output/annonars/clingen_dosage.smk" # ---- worker include: "rules/output/worker/patho_mms.smk" include: "rules/output/worker/clinvar.smk" diff --git a/bundled-data/domino/get.sh b/bundled-data/domino/get.sh new file mode 100644 index 0000000..bfb1d6b --- /dev/null +++ b/bundled-data/domino/get.sh @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48bd163d94781c41cb31a33bbad006f5233202758346037685fda4a018ed8a75 +size 108 diff --git a/bundled-data/domino/score_all_final_19.02.19.txt b/bundled-data/domino/score_all_final_19.02.19.txt new file mode 100644 index 0000000..3f1ef9f --- /dev/null +++ b/bundled-data/domino/score_all_final_19.02.19.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e8559bcc8078f0312d69981813a2dc6968664e0ce4392984e93e76e30c47200 +size 2379134 diff --git a/download_urls.yml b/download_urls.yml index 09180ae..07e8c50 100644 --- a/download_urls.yml +++ b/download_urls.yml @@ -2,6 +2,8 @@ - url: ftp://ftp.clinicalgenome.org/ClinGen_region_curation_list_GRCh37.tsv - url: ftp://ftp.clinicalgenome.org/ClinGen_region_curation_list_GRCh38.tsv +- url: ftp://ftp.clinicalgenome.org/ClinGen_gene_curation_list_GRCh37.tsv +- url: ftp://ftp.clinicalgenome.org/ClinGen_gene_curation_list_GRCh38.tsv - url: https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt excerpt_strategy: @@ -23,9 +25,6 @@ - comment: The curation activity summary report is built in real-time. url: https://search.clinicalgenome.org/kb/reports/curation-activity-summary-report -- comment: ClinGen variant summary is built in real-time - url: http://erepo.clinicalgenome.org/evrepo/api/classifications/all?format=tabbed - - url: https://github.com/bihealth/annonars-data-clinvar/releases/download/clinvar-weekly-20230625/clinvar-strucvar-grch37-2023-0625+0.6.3.tar.gz excerpt_strategy: strategy: no-excerpt diff --git a/environment.yml b/environment.yml index a9afb28..b588807 100644 --- a/environment.yml +++ b/environment.yml @@ -41,7 +41,7 @@ dependencies: # Parallel (de)compression. - pigz # Varfish related - - annonars =0.18.0 + - annonars =0.27.0 - viguno =0.1.6 - mehari =0.18.1 - varfish-server-worker =0.10.1 diff --git a/excerpt-data/0a27656c7f2ba08a/all b/excerpt-data/0a27656c7f2ba08a/all deleted file mode 100644 index a8084b9..0000000 --- a/excerpt-data/0a27656c7f2ba08a/all +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8dbeb7d6578a59b07d5296ba8f8d0f729f51dfd005cd7c3466a4c683e7a4d91c -size 156424 diff --git a/excerpt-data/0a27656c7f2ba08a/url.txt b/excerpt-data/0a27656c7f2ba08a/url.txt deleted file mode 100644 index 1a8f3a9..0000000 --- a/excerpt-data/0a27656c7f2ba08a/url.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cce44490e7ce19c99ae46d09281eca6645ca05c05bfe32c8ff87b88a25ca856c -size 77 diff --git a/excerpt-data/0e7eb7069eb4d354/ClinGen_gene_curation_list_GRCh37.tsv b/excerpt-data/0e7eb7069eb4d354/ClinGen_gene_curation_list_GRCh37.tsv new file mode 100644 index 0000000..b66c9d7 --- /dev/null +++ b/excerpt-data/0e7eb7069eb4d354/ClinGen_gene_curation_list_GRCh37.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e818e7ae751c36013377640bf29093a77d08a53eefd5adb6c3e769b982351dfc +size 15862 diff --git a/excerpt-data/0e7eb7069eb4d354/url.txt b/excerpt-data/0e7eb7069eb4d354/url.txt new file mode 100644 index 0000000..4852a6f --- /dev/null +++ b/excerpt-data/0e7eb7069eb4d354/url.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6cde2620cd50f150d2e0974eae1676e0ae4fe6691c86721a89fa5cab8c5f256 +size 67 diff --git a/excerpt-data/cb35d21dd121ab4b/ClinGen_gene_curation_list_GRCh38.tsv b/excerpt-data/cb35d21dd121ab4b/ClinGen_gene_curation_list_GRCh38.tsv new file mode 100644 index 0000000..110f271 --- /dev/null +++ b/excerpt-data/cb35d21dd121ab4b/ClinGen_gene_curation_list_GRCh38.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c401cc2cb6aeda93c1d1b633603ffb6ea15efc049694e18bdfd21894308e4f6 +size 15860 diff --git a/excerpt-data/cb35d21dd121ab4b/url.txt b/excerpt-data/cb35d21dd121ab4b/url.txt new file mode 100644 index 0000000..0b0ff4d --- /dev/null +++ b/excerpt-data/cb35d21dd121ab4b/url.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5507c499a3f71a87d1f2d8bd1b32331c399baf65b41f0e773b6ac80f92ed2d7d +size 67 diff --git a/rules/output/annonars/clingen_dosage.smk b/rules/output/annonars/clingen_dosage.smk deleted file mode 100644 index 70c0951..0000000 --- a/rules/output/annonars/clingen_dosage.smk +++ /dev/null @@ -1,48 +0,0 @@ -## Output rules related to ClinGen dosage sensitivity regions. - - -rule annos_features_clingen_dosage_download_to_bed: # -- convert ClinGen dosage sensitivity to BEd - input: - tsv="work/download/annos/{genome_release}/features/clingen_dosage/{date}/clingen_region_curation_list.tsv", - output: - bed="output/full/annonars/clingen-dosage-{genome_release}/{date}/clingen_region_curation_list.bed.gz", - bed_md5="output/full/annonars/clingen-dosage-{genome_release}/{date}/clingen_region_curation_list.bed.gz.md5", - bed_tbi="output/full/annonars/clingen-dosage-{genome_release}/{date}/clingen_region_curation_list.bed.gz.tbi", - bed_tbi_md5="output/full/annonars/clingen-dosage-{genome_release}/{date}/clingen_region_curation_list.bed.gz.tbi.md5", - spec_yaml="output/full/annonars/clingen-dosage-{genome_release}/{date}/clingen_region_curation_list.spec.yaml", - shell: - r""" - if [[ "{wildcards.genome_release}" == "grch37" ]]; then - chr_prefix= - else - chr_prefix=chr - fi - - tail -n +8 {input.tsv} \ - | awk -v chr_prefix=$chr_prefix -F $'\t' 'BEGIN {{ OFS=FS }} - {{ - if ($4 == "tbd") {{ - next; /* skip, unmatched region */ - }} - - region=$4; - split($4, a, /[:-]/); - sub(/^chr/, "", a[1]); - print chr_prefix a[1], a[2] - 1, a[3], $0; - }}' \ - | LC_ALL=C sort -k1,1V -k2,2n \ - | bgzip -c \ - > {output.bed} - tabix -f {output.bed} - - md5sum {output.bed} > {output.bed_md5} - md5sum {output.bed_tbi} > {output.bed_tbi_md5} - - varfish-db-downloader tpl \ - --template rules/output/annonars/clingen_dosage.spec.yaml \ - --value today={wildcards.date} \ - --value genome_release={wildcards.genome_release} \ - \ - --value v_downloader={PV.downloader} \ - > {output.spec_yaml} - """ diff --git a/rules/output/annonars/clingen_dosage.spec.yaml b/rules/output/annonars/clingen_dosage.spec.yaml deleted file mode 100644 index 482d1c6..0000000 --- a/rules/output/annonars/clingen_dosage.spec.yaml +++ /dev/null @@ -1,13 +0,0 @@ -dc.identifier: annonars/features/clingen-dosage:{{ version }}-{{ genome_release }} -dc.title: ClinGen Dosage Sensitivity -dc.creator: NCBI ClinGen Team -dc.contributor: - - VarFish Developer Teams -dc.format: application/x-bed -dc.date: {{ today }} -x-version: {{ today }} -x-genome-release: {{ genome_release }} -dc.description: | - BED file with ClinGen dosage sensitivity regions. -dc.source: - - https://search.clinicalgenome.org/kb/downloads#section_dosage diff --git a/rules/output/annonars/genes.smk b/rules/output/annonars/genes.smk index b4d5f57..729e9fd 100644 --- a/rules/output/annonars/genes.smk +++ b/rules/output/annonars/genes.smk @@ -4,7 +4,8 @@ rule output_annonars_genes: # -- build annonars genes RocksDB file input: acmg_sf="data/acmg_sf/{v_acmg_sf}/acmg_sf.tsv", - clingen="work/download/genes/clingen/{date}/clingen.csv", + clingen_37="work/genes/clingen/{date}/ClinGen_gene_curation_list_GRCh37.tsv", + clingen_38="work/genes/clingen/{date}/ClinGen_gene_curation_list_GRCh38.tsv", gnomad_constraints="work/genes/gnomad/{v_gnomad_constraints}/gnomad_constraints.tsv", dbnsfp="work/genes/dbnsfp/{v_dbnsfp}/genes.tsv.gz", hgnc="work/genes/hgnc/{date}/hgnc_info.jsonl", @@ -14,6 +15,7 @@ rule output_annonars_genes: # -- build annonars genes RocksDB file rcnv="work/genes/rcnv/2022/rcnv_collins_2022.tsv", shet="work/genes/shet/2019/shet_weghorn_2019.tsv", gtex="work/genes/annonars/gtex_v8/genes_tpm.jsonl.gz", + domino="work/genes/domino/20190219/domino.tsv", output: rocksdb_identity=( "output/full/annonars/genes-{v_acmg_sf}+{v_gnomad_constraints}+{v_dbnsfp}+{v_hpo}+{v_orpha}+{date}+{v_annonars}/" @@ -31,15 +33,11 @@ rule output_annonars_genes: # -- build annonars genes RocksDB file v_annonars=RE_VERSION, shell: r""" - export TMPDIR=$(mktemp -d) - trap "rm -rf $TMPDIR" EXIT - - tail -n +4 {input.clingen} > $TMPDIR/clingen.csv - annonars gene import \ --path-out-rocksdb $(dirname {output.rocksdb_identity}) \ --path-in-acmg {input.acmg_sf} \ - --path-in-clingen $TMPDIR/clingen.csv \ + --path-in-clingen-37 {input.clingen_37} \ + --path-in-clingen-38 {input.clingen_38} \ --path-in-gnomad-constraints {input.gnomad_constraints} \ --path-in-dbnsfp {input.dbnsfp} \ --path-in-hgnc {input.hgnc} \ @@ -48,7 +46,8 @@ rule output_annonars_genes: # -- build annonars genes RocksDB file --path-in-ncbi {input.ncbi} \ --path-in-rcnv {input.rcnv} \ --path-in-shet {input.shet} \ - --path-in-gtex {input.gtex} + --path-in-gtex {input.gtex} \ + --path-in-domino {input.domino} varfish-db-downloader tpl \ --template rules/output/annonars/genes.spec.yaml \ diff --git a/rules/output/annonars/genes.spec.yaml b/rules/output/annonars/genes.spec.yaml index 6a40472..e4cd860 100644 --- a/rules/output/annonars/genes.spec.yaml +++ b/rules/output/annonars/genes.spec.yaml @@ -13,16 +13,19 @@ dc.description: | - dbNSFP v{{ v_dbnsfp }} - NCBI gene database downloaded at {{ today }} - HGNC database downloaded at {{ today }} + - ClinGen Gene Dosage pathogenicity {{ today }} + - DOMINO data v2019-02-19 dc.source: - PMID:35802134 - PMID:32461654 - PMID:33261662 + - PMID:28985496 - https://www.ncbi.nlm.nih.gov/gene/ - https://www.genenames.org/ x-created-from: - name: ACMG SF Gene List version: {{ v_acmg_sf }} - - name: ClinGen Gene Curation + - name: ClinGen Gene Dosage pathog version: {{ today }} - name: gnomAD constraints version: {{ v_gnomad_constraints }} @@ -42,3 +45,5 @@ x-created-from: version: 2019-Weghorn-et-a. - name: GTex data version: v8 + - name: DOMINO data + version: "2019-02-19" diff --git a/rules/work/annos/features/clingen_dosage.smk b/rules/work/annos/features/clingen_dosage.smk deleted file mode 100644 index 6e2addd..0000000 --- a/rules/work/annos/features/clingen_dosage.smk +++ /dev/null @@ -1,26 +0,0 @@ -## Work rules related to ClinGen dosage sensitivity regions. - - -rule annos_features_clingen_dosage_download: # -- download ClinGen dosage sensitivity - output: - tsv="work/download/annos/{genome_release}/features/clingen_dosage/{date}/clingen_region_curation_list.tsv", - tsv_md5="work/download/annos/{genome_release}/features/clingen_dosage/{date}/clingen_region_curation_list.tsv.md5", - shell: - r""" - if [[ "$(date +%Y%m%d)" != "{wildcards.date}" ]] && [[ "{FORCE_TODAY}" != "True" ]]; then - >&2 echo "{wildcards.date} is not today" - exit 1 - fi - - if [[ "{wildcards.genome_release}" == "grch37" ]]; then - URL_RELEASE=GRCh37 - else - URL_RELEASE=GRCh38 - fi - - wget --no-check-certificate \ - -O {output.tsv} \ - ftp://ftp.clinicalgenome.org/ClinGen_region_curation_list_${{URL_RELEASE}}.tsv - - md5sum {output.tsv} > {output.tsv_md5} - """ diff --git a/rules/work/genes/clingen.smk b/rules/work/genes/clingen.smk index fa1e9ac..d075f43 100644 --- a/rules/work/genes/clingen.smk +++ b/rules/work/genes/clingen.smk @@ -1,10 +1,10 @@ -## Rules related to ClinGen curation download. +## Rules related to ClinGen gene dosage pathogenicity annotation. -rule genes_clingen_download: # -- download ClinGen curations +rule clingen_gene_download: # -- download files output: - csv="work/download/genes/clingen/{date}/clingen.csv", - csv_md5="work/download/genes/clingen/{date}/clingen.csv.md5", + tsv="work/genes/clingen/{date}/ClinGen_gene_curation_list_{genome_release}.tsv", + tsv_md5="work/genes/clingen/{date}/ClinGen_gene_curation_list_{genome_release}.tsv.md5", shell: r""" if [[ "$(date +%Y%m%d)" != "{wildcards.date}" ]] && [[ "{FORCE_TODAY}" != "True" ]]; then @@ -12,9 +12,8 @@ rule genes_clingen_download: # -- download ClinGen curations exit 1 fi - wget --no-check-certificate \ - -O {output.csv} \ - https://search.clinicalgenome.org/kb/reports/curation-activity-summary-report + wget -O {output.tsv} \ + ftp://ftp.clinicalgenome.org/ClinGen_gene_curation_list_{wildcards.genome_release}.tsv - md5sum {output.csv} > {output.csv_md5} + md5sum {output.tsv} > {output.tsv}.md5 """ diff --git a/rules/work/genes/domino.smk b/rules/work/genes/domino.smk new file mode 100644 index 0000000..0bce961 --- /dev/null +++ b/rules/work/genes/domino.smk @@ -0,0 +1,16 @@ +## Rules related to DOMINO gene annotation. + + +rule genes_domino: # -- postprocess file for import + input: + tsv="bundled-data/domino/score_all_final_19.02.19.txt", + output: + tsv="work/genes/domino/20190219/domino.tsv", + tsv_md5="work/genes/domino/20190219/domino.tsv.md5", + shell: + """ + cut -f 1-2 {input.tsv} \ + > {output.tsv} + + md5sum {output.tsv} > {output.tsv}.md5 + """