Skip to content

Commit

Permalink
feat: updating annonars genes with ClinGen dosage & DOMINO (#65)
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe authored Nov 22, 2023
1 parent bbcf2f2 commit 0797f2d
Show file tree
Hide file tree
Showing 18 changed files with 62 additions and 122 deletions.
13 changes: 5 additions & 8 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,6 @@ rule all:
# == work directory =====================================================================
#
# genes
f"work/download/genes/clingen/{DV.clingen_gene}/clingen.csv",
f"work/download/genes/rcnv/2022/Collins_rCNV_2022.dosage_sensitivity_scores.tsv.gz",
f"work/download/genes/orphapacket/{DV.orphapacket}/orphapacket.tar.gz",
f"work/genes/dbnsfp/{DV.dbnsfp}/genes.tsv.gz",
Expand All @@ -99,6 +98,9 @@ rule all:
f"work/genes/orphapacket/{DV.orphapacket}+{DV.today}/orpha_diseases.tsv",
"work/genes/rcnv/2022/rcnv_collins_2022.tsv",
"work/genes/shet/2019/shet_weghorn_2019.tsv",
f"work/genes/clingen/{DV.today}/ClinGen_gene_curation_list_GRCh37.tsv",
f"work/genes/clingen/{DV.today}/ClinGen_gene_curation_list_GRCh38.tsv",
"work/genes/domino/20190219/domino.tsv",
# reference-specific annotations
# -- background/population sequence variants and annotations thereof
# ---- GRCh37
Expand Down Expand Up @@ -132,7 +134,6 @@ rule all:
f"work/annos/grch38/features/cons/{DV.ucsc_cons_38}/ucsc_conservation.tsv",
f"work/annos/grch38/features/ensembl/{DV.ensembl_38}/ensembl_genes.bed.gz",
f"work/annos/grch38/features/refseq/{DV.refseq_38}/refseq_genes.bed.gz",
# f"work/annos/grch38/features/clingen_dosage/{DV.today}/clingen_dosage_sensitivity_regions.bed.gz",
#
# == output directory ===================================================================
#
Expand Down Expand Up @@ -163,9 +164,6 @@ rule all:
# ----- conservation
f"output/full/annonars/cons-grch37-{DV.ucsc_cons_37}+{PV.annonars}/rocksdb/IDENTITY",
f"output/full/annonars/cons-grch38-{DV.ucsc_cons_38}+{PV.annonars}/rocksdb/IDENTITY",
# ----- features
f"output/full/annonars/clingen-dosage-grch37/{DV.today}/clingen_region_curation_list.bed.gz",
f"output/full/annonars/clingen-dosage-grch38/{DV.today}/clingen_region_curation_list.bed.gz",
# ----- genes
f"output/full/annonars/genes-{DV.acmg_sf}+{DV.gnomad_constraints}+{DV.dbnsfp}+{DV.hpo}+{DV.orphapacket}+{DV.today}+{PV.annonars}/rocksdb/IDENTITY",
# -- worker data
Expand Down Expand Up @@ -341,11 +339,12 @@ include: "rules/work/genes/omim.smk"
include: "rules/work/genes/orphapacket.smk"
include: "rules/work/genes/rcnv.smk"
include: "rules/work/genes/shet.smk"
include: "rules/work/genes/domino.smk"
include: "rules/work/genes/clingen.smk"
# Reference sequence--related rules.
include: "rules/work/reference/human.smk"
# Features (position and not variant specific).
include: "rules/work/annos/features/cons.smk"
include: "rules/work/annos/features/clingen_dosage.smk"
include: "rules/work/annos/features/ensembl.smk"
include: "rules/work/annos/features/refseq.smk"
include: "rules/work/annos/features/tads.smk"
Expand Down Expand Up @@ -381,8 +380,6 @@ include: "rules/output/annonars/gnomad_genomes.smk"
include: "rules/output/annonars/gnomad_mtdna.smk"
include: "rules/output/annonars/helix.smk"
include: "rules/output/annonars/genes.smk"
# ------ features
include: "rules/output/annonars/clingen_dosage.smk"
# ---- worker
include: "rules/output/worker/patho_mms.smk"
include: "rules/output/worker/clinvar.smk"
Expand Down
3 changes: 3 additions & 0 deletions bundled-data/domino/get.sh
Git LFS file not shown
3 changes: 3 additions & 0 deletions bundled-data/domino/score_all_final_19.02.19.txt
Git LFS file not shown
5 changes: 2 additions & 3 deletions download_urls.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

- url: ftp://ftp.clinicalgenome.org/ClinGen_region_curation_list_GRCh37.tsv
- url: ftp://ftp.clinicalgenome.org/ClinGen_region_curation_list_GRCh38.tsv
- url: ftp://ftp.clinicalgenome.org/ClinGen_gene_curation_list_GRCh37.tsv
- url: ftp://ftp.clinicalgenome.org/ClinGen_gene_curation_list_GRCh38.tsv

- url: https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt
excerpt_strategy:
Expand All @@ -23,9 +25,6 @@
- comment: The curation activity summary report is built in real-time.
url: https://search.clinicalgenome.org/kb/reports/curation-activity-summary-report

- comment: ClinGen variant summary is built in real-time
url: http://erepo.clinicalgenome.org/evrepo/api/classifications/all?format=tabbed

- url: https://github.com/bihealth/annonars-data-clinvar/releases/download/clinvar-weekly-20230625/clinvar-strucvar-grch37-2023-0625+0.6.3.tar.gz
excerpt_strategy:
strategy: no-excerpt
Expand Down
2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ dependencies:
# Parallel (de)compression.
- pigz
# Varfish related
- annonars =0.18.0
- annonars =0.27.0
- viguno =0.1.6
- mehari =0.18.1
- varfish-server-worker =0.10.1
Expand Down
3 changes: 0 additions & 3 deletions excerpt-data/0a27656c7f2ba08a/all

This file was deleted.

3 changes: 0 additions & 3 deletions excerpt-data/0a27656c7f2ba08a/url.txt

This file was deleted.

Git LFS file not shown
3 changes: 3 additions & 0 deletions excerpt-data/0e7eb7069eb4d354/url.txt
Git LFS file not shown
Git LFS file not shown
3 changes: 3 additions & 0 deletions excerpt-data/cb35d21dd121ab4b/url.txt
Git LFS file not shown
48 changes: 0 additions & 48 deletions rules/output/annonars/clingen_dosage.smk

This file was deleted.

13 changes: 0 additions & 13 deletions rules/output/annonars/clingen_dosage.spec.yaml

This file was deleted.

15 changes: 7 additions & 8 deletions rules/output/annonars/genes.smk
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
rule output_annonars_genes: # -- build annonars genes RocksDB file
input:
acmg_sf="data/acmg_sf/{v_acmg_sf}/acmg_sf.tsv",
clingen="work/download/genes/clingen/{date}/clingen.csv",
clingen_37="work/genes/clingen/{date}/ClinGen_gene_curation_list_GRCh37.tsv",
clingen_38="work/genes/clingen/{date}/ClinGen_gene_curation_list_GRCh38.tsv",
gnomad_constraints="work/genes/gnomad/{v_gnomad_constraints}/gnomad_constraints.tsv",
dbnsfp="work/genes/dbnsfp/{v_dbnsfp}/genes.tsv.gz",
hgnc="work/genes/hgnc/{date}/hgnc_info.jsonl",
Expand All @@ -14,6 +15,7 @@ rule output_annonars_genes: # -- build annonars genes RocksDB file
rcnv="work/genes/rcnv/2022/rcnv_collins_2022.tsv",
shet="work/genes/shet/2019/shet_weghorn_2019.tsv",
gtex="work/genes/annonars/gtex_v8/genes_tpm.jsonl.gz",
domino="work/genes/domino/20190219/domino.tsv",
output:
rocksdb_identity=(
"output/full/annonars/genes-{v_acmg_sf}+{v_gnomad_constraints}+{v_dbnsfp}+{v_hpo}+{v_orpha}+{date}+{v_annonars}/"
Expand All @@ -31,15 +33,11 @@ rule output_annonars_genes: # -- build annonars genes RocksDB file
v_annonars=RE_VERSION,
shell:
r"""
export TMPDIR=$(mktemp -d)
trap "rm -rf $TMPDIR" EXIT
tail -n +4 {input.clingen} > $TMPDIR/clingen.csv
annonars gene import \
--path-out-rocksdb $(dirname {output.rocksdb_identity}) \
--path-in-acmg {input.acmg_sf} \
--path-in-clingen $TMPDIR/clingen.csv \
--path-in-clingen-37 {input.clingen_37} \
--path-in-clingen-38 {input.clingen_38} \
--path-in-gnomad-constraints {input.gnomad_constraints} \
--path-in-dbnsfp {input.dbnsfp} \
--path-in-hgnc {input.hgnc} \
Expand All @@ -48,7 +46,8 @@ rule output_annonars_genes: # -- build annonars genes RocksDB file
--path-in-ncbi {input.ncbi} \
--path-in-rcnv {input.rcnv} \
--path-in-shet {input.shet} \
--path-in-gtex {input.gtex}
--path-in-gtex {input.gtex} \
--path-in-domino {input.domino}
varfish-db-downloader tpl \
--template rules/output/annonars/genes.spec.yaml \
Expand Down
7 changes: 6 additions & 1 deletion rules/output/annonars/genes.spec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,19 @@ dc.description: |
- dbNSFP v{{ v_dbnsfp }}
- NCBI gene database downloaded at {{ today }}
- HGNC database downloaded at {{ today }}
- ClinGen Gene Dosage pathogenicity {{ today }}
- DOMINO data v2019-02-19
dc.source:
- PMID:35802134
- PMID:32461654
- PMID:33261662
- PMID:28985496
- https://www.ncbi.nlm.nih.gov/gene/
- https://www.genenames.org/
x-created-from:
- name: ACMG SF Gene List
version: {{ v_acmg_sf }}
- name: ClinGen Gene Curation
- name: ClinGen Gene Dosage pathogenicity
version: {{ today }}
- name: gnomAD constraints
version: {{ v_gnomad_constraints }}
Expand All @@ -42,3 +45,5 @@ x-created-from:
version: 2019-Weghorn-et-al.
- name: GTex data
version: v8
- name: DOMINO data
version: "2019-02-19"
26 changes: 0 additions & 26 deletions rules/work/annos/features/clingen_dosage.smk

This file was deleted.

15 changes: 7 additions & 8 deletions rules/work/genes/clingen.smk
Original file line number Diff line number Diff line change
@@ -1,20 +1,19 @@
## Rules related to ClinGen curation download.
## Rules related to ClinGen gene dosage pathogenicity annotation.


rule genes_clingen_download: # -- download ClinGen curations
rule clingen_gene_download: # -- download files
output:
csv="work/download/genes/clingen/{date}/clingen.csv",
csv_md5="work/download/genes/clingen/{date}/clingen.csv.md5",
tsv="work/genes/clingen/{date}/ClinGen_gene_curation_list_{genome_release}.tsv",
tsv_md5="work/genes/clingen/{date}/ClinGen_gene_curation_list_{genome_release}.tsv.md5",
shell:
r"""
if [[ "$(date +%Y%m%d)" != "{wildcards.date}" ]] && [[ "{FORCE_TODAY}" != "True" ]]; then
>&2 echo "{wildcards.date} is not today"
exit 1
fi
wget --no-check-certificate \
-O {output.csv} \
https://search.clinicalgenome.org/kb/reports/curation-activity-summary-report
wget -O {output.tsv} \
ftp://ftp.clinicalgenome.org/ClinGen_gene_curation_list_{wildcards.genome_release}.tsv
md5sum {output.csv} > {output.csv_md5}
md5sum {output.tsv} > {output.tsv}.md5
"""
16 changes: 16 additions & 0 deletions rules/work/genes/domino.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
## Rules related to DOMINO gene annotation.


rule genes_domino:  # -- postprocess file for import
    # Reduce the bundled DOMINO table to its first two tab-separated columns
    # and write an MD5 checksum file for the result.
    # NOTE(review): the meaning of the two retained columns is not visible
    # here -- confirm against the bundled file before relying on it.
    input:
        tsv="bundled-data/domino/score_all_final_19.02.19.txt",
    output:
        tsv="work/genes/domino/20190219/domino.tsv",
        tsv_md5="work/genes/domino/20190219/domino.tsv.md5",
    # Raw string so the trailing backslash reaches the shell unchanged; the
    # checksum target uses the declared `tsv_md5` output rather than a path
    # rebuilt from `tsv`, so declaration and command cannot drift apart.
    shell:
        r"""
        cut -f 1-2 {input.tsv} \
        > {output.tsv}

        md5sum {output.tsv} > {output.tsv_md5}
        """

0 comments on commit 0797f2d

Please sign in to comment.