Skip to content

Commit

Permalink
feat: build annonars regions (clingen dosage) (#67)
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe authored Nov 22, 2023
1 parent 3fd72dd commit 664295d
Show file tree
Hide file tree
Showing 11 changed files with 183 additions and 0 deletions.
7 changes: 7 additions & 0 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,11 @@ rule all:
f"output/full/annonars/gnomad-sv-exomes-grch38-{DV.gnomad_cnv4}+{PV.annonars}/rocksdb/IDENTITY",
f"output/full/annonars/gnomad-sv-genomes-grch37-{DV.gnomad_sv}+{PV.annonars}/rocksdb/IDENTITY",
f"output/full/annonars/gnomad-sv-genomes-grch38-{DV.gnomad_sv4}+{PV.annonars}/rocksdb/IDENTITY",
# ----- sequence annotation
f"output/full/annonars/functional-grch37-{DV.refseq_fe_37}+{PV.annonars}/rocksdb/IDENTITY",
f"output/full/annonars/functional-grch38-{DV.refseq_fe_38}+{PV.annonars}/rocksdb/IDENTITY",
f"output/full/annonars/regions-grch37-{DV.today}+{PV.annonars}/rocksdb/IDENTITY",
f"output/full/annonars/regions-grch38-{DV.today}+{PV.annonars}/rocksdb/IDENTITY",
# ----- conservation
f"output/full/annonars/cons-grch37-{DV.ucsc_cons_37}+{PV.annonars}/rocksdb/IDENTITY",
f"output/full/annonars/cons-grch38-{DV.ucsc_cons_38}+{PV.annonars}/rocksdb/IDENTITY",
Expand Down Expand Up @@ -386,6 +391,8 @@ include: "rules/output/annonars/gnomad_mtdna.smk"
include: "rules/output/annonars/gnomad_sv.smk"
include: "rules/output/annonars/helix.smk"
include: "rules/output/annonars/genes.smk"
include: "rules/output/annonars/functional.smk"
include: "rules/output/annonars/regions.smk"
# ---- worker
include: "rules/output/worker/patho_mms.smk"
include: "rules/output/worker/clinvar.smk"
Expand Down
9 changes: 9 additions & 0 deletions download_urls.yml
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
- url: https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/105.20201022/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.gff.gz
excerpt_strategy:
strategy: gz-head
count: 1000
- url: https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/110/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.gff.gz
excerpt_strategy:
strategy: gz-head
count: 1000

- url: https://storage.googleapis.com/gcp-public-data--gnomad/legacy/exac_browser/ExAC.r1.sites.vep.vcf.gz
excerpt_strategy:
strategy: gz-head
Expand Down
Git LFS file not shown
3 changes: 3 additions & 0 deletions excerpt-data/98935d27cc8f0dc0/url.txt
Git LFS file not shown
Git LFS file not shown
3 changes: 3 additions & 0 deletions excerpt-data/f0ed4b0862f1b46b/url.txt
Git LFS file not shown
67 changes: 67 additions & 0 deletions rules/output/annonars/functional.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
## Rules to build the annonars functional annotation database.


# Fetch the RefSeq genomic GFF for one annotation release from the NCBI FTP
# server.  The `version` and `assembly` wildcards are taken from the requested
# output path and are substituted verbatim into the download URL.
rule work_annonars_functional_download_37:  # -- download functional data for GRCh37
    output:
        "work/download/refseq/grch37/{version}/{assembly}_genomic.gff.gz",
    shell:
        r"""
        URL=https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/{wildcards.version}/{wildcards.assembly}/{wildcards.assembly}_genomic.gff.gz
        wget -O {output} "$URL"
        """


# Fetch the RefSeq genomic GFF for one annotation release from the NCBI FTP
# server into the GRCh38 download directory.  The `version` and `assembly`
# wildcards come from the requested output path and are substituted verbatim
# into the download URL.
rule work_annonars_functional_download_38: # -- download functional data for GRCh38
output:
"work/download/refseq/grch38/{version}/{assembly}_genomic.gff.gz",
shell:
r"""
wget -O {output} \
https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/{wildcards.version}/{wildcards.assembly}/{wildcards.assembly}_genomic.gff.gz
"""


def output_annonars_functional_input(wildcards):
    """Return the path of the downloaded RefSeq GFF file for the requested release.

    ``wildcards.genome_release == "grch37"`` selects the GRCh37.p13 file; every
    other value falls through to the GRCh38.p14 file.
    """
    wants_grch37 = wildcards.genome_release == "grch37"
    if not wants_grch37:
        return f"work/download/refseq/grch38/{DV.refseq_fe_38}/GCF_000001405.40_GRCh38.p14_genomic.gff.gz"
    return f"work/download/refseq/grch37/{DV.refseq_fe_37}/GCF_000001405.25_GRCh37.p13_genomic.gff.gz"


# Build the annonars "functional" RocksDB from the RefSeq GFF download and
# write the accompanying spec.yaml.
#
# Steps performed by the shell block:
#   1. Create a private temporary directory that is removed again on exit.
#   2. Keep only GFF header lines (starting with '#') and lines mentioning
#      "RefSeqFE" from the gzipped input.
#   3. Import the filtered GFF with `annonars functional import` into the
#      directory that contains the IDENTITY output file.
#   4. Render functional.spec.yaml.  The template references `genome_release`,
#      so that value is passed explicitly alongside the version values.
rule output_annonars_functional:  # -- build annonars functional RocksDB file
    input:
        output_annonars_functional_input,
    output:
        rocksdb_identity=(
            "output/full/annonars/functional-{genome_release}-{v_refseq}+{v_annonars}/"
            "rocksdb/IDENTITY"
        ),
        spec_yaml=(
            "output/full/annonars/functional-{genome_release}-{v_refseq}+{v_annonars}/spec.yaml"
        ),
    wildcard_constraints:
        v_refseq=RE_VERSION,
        v_annonars=RE_VERSION,
    shell:
        r"""
        export TMPDIR=$(mktemp -d)
        trap "rm -rf $TMPDIR" EXIT

        zgrep '^#\|RefSeqFE' {input} > $TMPDIR/tmp.gff

        annonars functional import -vvv \
            --genome-release {wildcards.genome_release} \
            --path-in-gff $TMPDIR/tmp.gff \
            --path-out-rocksdb $(dirname {output.rocksdb_identity})

        varfish-db-downloader tpl \
            --template rules/output/annonars/functional.spec.yaml \
            --value today={TODAY} \
            --value genome_release={wildcards.genome_release} \
            \
            --value version={wildcards.v_refseq}+{wildcards.v_annonars} \
            --value v_refseq={wildcards.v_refseq} \
            \
            --value v_annonars={wildcards.v_annonars} \
            --value v_downloader={PV.downloader} \
        > {output.spec_yaml}
        """
16 changes: 16 additions & 0 deletions rules/output/annonars/functional.spec.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
dc.identifier: annonars/functional:{{ version }}-{{ genome_release }}
dc.title: annonars functional elements RocksDB
dc.creator: VarFish Developer Teams
dc.format: application/x-rocksdb
dc.date: {{ today }}
x-version: {{ version }}
x-genome-release: {{ genome_release }}
dc.description: |
RocksDB built from RefSeq Functional Elements (and other sources in
the future).
dc.source:
- PMID:34876495
- https://www.ncbi.nlm.nih.gov/refseq/
x-created-from:
- name: RefSeq Functional Elements
version: {{ v_refseq }}
52 changes: 52 additions & 0 deletions rules/output/annonars/regions.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
## Rules to build the annonars regions annotation database.


# Download the ClinGen region curation list for one genome release.
#
# The lower-case `genome_release` wildcard is mapped to the spelling used in
# the ClinGen file name: "grch37" selects the GRCh37 list, any other value
# selects the GRCh38 list.  The `today` wildcard only appears in the output
# path; it is not used by the download command.
rule work_annonars_regions_download:  # -- download clingen regions
    output:
        "work/download/clingen/{genome_release}/{today}/ClinGen_region_curation_list_{genome_release}.tsv",
    shell:
        r"""
        if [[ "{wildcards.genome_release}" == "grch37" ]]; then
            GENOME=GRCh37
        else
            GENOME=GRCh38
        fi

        wget -O {output} \
            ftp://ftp.clinicalgenome.org/ClinGen_region_curation_list_$GENOME.tsv
        """


# Build the annonars "regions" RocksDB from the ClinGen region curation list
# and write the accompanying spec.yaml.
#
# Steps performed by the shell block:
#   1. Abort unless the `date` wildcard equals the current date (YYYYMMDD) or
#      FORCE_TODAY is "True".
#   2. Import the ClinGen TSV with `annonars regions import` into the directory
#      that contains the IDENTITY output file.
#   3. Render regions.spec.yaml.  The template references `genome_release`, so
#      that value is passed explicitly alongside the version values.
#
# Only `v_annonars` is constrained here; this rule has no `v_refseq` wildcard.
rule output_annonars_regions:  # -- build annonars regions RocksDB file
    input:
        "work/download/clingen/{genome_release}/{date}/ClinGen_region_curation_list_{genome_release}.tsv",
    output:
        rocksdb_identity=(
            "output/full/annonars/regions-{genome_release}-{date}+{v_annonars}/"
            "rocksdb/IDENTITY"
        ),
        spec_yaml=("output/full/annonars/regions-{genome_release}-{date}+{v_annonars}/spec.yaml"),
    wildcard_constraints:
        v_annonars=RE_VERSION,
    shell:
        r"""
        if [[ "$(date +%Y%m%d)" != "{wildcards.date}" ]] && [[ "{FORCE_TODAY}" != "True" ]]; then
            >&2 echo "{wildcards.date} is not today"
            exit 1
        fi

        annonars regions import -vvv \
            --genome-release {wildcards.genome_release} \
            --path-in-clingen {input} \
            --path-out-rocksdb $(dirname {output.rocksdb_identity})

        varfish-db-downloader tpl \
            --template rules/output/annonars/regions.spec.yaml \
            --value today={TODAY} \
            --value genome_release={wildcards.genome_release} \
            \
            --value version={wildcards.date}+{wildcards.v_annonars} \
            \
            --value v_annonars={wildcards.v_annonars} \
            --value v_downloader={PV.downloader} \
        > {output.spec_yaml}
        """
14 changes: 14 additions & 0 deletions rules/output/annonars/regions.spec.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
dc.identifier: annonars/regions:{{ version }}-{{ genome_release }}
dc.title: annonars regions annotation RocksDB
dc.creator: VarFish Developer Teams
dc.format: application/x-rocksdb
dc.date: {{ today }}
x-version: {{ version }}
x-genome-release: {{ genome_release }}
dc.description: |
RocksDB with region annotation.
dc.source:
- https://search.clinicalgenome.org/kb/gene-dosage
x-created-from:
- name: ClinGen Region Dosage Pathogenicity
version: {{ today }}
6 changes: 6 additions & 0 deletions varfish_db_downloader/versions.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,10 @@ class DataVersions:
#: Marker file for the tracks version. This allows us to update the
#: tracks BED files later on.
tracks: str
#: RefSeq functional elements for GRCh37.
refseq_fe_37: str
#: RefSeq functional elements for GRCh38.
refseq_fe_38: str


#: The data versions to use.
Expand Down Expand Up @@ -158,6 +162,8 @@ class DataVersions:
clinvar_release=CLINVAR_RELEASE,
clinvar_version=CLINVAR_VERSION,
tracks="0",
refseq_fe_37="105.20201022",
refseq_fe_38="110",
)


Expand Down

0 comments on commit 664295d

Please sign in to comment.