From 871af29deca894b1b39ac9de9b43c594c7c6b62b Mon Sep 17 00:00:00 2001 From: Manuel Holtgrewe Date: Sun, 14 Jul 2024 16:39:36 +0200 Subject: [PATCH] fix: adjust to upstream data changes (#95) --- Snakefile | 3 -- download_urls.yml | 2 +- environment.yml | 6 ++-- .../1963f3c58ea066be/omim_unmapped_terms.tsv | 4 +-- excerpt-data/652646c24140df2a/mondo.obo | 4 +-- .../Homo_sapiens.GRCh38.109.gtf.gz | 3 -- excerpt-data/6efab26cca71a549/url.txt | 3 -- .../709832e39857a725/fixSeqLiftOverPsl.txt.gz | 4 +-- excerpt-data/8ee47118be15da10/current_README | 4 +-- excerpt-data/a9229376a47367b7/database | 4 +-- excerpt-data/a9dd799550b2e5ae/martservice | 4 +-- .../Homo_sapiens.GRCh38.112.gtf.gz | 3 ++ excerpt-data/e2748774e1b011f7/url.txt | 3 ++ rules/output/viguno/hpo.smk | 20 +------------ rules/reduced/annonars.smk | 2 ++ rules/reduced/hpo.smk | 23 -------------- rules/work/genes/ensembl.smk | 2 +- scripts/genes-integrate-diseases.py | 3 +- varfish_db_downloader/cli.py | 30 ++++++++++++++----- varfish_db_downloader/versions.py | 6 ++-- 20 files changed, 54 insertions(+), 79 deletions(-) delete mode 100644 excerpt-data/6efab26cca71a549/Homo_sapiens.GRCh38.109.gtf.gz delete mode 100644 excerpt-data/6efab26cca71a549/url.txt create mode 100644 excerpt-data/e2748774e1b011f7/Homo_sapiens.GRCh38.112.gtf.gz create mode 100644 excerpt-data/e2748774e1b011f7/url.txt diff --git a/Snakefile b/Snakefile index 81b6599..93b15b7 100644 --- a/Snakefile +++ b/Snakefile @@ -213,7 +213,6 @@ rule all: f"output/full/viguno/hpo-{DV.hpo}+{PV.viguno}/phenotype.hpoa", f"output/full/viguno/hpo-{DV.hpo}+{PV.viguno}/phenotype_to_genes.txt", f"output/full/viguno/hpo-{DV.hpo}+{PV.viguno}/hpo.bin", - f"output/full/viguno/hpo-{DV.hpo}+{PV.viguno}/scores-fun-sim-avg-resnik-gene/IDENTITY", # ----- background/population structural variants and annotations thereof f"output/full/tracks/track-strucvars-dbvar-grch37-{DV.dbvar}+{DV.tracks}/dbvar.bed.gz", f"output/full/tracks/track-strucvars-dbvar-grch38-{DV.dbvar}+{DV.tracks}/dbvar.bed.gz", @@ -275,7 +274,6 @@ rule all: f"output/reduced-dev/viguno/hpo-{DV.hpo}+{PV.viguno}/phenotype.hpoa", f"output/reduced-dev/viguno/hpo-{DV.hpo}+{PV.viguno}/phenotype_to_genes.txt", f"output/reduced-dev/viguno/hpo-{DV.hpo}+{PV.viguno}/hpo.bin", - f"output/reduced-dev/viguno/hpo-{DV.hpo}+{PV.viguno}/scores-fun-sim-avg-resnik-gene/IDENTITY", # -- annonars f"output/reduced-dev/annonars/cadd-grch37-{DV.cadd}+{PV.annonars}/rocksdb/IDENTITY", f"output/reduced-dev/annonars/cadd-grch38-{DV.cadd}+{PV.annonars}/rocksdb/IDENTITY", @@ -307,7 +305,6 @@ rule all: f"output/reduced-exomes/viguno/hpo-{DV.hpo}+{PV.viguno}/phenotype.hpoa", f"output/reduced-exomes/viguno/hpo-{DV.hpo}+{PV.viguno}/phenotype_to_genes.txt", f"output/reduced-exomes/viguno/hpo-{DV.hpo}+{PV.viguno}/hpo.bin", - f"output/reduced-exomes/viguno/hpo-{DV.hpo}+{PV.viguno}/scores-fun-sim-avg-resnik-gene/IDENTITY", # -- annonars f"output/reduced-exomes/annonars/cadd-grch37-{DV.cadd}+{PV.annonars}/rocksdb/IDENTITY", f"output/reduced-exomes/annonars/cadd-grch38-{DV.cadd}+{PV.annonars}/rocksdb/IDENTITY", diff --git a/download_urls.yml b/download_urls.yml index ae1e4b7..1fc9868 100644 --- a/download_urls.yml +++ b/download_urls.yml @@ -245,7 +245,7 @@ strategy: head count: 10000 -- url: https://ftp.ensembl.org/pub/release-109/gtf/homo_sapiens/Homo_sapiens.GRCh38.109.gtf.gz +- url: https://ftp.ensembl.org/pub/release-112/gtf/homo_sapiens/Homo_sapiens.GRCh38.112.gtf.gz excerpt_strategy: strategy: head count: 10000 diff --git a/environment.yml b/environment.yml index fc74e6d..4b735ce 100644 --- a/environment.yml +++ b/environment.yml @@ -43,9 +43,9 @@ dependencies: # Parallel (de)compression. - pigz # Varfish related - - annonars =0.34.0 - - viguno =0.2.0 - - mehari =0.21.1 + - annonars =0.39.0 + - viguno =0.3.1 + - mehari =0.25.5 - varfish-server-worker =0.12.0 # S3 uploads - s5cmd =2.1.0 diff --git a/excerpt-data/1963f3c58ea066be/omim_unmapped_terms.tsv b/excerpt-data/1963f3c58ea066be/omim_unmapped_terms.tsv index 94d4f19..f2accea 100644 --- a/excerpt-data/1963f3c58ea066be/omim_unmapped_terms.tsv +++ b/excerpt-data/1963f3c58ea066be/omim_unmapped_terms.tsv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:384eccb1d84e8a5036027cd340d2b4b18ce41170de28407f3022d7f3dd2392dc -size 4779 +oid sha256:963e27534cb75f40bc3cd3fbc684fa24d99355dfedb2c772935ce9e41960376c +size 251 diff --git a/excerpt-data/652646c24140df2a/mondo.obo b/excerpt-data/652646c24140df2a/mondo.obo index 9c120a6..649931e 100644 --- a/excerpt-data/652646c24140df2a/mondo.obo +++ b/excerpt-data/652646c24140df2a/mondo.obo @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2ce8ebbfdb594fe32335e6c26c6f10d0c085da5f47e74bf4fbef2a15690b6fe8 -size 33100807 +oid sha256:e6d7553588d6ceeed0cd391c8b9734153b56107ea510916deac8449ad34bc476 +size 46638894 diff --git a/excerpt-data/6efab26cca71a549/Homo_sapiens.GRCh38.109.gtf.gz b/excerpt-data/6efab26cca71a549/Homo_sapiens.GRCh38.109.gtf.gz deleted file mode 100644 index a72d343..0000000 --- a/excerpt-data/6efab26cca71a549/Homo_sapiens.GRCh38.109.gtf.gz +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0b8771fdda23e9e9807ce7f2e2e536241a8613b351b54059e37d58b4136d56b8 -size 9335 diff --git a/excerpt-data/6efab26cca71a549/url.txt b/excerpt-data/6efab26cca71a549/url.txt deleted file mode 100644 index 0987d21..0000000 --- a/excerpt-data/6efab26cca71a549/url.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:68bd848832cef612cb27e34bac3a96f85990ed3089d97bc35a41818180ef4342 -size 88 diff --git a/excerpt-data/709832e39857a725/fixSeqLiftOverPsl.txt.gz b/excerpt-data/709832e39857a725/fixSeqLiftOverPsl.txt.gz index b0f34c6..d2d8ce9 100644 --- a/excerpt-data/709832e39857a725/fixSeqLiftOverPsl.txt.gz +++ b/excerpt-data/709832e39857a725/fixSeqLiftOverPsl.txt.gz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5fb983d44360f05f4525a5b975278c6bd23a69e6db3bcf18018757a3e2867639 -size 10118 +oid sha256:349f73aa22fbbd666220f8a8519358d2b9eae347bfd0a61498291de8c3cc4fb9 +size 32691 diff --git a/excerpt-data/8ee47118be15da10/current_README b/excerpt-data/8ee47118be15da10/current_README index 4cb2a23..a551bf5 100644 --- a/excerpt-data/8ee47118be15da10/current_README +++ b/excerpt-data/8ee47118be15da10/current_README @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:19037015e82bd0967988248b0c20100b4e2b419ec6b90e0160f4716243aebc6e -size 6666 +oid sha256:ed4252cdad3ec9ea8d7dffd6e077669c1d0c0c8826c4b371391f5a4e742c08ec +size 1284 diff --git a/excerpt-data/a9229376a47367b7/database b/excerpt-data/a9229376a47367b7/database index 0e40ad8..b0b1d55 100644 --- a/excerpt-data/a9229376a47367b7/database +++ b/excerpt-data/a9229376a47367b7/database @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5f3e4e48da67e419e382e2d220a73074484806670014ad30d4312cc11f5c464e -size 4036689 +oid sha256:c26feb3077adccd7c4f9538325d2bf0d81493c17b678f6021a287620c6f0c097 +size 4050908 diff --git a/excerpt-data/a9dd799550b2e5ae/martservice b/excerpt-data/a9dd799550b2e5ae/martservice index 995d458..a567835 100644 --- a/excerpt-data/a9dd799550b2e5ae/martservice +++ b/excerpt-data/a9dd799550b2e5ae/martservice @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:339314ac1b42b7229ea1d8f296f8fcf24e4c668d97a692cfabf67e618b1c14ce -size 12377300 +oid sha256:38f89f947d4cf27c1d7e4c9a2cb8c4937ab2cd759169d9aa3812d6bb943c65a4 +size 12526093 diff --git a/excerpt-data/e2748774e1b011f7/Homo_sapiens.GRCh38.112.gtf.gz b/excerpt-data/e2748774e1b011f7/Homo_sapiens.GRCh38.112.gtf.gz new file mode 100644 index 0000000..d8cfee8 --- /dev/null +++ b/excerpt-data/e2748774e1b011f7/Homo_sapiens.GRCh38.112.gtf.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3492ecc26617023c27590d240f29ca92f4d27cbd9c784ad77c1e295a8fa936a5 +size 111959 diff --git a/excerpt-data/e2748774e1b011f7/url.txt b/excerpt-data/e2748774e1b011f7/url.txt new file mode 100644 index 0000000..4d910c4 --- /dev/null +++ b/excerpt-data/e2748774e1b011f7/url.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4e129868c8c179fa549d27f11aad25106a76447e36468bef201fd469359714f +size 88 diff --git a/rules/output/viguno/hpo.smk b/rules/output/viguno/hpo.smk index 9fdd150..5b0f87b 100644 --- a/rules/output/viguno/hpo.smk +++ b/rules/output/viguno/hpo.smk @@ -7,7 +7,7 @@ import os VIGUNO_SIMULATE_THREADS = int(os.environ.get("VIGUNO_SIMULATE_THREADS", 96)) -rule output_viguno_pheno: # -- copy HPO and simulate +rule output_viguno_pheno: # -- copy HPO input: obo="work/download/hpo/{v_hpo}/hp.obo", hpoa="work/download/hpo/{v_hpo}/phenotype.hpoa", @@ -16,7 +16,6 @@ rule output_viguno_pheno: # -- copy HPO and simulate obo="output/full/viguno/hpo-{v_hpo}+{v_viguno}/hp.obo", hpoa="output/full/viguno/hpo-{v_hpo}+{v_viguno}/phenotype.hpoa", phenotype_to_genes="output/full/viguno/hpo-{v_hpo}+{v_viguno}/phenotype_to_genes.txt", - rocksdb_identity="output/full/viguno/hpo-{v_hpo}+{v_viguno}/scores-fun-sim-avg-resnik-gene/IDENTITY", wildcard_constraints: v_hpo=RE_VERSION, v_viguno=RE_VERSION, @@ -31,22 +30,6 @@ rule output_viguno_pheno: # -- copy HPO and simulate awk -F $'\t' 'BEGIN {{ OFS=FS }} {{ print $3, $4, $1, $2, $6 }}' \ {input.genes_to_phenotype} \ > {output.phenotype_to_genes} - - viguno simulate \ - --ic-base gene \ - --similarity resnik \ - --combiner fun-sim-avg \ - --path-hpo-dir $(dirname {input.obo}) \ - --path-out-rocksdb $(dirname {output.rocksdb_identity}) \ - --min-terms 1 \ - $(if [[ "{RUNS_IN_CI}" == "True" ]]; then \ - echo --max-terms 1; \ - echo --num-simulations 10; \ - echo --only-gene ARID1B; \ - else \ - echo --max-terms 10; \ - fi) \ - --seed 42 """ @@ -55,7 +38,6 @@ rule global_hpo_to_bin: # -- convert to .bin obo="work/download/hpo/{v_hpo}/hp.obo", hpoa="work/download/hpo/{v_hpo}/phenotype.hpoa", genes_to_phenotype="work/download/hpo/{v_hpo}/phenotype_to_genes.txt", - rocksdb_identity="output/full/viguno/hpo-{v_hpo}+{v_viguno}/scores-fun-sim-avg-resnik-gene/IDENTITY", output: bin="output/full/viguno/hpo-{v_hpo}+{v_viguno}/hpo.bin", spec_yaml=("output/full/viguno/hpo-{v_hpo}+{v_viguno}/spec.yaml"), diff --git a/rules/reduced/annonars.smk b/rules/reduced/annonars.smk index cc3bf5c..0ad7134 100644 --- a/rules/reduced/annonars.smk +++ b/rules/reduced/annonars.smk @@ -39,6 +39,8 @@ rule subset_annonars: # -- create exomes subset shell: r""" annonars db-utils copy \ + --skip-cfs dbsnp_by_rsid \ + --skip-cfs clinvar_by_accession \ --path-in $(dirname {input.rocksdb_identity}) \ --path-out $(dirname {output.rocksdb_identity}) \ --path-beds {input.bed} diff --git a/rules/reduced/hpo.smk b/rules/reduced/hpo.smk index ea30413..b182c94 100644 --- a/rules/reduced/hpo.smk +++ b/rules/reduced/hpo.smk @@ -8,14 +8,12 @@ rule subset_viguno_pheno_exomes: # -- create exomes subset obo="output/full/viguno/hpo-{v_hpo}+{v_viguno}/hp.obo", hpoa="output/full/viguno/hpo-{v_hpo}+{v_viguno}/phenotype.hpoa", phenotype_to_genes="output/full/viguno/hpo-{v_hpo}+{v_viguno}/phenotype_to_genes.txt", - rocksdb_identity="output/full/viguno/hpo-{v_hpo}+{v_viguno}/scores-fun-sim-avg-resnik-gene/IDENTITY", bin="output/full/viguno/hpo-{v_hpo}+{v_viguno}/hpo.bin", spec_yaml="output/full/viguno/hpo-{v_hpo}+{v_viguno}/spec.yaml", output: obo="output/reduced-exomes/viguno/hpo-{v_hpo}+{v_viguno}/hp.obo", hpoa="output/reduced-exomes/viguno/hpo-{v_hpo}+{v_viguno}/phenotype.hpoa", phenotype_to_genes="output/reduced-exomes/viguno/hpo-{v_hpo}+{v_viguno}/phenotype_to_genes.txt", - rocksdb_identity="output/reduced-exomes/viguno/hpo-{v_hpo}+{v_viguno}/scores-fun-sim-avg-resnik-gene/IDENTITY", bin="output/reduced-exomes/viguno/hpo-{v_hpo}+{v_viguno}/hpo.bin", spec_yaml="output/reduced-exomes/viguno/hpo-{v_hpo}+{v_viguno}/spec.yaml", wildcard_constraints: @@ -28,8 +26,6 @@ rule subset_viguno_pheno_exomes: # -- create exomes subset cp -a {input.phenotype_to_genes} {output.phenotype_to_genes} cp -a {input.bin} {output.bin} cp -a {input.spec_yaml} {output.spec_yaml} - - cp -ar $(dirname {input.rocksdb_identity})/. $(dirname {output.rocksdb_identity})/. """ @@ -38,14 +34,12 @@ rule subset_worker_pheno_dev: # -- create development subset obo="output/full/viguno/hpo-{v_hpo}+{v_viguno}/hp.obo", hpoa="output/full/viguno/hpo-{v_hpo}+{v_viguno}/phenotype.hpoa", phenotype_to_genes="output/full/viguno/hpo-{v_hpo}+{v_viguno}/phenotype_to_genes.txt", - rocksdb_identity="output/full/viguno/hpo-{v_hpo}+{v_viguno}/scores-fun-sim-avg-resnik-gene/IDENTITY", bin="output/full/viguno/hpo-{v_hpo}+{v_viguno}/hpo.bin", spec_yaml="output/full/viguno/hpo-{v_hpo}+{v_viguno}/spec.yaml", output: obo="output/reduced-dev/viguno/hpo-{v_hpo}+{v_viguno}/hp.obo", hpoa="output/reduced-dev/viguno/hpo-{v_hpo}+{v_viguno}/phenotype.hpoa", phenotype_to_genes="output/reduced-dev/viguno/hpo-{v_hpo}+{v_viguno}/phenotype_to_genes.txt", - rocksdb_identity="output/reduced-dev/viguno/hpo-{v_hpo}+{v_viguno}/scores-fun-sim-avg-resnik-gene/IDENTITY", bin="output/reduced-dev/viguno/hpo-{v_hpo}+{v_viguno}/hpo.bin", spec_yaml="output/reduced-dev/viguno/hpo-{v_hpo}+{v_viguno}/spec.yaml", wildcard_constraints: @@ -61,21 +55,4 @@ rule subset_worker_pheno_dev: # -- create development subset cp -a {input.phenotype_to_genes} {output.phenotype_to_genes} cp -a {input.bin} {output.bin} cp -a {input.spec_yaml} {output.spec_yaml} - - viguno simulate \ - --ic-base gene \ - --similarity resnik \ - --combiner fun-sim-avg \ - --path-hpo-dir $(dirname {input.obo}) \ - --path-out-rocksdb $(dirname {output.rocksdb_identity}) \ - --min-terms 1 \ - $(if [[ "{RUNS_IN_CI}" == "True" ]]; then \ - echo --max-terms 1; \ - echo --num-simulations 10; \ - echo --only-gene ARID1B; \ - else \ - echo --max-terms 10; \ - echo --num-simulations 100; \ - fi) \ - --seed 42 """ diff --git a/rules/work/genes/ensembl.smk b/rules/work/genes/ensembl.smk index 40e4bf2..8560c67 100644 --- a/rules/work/genes/ensembl.smk +++ b/rules/work/genes/ensembl.smk @@ -13,7 +13,7 @@ rule genes_ensembl_create_xlink: # -- create ENSEMBL gene information xlink tab wget --no-check-certificate \ -O $TMPDIR/current_README \ https://ftp.ensembl.org/pub/current_README - grep "Ensembl Release {DV.ensembl} Databases" $TMPDIR/current_README \ + grep "The current release is Ensembl {DV.ensembl}" $TMPDIR/current_README \ || (echo "Ensembl version is not {DV.ensembl}." && exit 1) echo -e "ensembl_gene_id\tensembl_transcript_id\tentrez_id\tgene_symbol" \ diff --git a/scripts/genes-integrate-diseases.py b/scripts/genes-integrate-diseases.py index a848a11..da42e18 100644 --- a/scripts/genes-integrate-diseases.py +++ b/scripts/genes-integrate-diseases.py @@ -545,7 +545,8 @@ def parse_mondo_obo(path: str) -> List[MondoDisease]: relation=list(map(MondoDiseaseRelation, synonym.scope.split(" "))), ) ) - result.append(MondoDisease(mondo_id=term.id, name=term.name, synonyms=synonyms)) + if term.name: + result.append(MondoDisease(mondo_id=term.id, name=term.name, synonyms=synonyms)) return result diff --git a/varfish_db_downloader/cli.py b/varfish_db_downloader/cli.py index c5cadba..a4f7a34 100644 --- a/varfish_db_downloader/cli.py +++ b/varfish_db_downloader/cli.py @@ -7,6 +7,7 @@ import requests import requests_ftp from loguru import logger +from reretry import retry from varfish_db_downloader import __version__, wget @@ -138,6 +139,10 @@ def urls_download(urls, data_dir, urls_yaml, force): raise click.ClickException("URL discrepancy (see logs above)") +class UrlCheckFailed(Exception): + pass + + @wget_.command() @click.option("--urls-yaml", default="download_urls.yml") @click.argument("urls", nargs=-1) @@ -148,18 +153,29 @@ def urls_check_upstream(urls, urls_yaml): requests_ftp.monkeypatch_session() + def try_get_failed(e: UrlCheckFailed): + logger.info(" failed: {} (maybe retry)", e) + + @retry(tries=5, delay=1, backoff=2, logger=None, fail_callback=try_get_failed) + def try_get(session: requests.Session, url: str): + with s.get(entry.url, allow_redirects=True, stream=True) as r: + if r.ok: + r.close() + else: + raise UrlCheckFailed(str(r)) + error_count = 0 for entry in wget.load_urls_yaml(urls_yaml): s = requests.Session() if not entry.skip_upstream_check and (not urls or entry.url in urls): logger.info(" checking {}...", entry.url) - with s.get(entry.url, allow_redirects=True, stream=True) as r: - if r.ok: - logger.info(" => OK") - r.close() - else: - error_count += 1 - logger.warning(" NOT OK: {}", r) + try: + try_get(s, entry.url) + except UrlCheckFailed as e: + error_count += 1 + logger.warning(" NOT OK: {}", e) + else: + logger.info(" => OK") else: logger.info(" Skipping {}...", entry.url) diff --git a/varfish_db_downloader/versions.py b/varfish_db_downloader/versions.py index f88559c..389580b 100644 --- a/varfish_db_downloader/versions.py +++ b/varfish_db_downloader/versions.py @@ -127,8 +127,8 @@ class DataVersions: clingen_gene=TODAY, clingen_variant=TODAY, ensembl_37="87", - ensembl_38="109", - ensembl="111", + ensembl_38="112", + ensembl="112", today=TODAY, dbnsfp="4.5", dbscsnv="1.1", @@ -155,7 +155,7 @@ class DataVersions: ucsc_genomic_super_dups_38="20141019", ucsc_alt_seq_liftover_37="20200322", ucsc_alt_seq_liftover_38="20221103", - ucsc_fix_seq_liftover_37="20200524", + ucsc_fix_seq_liftover_37="20200609", ucsc_fix_seq_liftover_38="20221103", refseq_37="105", refseq_38="GCF_000001405.40+RS_2023_03",