diff --git a/workflows/host-genome-generation/Dockerfile b/workflows/host-genome-generation/Dockerfile index 63ff42161..dc56abecd 100644 --- a/workflows/host-genome-generation/Dockerfile +++ b/workflows/host-genome-generation/Dockerfile @@ -2,10 +2,33 @@ FROM ubuntu:22.04 ARG DEBIAN_FRONTEND=noninteractive -# this brings in python2.7 -RUN apt-get update && apt-get install -y bowtie2 curl minimap2 +RUN apt-get update && apt-get install -y wget bowtie2 curl minimap2 pigz dh-autoreconf nasm make git g++ unzip python3-pip + +# necessary for hisat2 +RUN ln -s /usr/bin/python3 /usr/bin/python # Install STAR, the package rna-star does not include STARlong RUN curl -L https://github.com/alexdobin/STAR/archive/2.5.3a.tar.gz | tar xz RUN mv STAR-2.5.3a/bin/Linux_x86_64_static/* /usr/local/bin RUN rm -rf STAR-2.5.3a + +# Install fastp (libdeflate libisal (dh-autoreconf nasm)) +WORKDIR /tmp +RUN wget -nv -O - https://github.com/intel/isa-l/archive/refs/tags/v2.30.0.tar.gz | tar zx +RUN cd isa-l-* && ./autogen.sh && ./configure && make -j8 && make install +RUN wget -nv -O - https://github.com/ebiggers/libdeflate/archive/refs/tags/v1.12.tar.gz | tar zx +RUN cd libdeflate-* && make -j8 && make install +RUN ldconfig +RUN git clone https://github.com/mlin/fastp.git && git -C fastp checkout 37edd60 +RUN cd fastp && make -j8 && ./fastp test && cp fastp /usr/local/bin + +# Install hisat2 +WORKDIR /hisat2 +RUN wget -nv -O /tmp/HISAT2.zip https://cloud.biohpc.swmed.edu/index.php/s/oTtGWbWjaxsQ2Ho/download \ + && unzip /tmp/HISAT2.zip && mv hisat2-*/* . 
&& rm /tmp/HISAT2.zip + +# Install kallisto + python gtfparse +RUN curl -L https://github.com/pachterlab/kallisto/releases/download/v0.46.1/kallisto_linux-v0.46.1.tar.gz | tar xz -C / +RUN pip3 install gtfparse==1.2.1 + +WORKDIR / \ No newline at end of file diff --git a/workflows/host-genome-generation/host_genome_generation.wdl b/workflows/host-genome-generation/host_genome_generation.wdl index 9398d208f..8faaa4a4c 100644 --- a/workflows/host-genome-generation/host_genome_generation.wdl +++ b/workflows/host-genome-generation/host_genome_generation.wdl @@ -1,150 +1,350 @@ version 1.1 -workflow index_generation { - input { - File input_fasta - File? input_gtf - String host_name - File ercc_fasta - File ercc_gtf - String docker_image_id - } - - call GenerateHostGenome { - input: - input_fasta = input_fasta, - input_gtf = input_gtf, - host_name = host_name, - ercc_fasta = ercc_fasta, - ercc_gtf = ercc_gtf, - docker_image_id = docker_image_id - } - - output { - File original_input_fasta = GenerateHostGenome.original_input_fasta - File? original_input_gtf = GenerateHostGenome.original_input_gtf - File fasta_with_ercc_fa = GenerateHostGenome.fasta_with_ercc_fa - File? gtf_with_ercc_gtf = GenerateHostGenome.gtf_with_ercc_gtf - File star_genome_tar = GenerateHostGenome.star_genome_tar - File bowtie_genome_tar = GenerateHostGenome.bowtie_genome_tar - File minimap2_dna = GenerateHostGenome.minimap2_dna - File minimap2_rna = GenerateHostGenome.minimap2_rna - } +# Build host genome indexes for host_filter.wdl (2022 version) +# - Bowtie2 (genome) +# - HISAT2 (genome + splice junctions) +# - kallisto (transcriptome) +# - minimap2 (not used in short-read-mngs host filtering, but rather in the ONT equivalent) +# - STAR (used in old version of short-read-mngs host filtering, kept temporarily so we can support both) +# ERCC sequences are spiked in to all of these indexes. Lastly takes an array of other spike-ins for +# the Bowtie2 and HISAT2 indexes. 
+# Warning: HISAT2 requires huge RAM to build the spliced index (>200G for human). +# But the index file size and aligner memory usage are relatively small. +workflow host_filter_indexing { + input { + String genome_name + + # host genomic DNA + File genome_fasta_gz + # host transcript models on the above genomic DNA (for HISAT2 spliced alignment) + File? transcripts_gtf_gz + # host transcript sequences (for kallisto) + Array[File] transcripts_fasta_gz = [] + + # ERCC sequences to spike in to the genome and transcript indexes + File ERCC_fasta_gz + File? ERCC_fasta_gtf + + # Additional FASTA file(s) to spike into the Bowtie2 & HISAT2 indexes (e.g. EBV, phiX) + # Sequence names must be unique among all FASTAs! + Array[File] other_fasta_gz = [] + + String docker_image_id + } + + call ensure_gz as genome_fasta { + # accommodate uncompressed genome_fasta_gz; this makes it more convenient to use some of our + # existing host genome FASTAs which we archived without compression. + input: + maybe_gz = genome_fasta_gz, + docker_image_id + } + + call concatenate_and_unzip_fastas { + input: + fasta_gz = flatten([[genome_fasta.gz, ERCC_fasta_gz], other_fasta_gz]), + docker_image_id, + } + + call bowtie2_build { + input: + fasta = concatenate_and_unzip_fastas.fasta, + genome_name, + docker_image_id, + } + + call hisat2_build { + input: + fasta = concatenate_and_unzip_fastas.fasta, + transcripts_gtf_gz, + genome_name, + docker_image_id, + } + + call kallisto_index { + input: + transcripts_fasta_gz = flatten([transcripts_fasta_gz, [ERCC_fasta_gz]]), + genome_name, + docker_image_id, + } + + call minimap2_index as minimap2_index_dna { + input: + fasta = concatenate_and_unzip_fastas.fasta, + nucleotide_type = "dna", + genome_name, + docker_image_id, + } + + call minimap2_index as minimap2_index_rna { + input: + fasta = concatenate_and_unzip_fastas.fasta, + nucleotide_type = "rna", + genome_name, + docker_image_id, + } + + call star_generate { + input: + fasta = 
concatenate_and_unzip_fastas.fasta, + ERCC_fasta_gtf, + transcripts_gtf_gz, + genome_name, + docker_image_id, + } + + output { + File bowtie2_index_tar = bowtie2_build.index_tar + File hisat2_index_tar = hisat2_build.index_tar + File kallisto_idx = kallisto_index.idx + File minimap2_dna_mmi = minimap2_index_dna.index_mmi + File minimap2_rna_mmi = minimap2_index_rna.index_mmi + File star_genome_tar = star_generate.star_genome_tar + + # also output the input files, to facilitate archival/provenance + File original_genome_fasta_gz = genome_fasta.gz + File? original_transcripts_gtf_gz = transcripts_gtf_gz + Array[File] original_transcripts_fasta_gz = transcripts_fasta_gz + File original_ERCC_fasta_gz = ERCC_fasta_gz + Array[File] original_other_fasta_gz = other_fasta_gz + } +} + +task ensure_gz { + input { + File maybe_gz + String docker_image_id + } + + String name = basename(maybe_gz) + + command <<< + set -euxo pipefail + mkdir ans + if gzip -t '~{maybe_gz}'; then + cp '~{maybe_gz}' ans/ + else + pigz -c -p 4 '~{maybe_gz}' > 'ans/~{name}.gz' + fi + >>> + + output { + File gz = glob("ans/*")[0] + } + + runtime { + docker: docker_image_id + cpu: 4 + memory: "4GiB" + } +} + +task concatenate_and_unzip_fastas { + input { + Array[File] fasta_gz + String docker_image_id + } + + command <<< + pigz -dc ~{sep(' ',fasta_gz)} > "all.fasta" + >>> + + output { + File fasta = "all.fasta" + } + + runtime { + docker: docker_image_id + cpu: 4 + memory: "4GiB" + } +} + +task bowtie2_build { + input { + File fasta + String genome_name + Int seed = 42 + + Int cpu = 16 + String docker_image_id + } + + command <<< + set -euxo pipefail + TMPDIR=${TMPDIR:-/tmp} + + mkdir -p "$TMPDIR"'/bt2/~{genome_name}' + >&2 bowtie2-build --seed ~{seed} --threads ~{cpu} "~{fasta}" "$TMPDIR"'/bt2/~{genome_name}/~{genome_name}' + >&2 ls -lR "$TMPDIR/bt2" + ln -r -s "$TMPDIR"'/bt2/~{genome_name}' "$TMPDIR"'/bt2/~{genome_name}.bowtie2' + env -C "$TMPDIR/bt2" tar c . 
> '~{genome_name}.bowtie2.tar' + >>> + + output { + File index_tar = "~{genome_name}.bowtie2.tar" + } + + runtime { + docker: docker_image_id + cpu: cpu + memory: "~{cpu*2}GiB" + } +} + +task hisat2_build { + input { + File fasta + File? transcripts_gtf_gz + String genome_name + + Int cpu = 32 + String docker_image_id + } + + command <<< + set -euxo pipefail + TMPDIR=${TMPDIR:-/tmp} + + mkdir -p "$TMPDIR"'/hisat2/~{genome_name}' + if [[ -n '~{transcripts_gtf_gz}' ]]; then + # convert GTF per http://daehwankimlab.github.io/hisat2/howto/ + /hisat2/hisat2_extract_splice_sites.py <(pigz -dc '~{transcripts_gtf_gz}') > "$TMPDIR/genome.ss" & pid=$! + /hisat2/hisat2_extract_exons.py <(pigz -dc '~{transcripts_gtf_gz}') > "$TMPDIR/genome.exon" + wait $pid + >&2 /hisat2/hisat2-build -p 16 \ + --exon "$TMPDIR/genome.exon" --ss "$TMPDIR/genome.ss" \ + "~{fasta}" "$TMPDIR"'/hisat2/~{genome_name}/~{genome_name}' + else + >&2 /hisat2/hisat2-build -p 16 "~{fasta}" "$TMPDIR"'/hisat2/~{genome_name}/~{genome_name}' + fi + >&2 ls -lR "$TMPDIR/hisat2" + env -C "$TMPDIR/hisat2" tar c . > '~{genome_name}.hisat2.tar' + >>> + + output { + File index_tar = "~{genome_name}.hisat2.tar" + } + + runtime { + docker: docker_image_id + cpu: cpu + memory: "240G" + } } -task GenerateHostGenome { - input { - File input_fasta - File? 
input_gtf - String host_name - File ercc_fasta - File ercc_gtf - String docker_image_id - } - - command <<< - set -euxo pipefail - - # - # Create fasta_with_ercc - # - - INPUT_FASTA_PATH="~{input_fasta}" - - # Download input fa - if [ ${INPUT_FASTA_PATH: -3} == ".gz" ] - then - gunzip -c $INPUT_FASTA_PATH > input.fa - INPUT_FASTA_PATH=input.fa - else - cp $INPUT_FASTA_PATH input.fa - INPUT_FASTA_PATH=input.fa - fi - - # Concatenate ercc and input - cat "~{ercc_fasta}" $INPUT_FASTA_PATH > fasta_with_ercc.fa - - # - # Create gtf_with_ercc - # - - INPUT_GTF_PATH="~{input_gtf}" - GTF_PATH="~{ercc_gtf}" - - # Download input gtf, if provided - if [[ -n "${INPUT_GTF_PATH}" ]] ; then - if [ ${INPUT_GTF_PATH: -3} == ".gz" ] - then - gunzip -c $INPUT_GTF_PATH > input.gtf - INPUT_GTF_PATH=input.gtf - else - cp $INPUT_GTF_PATH input.gtf - INPUT_GTF_PATH=input.gtf - fi - # Concatenate ercc and input - cat "~{ercc_gtf}" $INPUT_GTF_PATH > gtf_with_ercc.gtf - GTF_PATH=gtf_with_ercc.gtf - fi - - # - # Generate STAR genome - # - - # Make directory for STAR genome - STAR_GENOME="~{host_name}_STAR_genome" - # HACK: we used to support splitting star indexes into many parts, this made things slower - # Here we generate the index as if it is in many parts, but there is only ever one part for - # backwards compatibility - mkdir -p "$STAR_GENOME/part-0" - - AVAILABLE_MEMORY=$(free --bytes | head -n 2 | tail -n 1 | sed "s/ */ /g" | cut -d' ' -f 7) - - STAR \ - --sjdbGTFfile $GTF_PATH \ - --runThreadN $(nproc) \ - --runMode genomeGenerate \ - --genomeFastaFiles fasta_with_ercc.fa \ - --limitGenomeGenerateRAM $AVAILABLE_MEMORY \ - --genomeDir "$STAR_GENOME/part-0" - - # create a parts.txt file for backwards compatibility - echo 1 > "$STAR_GENOME/parts.txt" - - # tar STAR genome - tar cvf "$STAR_GENOME.tar" -C $(pwd) $STAR_GENOME - - # - # Generate bowtie2 genome - # - - # Make directory for bowtie2 genome - BOWTIE2_GENOME="~{host_name}_bowtie2_genome" - mkdir $BOWTIE2_GENOME - - # Change into 
the directory to contain the output and generate bowtie2 genome - cd $BOWTIE2_GENOME - bowtie2-build ../fasta_with_ercc.fa "~{host_name}" - cd .. - - # tar bowtie2 genome - tar cvf "$BOWTIE2_GENOME.tar" -C $(pwd) $BOWTIE2_GENOME - - minimap2 -x map-ont -d "~{host_name}_minimap2_genome_dna.mmi" fasta_with_ercc.fa - minimap2 -x splice -d "~{host_name}_minimap2_genome_rna.mmi" fasta_with_ercc.fa - >>> - - output { - File original_input_fasta = "input.fa" - File? original_input_gtf = "input.gtf" - File fasta_with_ercc_fa = "fasta_with_ercc.fa" - File? gtf_with_ercc_gtf = "gtf_with_ercc.gtf" - File star_genome_tar = "~{host_name}_STAR_genome.tar" - File bowtie_genome_tar = "~{host_name}_bowtie2_genome.tar" - File minimap2_dna = "~{host_name}_minimap2_genome_dna.mmi" - File minimap2_rna = "~{host_name}_minimap2_genome_rna.mmi" - } - - runtime { - docker: docker_image_id - } +task kallisto_index { + input { + Array[File] transcripts_fasta_gz + String genome_name + + String docker_image_id + } + + String idx_fn = "~{genome_name}.kallisto.idx" + command <<< + set -euxo pipefail + /kallisto/kallisto index --index '~{idx_fn}' ~{sep(' ',transcripts_fasta_gz)} + >&2 ls -l + >>> + + output { + File idx = idx_fn + } + + runtime { + docker: docker_image_id + memory: "16GiB" + } +} + +task minimap2_index { + input { + File fasta + String genome_name + String nucleotide_type + + String docker_image_id + } + + command <<< + set -euxo pipefail + TMPDIR=${TMPDIR:-/tmp} + + if [ "~{nucleotide_type}" == "dna" ]; then + >&2 minimap2 -x map-ont -d '~{genome_name}_~{nucleotide_type}.mmi' "~{fasta}" + else + >&2 minimap2 -x splice -d '~{genome_name}_~{nucleotide_type}.mmi' "~{fasta}" + fi + >&2 ls -l + >>> + + output { + File index_mmi = "~{genome_name}_~{nucleotide_type}.mmi" + } + + runtime { + docker: docker_image_id + memory: "32GiB" + } +} + +task star_generate { + input { + File fasta + File? ERCC_fasta_gtf + File? 
transcripts_gtf_gz + String genome_name + + + Int cpu = 32 + String docker_image_id + } + + command <<< + set -euxo pipefail + TMPDIR=${TMPDIR:-/tmp} + + gtf_flag="" + if [[ -n '~{transcripts_gtf_gz}' || -n '~{ERCC_fasta_gtf}' ]]; then + transcripts_gtf="$TMPDIR/transcripts.gtf" + gtf_flag="--sjdbGTFfile \"$transcripts_gtf\"" + if [[ -n '~{transcripts_gtf_gz}' ]]; then + pigz -dc '~{transcripts_gtf_gz}' > "$transcripts_gtf" + fi + if [[ -n '~{ERCC_fasta_gtf}' ]]; then + cat '~{ERCC_fasta_gtf}' >> "$transcripts_gtf" + fi + fi + + # Make directory for STAR genome + STAR_GENOME="~{genome_name}_STAR_genome" + # HACK: we used to support splitting star indexes into many parts, this made things slower + # Here we generate the index as if it is in many parts, but there is only ever one part for + # backwards compatibility + mkdir -p "$STAR_GENOME/part-0" + + STAR \ + --runThreadN ~{cpu} \ + --runMode genomeGenerate \ + --genomeFastaFiles "~{fasta}" \ + --limitGenomeGenerateRAM 64000000000 \ + --genomeDir "$STAR_GENOME/part-0" $gtf_flag + + # create a parts.txt file for backwards compatibility + echo 1 > "$STAR_GENOME/parts.txt" + + # tar STAR genome + tar cvf "$STAR_GENOME.tar" -C $(pwd) $STAR_GENOME + >>> + + output { + File star_genome_tar = "~{genome_name}_STAR_genome.tar" + } + + runtime { + docker: docker_image_id + cpu: cpu + memory: "64GiB" + } } diff --git a/workflows/host-genome-generation/test/fixtures/ERCC.fa.gz b/workflows/host-genome-generation/test/fixtures/ERCC.fa.gz new file mode 100644 index 000000000..a2dcf2538 Binary files /dev/null and b/workflows/host-genome-generation/test/fixtures/ERCC.fa.gz differ diff --git a/workflows/host-genome-generation/test/fixtures/ERCC.fasta b/workflows/host-genome-generation/test/fixtures/ERCC.fasta deleted file mode 100644 index e23afdb39..000000000 --- a/workflows/host-genome-generation/test/fixtures/ERCC.fasta +++ /dev/null @@ -1,87 +0,0 @@ ->ERCC-00002 -TCCAGATTACTTCCATTTCCGCCCAAGCTGCTCACAGTATACGGGCGTCGGCATCCAGAC 
-CGTCGGCTGATCGTGGTTTTACTAGGCTAGACTAGCGTACGAGCACTATGGTCAGTAATT -CCTGGAGGAATAGGTACCAAGAAAAAAACGAACCTTTGGGTTCCAGAGCTGTACGGTCGC -ACTGAACTCGGATAGGTCTCAGAAAAACGAAATATAGGCTTACGGTAGGTCCGAATGGCA -CAAAGCTTGTTCCGTTAGCTGGCATAAGATTCCATGCCTAGATGTGATACACGTTTCTGG -AAACTGCCTCGTCATGCGACTGTTCCCCGGGGTCAGGGCCGCTGGTATTTGCTGTAAAGA -GGGGCGTTGAGTCCGTCCGACTTCACTGCCCCCTTTCAGCCTTTTGGGTCCTGTATCCCA -ATTCTCAGAGGTCCCGCCGTACGCTGAGGACCACCTGAAACGGGCATCGTCGCTCTTCGT -TGTTCGTCGACTTCTAGTGTGGAGACGAATTGCCAGAATTATTAACTGCGCAGTTAGGGC -AGCGTCTGAGGAAGTTTGCTGCGGTTTCGCCTTGACCGCGGGAAGGAGACATAACGATAG -CGACTCTGTCTCAGGGGATCTGCATATGTTTGCAGCATACTTTAGGTGGGCCTTGGCTTC -CTTCCGCAGTCAAAACCGCGCAATTATCCCCGTCCTGATTTACTGGACTCGCAACGTGGG -TCCATCAGTTGTCCGTATACCAAGACGTCTAAGGGCGGTGTACACCCTTTTGAGCAATGA -TTGCACAACCTGCGATCACCTTATACAGAATTATCAATCAAGCTCCCCGAGGAGCGGACT -TGTAAGGACCGCCGCTTTCGCTCGGGTCTGCGGGTTATAGCTTTTCAGTCTCGACGGGCT -AGCACACATCTGGTTGACTAGGCGCATAGTCGCCATTCACAGATTTGCTCGGCAATCAGT -ACTGGTAGGCGTTAGACCCCGTGACTCGTGGCTGAACGGCCGTACAACTCGACAGCCGGT -GCTTGCGTTTTACCCTTAAAAAAAAAAAAAAAAAAAAAAAA ->ERCC-00003 -CAGCAGCGATTAAGGCAGAGGCGTTTGTATCTGCCATTATAAAGAAGTTTCCTCCAGCAA -CTCCTTTCTTAATTCCAAACTTAGCTTCAGTTATAAATTCCCCTCCCATGATTGGGATTT -TATAAACTTTTCTTCCATATAATTCATCTTTCTTCTCATAACCGTCTCCGAAAAACTTCA -ACTTAAATCCAACCTTTAACTGCTCATCAGCCATGTCTCCCACAGCATCAAAAATAGCAG -TTGTTGGACATGTTAAGACACACTGCCCCAATCTCTCTAACATTTGATGCTCTAACTCTG -ACTTTTTAGGGTGGCATATCTGTATTATAAATCCTGGTCTTCCATCTGGTGTTTTTGATG -GAGGGACATATTTCTCAATTCCTGCTTCTGCTGGACACATTATAACTGAACAACCAAAAC -CTGTTGCCTCTGTAGCTGCAATCTTAGCCCACTTCTTTGTAGCTGCTGTTATTAAAACTC -TTGAAACCCATATTGGGAATGCTTCTGCAAATGTATCTTCAATATATACTCCATTTATTT -CCATAGTTTCCCTCCATTAAGATTTTAACAATTATAGTTTATCTTAGGGGCTATTAATAT -CTTATCATTTGGTTTTTAATATTCGATAAATCCATAAATAAAAATATATCAACAATAATT -TTAAATAATCTAAGTATAGGTAATATAACAATTAAAAAGATTTAGAGGGATAGAATTGAA -CGGCATTAGGAGAATTGTTTTAGATATATTGAAGCCGCATGAGCCAAAAATAACAGATAT -GGCATTAAAATTAACATCATTATCAAACATTGATGGGGTTAATATTACAGTCTATGAAAT -AGATAAAGAGACTGAGAATGTTAAAGTTACAATTGAAGGGAATAATTTAGATTTTGATGA 
-GATTCAGGAAATTATTGAAAGTTTGGGAGGGACTATTCACAGTATAGATGAGGTTGTTGC -AGGTAAAAAGATTATTGAAGAGTTAGAACACCACAAGATAAAAAAAAAAAAAAAAAAAAA -AAA ->ERCC-00004 -TCTTGCTTCAACAATAACGTCTCTTTCAGAAGGCATTGGTATCTTTTCCCCACTTCCAAG -CATTTTTTCAACTAATCTTATGTTATTAACCATTTCCTTAAATTCTTCTGGGTCTGCTGA -CAAAGCATGATCAGGACCTTCCATATTTTTATYTAAGGTAAAGTGCTTCTCAATAACATC -CGCTCCTAAGGCAACAGAAACTACTGGGGCGAGTATTCCCAATGTATGGTCAGAATATCC -CACAGGGATATTGAATATACTTTTCAAGGTTTTAATAGCGTTTAAATTGACATCTTCATA -AGGGGTTGGGTAAGATGAAATACAATGCAATAAAATAATATCCCTGCATCCATTATTTTC -TAAAACTTTAACTGCTTCCCAAATTTCCCCAATATCAGACATTCCTGTAGATAAAATCAC -CGGCTTGCCTGTTTTTGCCACTTTTTCTAATAAGGGATAAAAGGTTAAATCACCAGAGGC -AATTTTAAATCAGGCACATAAAAAAAAAAAAAAAAAAAAAAAA ->ERCC-00007 -TTTATTGGTACGTAATTTCGTCAACCGTTTTTCGGTCTAACTTCTTAATGACTTCTGTAA -TTAACTTTACCGCGTTTTCATAATCATCACGATGCAGCATGGCCGCGTGCGTATGAATGT -AGCGGGTTGCAATGGTAATGGACAGCGCAGGAACGCCATTTGCCGTCAAATGGATGGCAC -CCGAATCAGTTCCGCCGCCGGCAATGGCATCAAATTGGTACGGAATGCCGGCTTCCTCCG -CAGTGGCTACAACTGCATCGCGCAAACCTTTGTGAGAGACCATGGATGCATCGTAAACGA -TAATCTGCGGGCCTTTGCCCATTTTGCTCTGCGCTTCCTTCTCGGAAATGCCAGGCGTGT -CTCCTGCTATCCCTACATCAACACCAAACGCAATATCAGGCTGAATGGTGTGTGCAGCCG -TTTTCGCTCCCCTCAGCCCGACTTCCTCCTGCACGGTTCCGACGCCATACACTATATTTG -GATGATCTGTGTTTTGTAAGTTTCTTAACACATCAATAGCAATCGCACAGCCGATGCGGT -TGTCCCAGGCCTTTGCGAGTAGGAATTTTTCATTGTTCATGACCGTAAATTCAAAATGCG -GAACGATCATATCTCCCGGAAGTACACCCCACTCCAAGGCTTCTTCCCGGCTTGAAGCTC -CAATATCAATAAACATGTCTTTTATTTCCACTGATTTTTTTCTTGCTTCAGGAGACAAAA -TATGAGGCGGCTTAGATCCGATAACCCCTGTGATTTCTCCTTTTTTTGTGACAATGGTGA -CGCGCTGAGCAAGCATAACCTGAGCCCACCAGCCGCCAACGGTTTGAAAACGGATAAAGC -CTTTATCTGTGATTTGTGTCACCATAAAGCCGACTTCATCCAAATGTCCGGCGATCATAA -TTTTCGGGCCGTTTTCTGCACCAGTTTTTTTTGCAATTAAACTGCCCAGGCGATCTGTTG -TCACCTCATCAGCAAATGGTTCTATGTATGATTTCATCACTTGCCTTACTTCTCTTTCAT -TGCCCGGTATGCCTTTTGCATCTGTTAAATCTTTCAGCATGGTCAATGTTTCATCTAATT -TTGCCATGTTCCAAACCCTCCTTGAGCTCGGAAAAAAAAAAAAAAAAAAAAAAAA ->ERCC-00009 -CAATGATAGGCTAGTCTCGCGCAGTACATGGTAGTTCAGCCAATAGATGCCTAGTACGCT 
-GACGGCATTCAGAGTACGCTGATCGGCTTATGACGTATGTGACGCAGCTCTTAGCGCAAT -GTATGTGCTGTTATCGAAGCCTATGGCTGAGTATGTAACGCTATGGCGTGCTAGTCGTCT -CATATACGTCTGATGACCTCGTATCATGTTATAGGGCTGCGAACTGTCGATGATGGTCAC -GACTCTGTCGATAGCTGTGTGACTCATTCAGAAGGTGTGCAGCCTATATGATACGCAGTC -GCATCCTATCTTACGTGTCAGTACTATGTGTGAGTGCTCCGCCCTAGTGCTGATGTATGC -CCCATAGTGCTCAGTGGAGTCTCTCTTAGCATAGTGTCCGCTCATACATTAGATGGACGG -CTCATTAGTATCATCGTCGGCTGATATAGGTCGTGGCTCCCTGTATATCGAGGTGAGTCT -ATCTGGATCAACGTCGCACTATGATGTGCAAAGTGTCGTCCATGTATAGACAGTGCGCGT -ATCATATAGGATGCGGCGATCTCATACAGCGTTACGGTCGCTGCGTACTGTATAAGGATG -CTCTGTGAACTGTCATCGGTCCGATCAATTAGTCTAGTGTGCGTTATTCAGATCGAGTGA -GTACATGATTCGTCAGTGTGGATCAATTACAGTTAGGCCGCTGACACATTAGTAACGTCG -GCAAGCACTTAGTCGTGTCGTAAGCCAGTGTGTCGTGTCTTAGACGACTGTGTGTGATTC -TCGAGCGATTTATACATCCGTGACAGCGCTTATAGTGTGCTGACAGACTGGTTGGTTATC -CAATGATCGACCTGGAGTCTAATATCTGACCACGCCTTGTAATCGTATGACACGCGCTTG -ACACGACTGAATCCAGCTTAAGAGCCCTGCAACGCGATATACAGGCGCTGCTACCGATAT -AAAAAAAAAAAAAAAAAAAAAAAA - diff --git a/workflows/host-genome-generation/test/fixtures/ERCC.gtf b/workflows/host-genome-generation/test/fixtures/ERCC.gtf deleted file mode 100644 index cf2d5921b..000000000 --- a/workflows/host-genome-generation/test/fixtures/ERCC.gtf +++ /dev/null @@ -1,92 +0,0 @@ -ERCC-00002 ERCC exon 1 1061 0.000000 + . gene_id "ERCC-00002"; transcript_id "DQ459430"; -ERCC-00003 ERCC exon 1 1023 0.000000 + . gene_id "ERCC-00003"; transcript_id "DQ516784"; -ERCC-00004 ERCC exon 1 523 0.000000 + . gene_id "ERCC-00004"; transcript_id "DQ516752"; -ERCC-00009 ERCC exon 1 984 0.000000 + . gene_id "ERCC-00009"; transcript_id "DQ668364"; -ERCC-00012 ERCC exon 1 994 0.000000 + . gene_id "ERCC-00012"; transcript_id "DQ883670"; -ERCC-00013 ERCC exon 1 808 0.000000 + . gene_id "ERCC-00013"; transcript_id "EF011062"; -ERCC-00014 ERCC exon 1 1957 0.000000 + . gene_id "ERCC-00014"; transcript_id "DQ875385"; -ERCC-00016 ERCC exon 1 844 0.000000 + . gene_id "ERCC-00016"; transcript_id "DQ883664"; -ERCC-00017 ERCC exon 1 1136 0.000000 + . 
gene_id "ERCC-00017"; transcript_id "DQ459420"; -ERCC-00019 ERCC exon 1 644 0.000000 + . gene_id "ERCC-00019"; transcript_id "DQ883651"; -ERCC-00022 ERCC exon 1 751 0.000000 + . gene_id "ERCC-00022"; transcript_id "DQ855004"; -ERCC-00024 ERCC exon 1 536 0.000000 + . gene_id "ERCC-00024"; transcript_id "DQ854993"; -ERCC-00025 ERCC exon 1 1994 0.000000 + . gene_id "ERCC-00025"; transcript_id "DQ883689"; -ERCC-00028 ERCC exon 1 1130 0.000000 + . gene_id "ERCC-00028"; transcript_id "DQ459419"; -ERCC-00031 ERCC exon 1 1138 0.000000 + . gene_id "ERCC-00031"; transcript_id "DQ459431"; -ERCC-00033 ERCC exon 1 2022 0.000000 + . gene_id "ERCC-00033"; transcript_id "DQ516796"; -ERCC-00034 ERCC exon 1 1019 0.000000 + . gene_id "ERCC-00034"; transcript_id "DQ855001"; -ERCC-00035 ERCC exon 1 1130 0.000000 + . gene_id "ERCC-00035"; transcript_id "DQ459413"; -ERCC-00039 ERCC exon 1 740 0.000000 + . gene_id "ERCC-00039"; transcript_id "DQ883656"; -ERCC-00040 ERCC exon 1 744 0.000000 + . gene_id "ERCC-00040"; transcript_id "DQ883661"; -ERCC-00041 ERCC exon 1 1122 0.000000 + . gene_id "ERCC-00041"; transcript_id "EF011069"; -ERCC-00042 ERCC exon 1 1023 0.000000 + . gene_id "ERCC-00042"; transcript_id "DQ516783"; -ERCC-00043 ERCC exon 1 1023 0.000000 + . gene_id "ERCC-00043"; transcript_id "DQ516787"; -ERCC-00044 ERCC exon 1 1156 0.000000 + . gene_id "ERCC-00044"; transcript_id "DQ459424"; -ERCC-00046 ERCC exon 1 522 0.000000 + . gene_id "ERCC-00046"; transcript_id "DQ516748"; -ERCC-00048 ERCC exon 1 992 0.000000 + . gene_id "ERCC-00048"; transcript_id "DQ883671"; -ERCC-00051 ERCC exon 1 274 0.000000 + . gene_id "ERCC-00051"; transcript_id "DQ516740"; -ERCC-00053 ERCC exon 1 1023 0.000000 + . gene_id "ERCC-00053"; transcript_id "DQ516785"; -ERCC-00054 ERCC exon 1 274 0.000000 + . gene_id "ERCC-00054"; transcript_id "DQ516731"; -ERCC-00057 ERCC exon 1 1021 0.000000 + . gene_id "ERCC-00057"; transcript_id "DQ668366"; -ERCC-00058 ERCC exon 1 1136 0.000000 + . 
gene_id "ERCC-00058"; transcript_id "DQ459418"; -ERCC-00059 ERCC exon 1 525 0.000000 + . gene_id "ERCC-00059"; transcript_id "DQ668356"; -ERCC-00060 ERCC exon 1 523 0.000000 + . gene_id "ERCC-00060"; transcript_id "DQ516763"; -ERCC-00061 ERCC exon 1 1136 0.000000 + . gene_id "ERCC-00061"; transcript_id "DQ459426"; -ERCC-00062 ERCC exon 1 1023 0.000000 + . gene_id "ERCC-00062"; transcript_id "DQ516786"; -ERCC-00067 ERCC exon 1 644 0.000000 + . gene_id "ERCC-00067"; transcript_id "DQ883653"; -ERCC-00069 ERCC exon 1 1137 0.000000 + . gene_id "ERCC-00069"; transcript_id "DQ459421"; -ERCC-00071 ERCC exon 1 642 0.000000 + . gene_id "ERCC-00071"; transcript_id "DQ883654"; -ERCC-00073 ERCC exon 1 603 0.000000 + . gene_id "ERCC-00073"; transcript_id "DQ668358"; -ERCC-00074 ERCC exon 1 522 0.000000 + . gene_id "ERCC-00074"; transcript_id "DQ516754"; -ERCC-00075 ERCC exon 1 1023 0.000000 + . gene_id "ERCC-00075"; transcript_id "DQ516778"; -ERCC-00076 ERCC exon 1 642 0.000000 + . gene_id "ERCC-00076"; transcript_id "DQ883650"; -ERCC-00077 ERCC exon 1 273 0.000000 + . gene_id "ERCC-00077"; transcript_id "DQ516742"; -ERCC-00078 ERCC exon 1 993 0.000000 + . gene_id "ERCC-00078"; transcript_id "DQ883673"; -ERCC-00079 ERCC exon 1 644 0.000000 + . gene_id "ERCC-00079"; transcript_id "DQ883652"; -ERCC-00081 ERCC exon 1 534 0.000000 + . gene_id "ERCC-00081"; transcript_id "DQ854991"; -ERCC-00083 ERCC exon 1 1022 0.000000 + . gene_id "ERCC-00083"; transcript_id "DQ516780"; -ERCC-00084 ERCC exon 1 994 0.000000 + . gene_id "ERCC-00084"; transcript_id "DQ883682"; -ERCC-00085 ERCC exon 1 844 0.000000 + . gene_id "ERCC-00085"; transcript_id "DQ883669"; -ERCC-00086 ERCC exon 1 1020 0.000000 + . gene_id "ERCC-00086"; transcript_id "DQ516791"; -ERCC-00092 ERCC exon 1 1124 0.000000 + . gene_id "ERCC-00092"; transcript_id "DQ459425"; -ERCC-00095 ERCC exon 1 521 0.000000 + . gene_id "ERCC-00095"; transcript_id "DQ516759"; -ERCC-00096 ERCC exon 1 1107 0.000000 + . 
gene_id "ERCC-00096"; transcript_id "DQ459429"; -ERCC-00097 ERCC exon 1 523 0.000000 + . gene_id "ERCC-00097"; transcript_id "DQ516758"; -ERCC-00098 ERCC exon 1 1143 0.000000 + . gene_id "ERCC-00098"; transcript_id "DQ459415"; -ERCC-00099 ERCC exon 1 1350 0.000000 + . gene_id "ERCC-00099"; transcript_id "DQ875387"; -ERCC-00104 ERCC exon 1 2022 0.000000 + . gene_id "ERCC-00104"; transcript_id "DQ516815"; -ERCC-00108 ERCC exon 1 1022 0.000000 + . gene_id "ERCC-00108"; transcript_id "DQ668365"; -ERCC-00109 ERCC exon 1 536 0.000000 + . gene_id "ERCC-00109"; transcript_id "DQ854998"; -ERCC-00111 ERCC exon 1 994 0.000000 + . gene_id "ERCC-00111"; transcript_id "DQ883685"; -ERCC-00112 ERCC exon 1 1136 0.000000 + . gene_id "ERCC-00112"; transcript_id "DQ459422"; -ERCC-00113 ERCC exon 1 840 0.000000 + . gene_id "ERCC-00113"; transcript_id "DQ883663"; -ERCC-00116 ERCC exon 1 1991 0.000000 + . gene_id "ERCC-00116"; transcript_id "DQ668367"; -ERCC-00117 ERCC exon 1 1136 0.000000 + . gene_id "ERCC-00117"; transcript_id "DQ459412"; -ERCC-00120 ERCC exon 1 536 0.000000 + . gene_id "ERCC-00120"; transcript_id "DQ854992"; -ERCC-00123 ERCC exon 1 1022 0.000000 + . gene_id "ERCC-00123"; transcript_id "DQ516782"; -ERCC-00126 ERCC exon 1 1118 0.000000 + . gene_id "ERCC-00126"; transcript_id "DQ459427"; -ERCC-00130 ERCC exon 1 1059 0.000000 + . gene_id "ERCC-00130"; transcript_id "EF011072"; -ERCC-00131 ERCC exon 1 771 0.000000 + . gene_id "ERCC-00131"; transcript_id "DQ855003"; -ERCC-00134 ERCC exon 1 274 0.000000 + . gene_id "ERCC-00134"; transcript_id "DQ516739"; -ERCC-00136 ERCC exon 1 1033 0.000000 + . gene_id "ERCC-00136"; transcript_id "EF011063"; -ERCC-00137 ERCC exon 1 537 0.000000 + . gene_id "ERCC-00137"; transcript_id "DQ855000"; -ERCC-00138 ERCC exon 1 1024 0.000000 + . gene_id "ERCC-00138"; transcript_id "DQ516777"; -ERCC-00142 ERCC exon 1 493 0.000000 + . gene_id "ERCC-00142"; transcript_id "DQ883646"; -ERCC-00143 ERCC exon 1 784 0.000000 + . 
gene_id "ERCC-00143"; transcript_id "DQ668362"; -ERCC-00144 ERCC exon 1 538 0.000000 + . gene_id "ERCC-00144"; transcript_id "DQ854995"; -ERCC-00145 ERCC exon 1 1042 0.000000 + . gene_id "ERCC-00145"; transcript_id "DQ875386"; -ERCC-00147 ERCC exon 1 1023 0.000000 + . gene_id "ERCC-00147"; transcript_id "DQ516790"; -ERCC-00148 ERCC exon 1 494 0.000000 + . gene_id "ERCC-00148"; transcript_id "DQ883642"; -ERCC-00150 ERCC exon 1 743 0.000000 + . gene_id "ERCC-00150"; transcript_id "DQ883659"; -ERCC-00154 ERCC exon 1 537 0.000000 + . gene_id "ERCC-00154"; transcript_id "DQ854997"; -ERCC-00156 ERCC exon 1 494 0.000000 + . gene_id "ERCC-00156"; transcript_id "DQ883643"; -ERCC-00157 ERCC exon 1 1019 0.000000 + . gene_id "ERCC-00157"; transcript_id "DQ839618"; -ERCC-00158 ERCC exon 1 1027 0.000000 + . gene_id "ERCC-00158"; transcript_id "DQ516795"; -ERCC-00160 ERCC exon 1 743 0.000000 + . gene_id "ERCC-00160"; transcript_id "DQ883658"; -ERCC-00162 ERCC exon 1 523 0.000000 + . gene_id "ERCC-00162"; transcript_id "DQ516750"; -ERCC-00163 ERCC exon 1 543 0.000000 + . gene_id "ERCC-00163"; transcript_id "DQ668359"; -ERCC-00164 ERCC exon 1 1022 0.000000 + . gene_id "ERCC-00164"; transcript_id "DQ516779"; -ERCC-00165 ERCC exon 1 872 0.000000 + . gene_id "ERCC-00165"; transcript_id "DQ668363"; -ERCC-00168 ERCC exon 1 1024 0.000000 + . gene_id "ERCC-00168"; transcript_id "DQ516776"; -ERCC-00170 ERCC exon 1 1023 0.000000 + . gene_id "ERCC-00170"; transcript_id "DQ516773"; -ERCC-00171 ERCC exon 1 505 0.000000 + . 
gene_id "ERCC-00171"; transcript_id "DQ854994"; diff --git a/workflows/host-genome-generation/test/fixtures/input.fa.gz b/workflows/host-genome-generation/test/fixtures/input.fa.gz new file mode 100644 index 000000000..ff76d23ae Binary files /dev/null and b/workflows/host-genome-generation/test/fixtures/input.fa.gz differ diff --git a/workflows/host-genome-generation/test/fixtures/input.fasta b/workflows/host-genome-generation/test/fixtures/input.fasta deleted file mode 100644 index 53a474ac2..000000000 --- a/workflows/host-genome-generation/test/fixtures/input.fasta +++ /dev/null @@ -1,100 +0,0 @@ ->HiC_scaffold_1 -AAACGGAGCCCGCAGGGGAGCGGGGCGAGGGCCGCGGAGTCGTCCGTGTGACCGCGGCGGCCAGACCGGCGGGTCCTGCG -CACCGGGCTTTGAACACGTCAAGCGACTTCACGACCAGCGCCACTCCCGCGCCACGAGGCCGGGCCCCCGTCACCGGCAG -GACTTGGCCGGGCCCCAGAGCCGCGGGTTCGGCCGCGAGCTCAGCGGGGTGCCGCTCCAGCCCGGGACGCTGGTGGCCGA -GCGGGCTGTGCCTTGGGCCCGGCCCAGAACTGGACGCTGCGGTCCGGGTCCGGAGGAGGGGGTGGGGGGCAGCGTGACGT -TGGGGAAAGCTGACGACAGGGACGTAGCGGAGCGTGGCCAGGCTGGTGGCCTTGGTCAGCGCTGCAGGGTTCGGGGCCCA -GCGTCCTCCCGCCGCCCTCACTGGTTCCGGGAAGGGCCTGGGGGCCGGACGTCGGGGTGAGGCAGCGCTGCTGGGCTCTG -GGCACAAGGGACGGCAGGGACGGCAGGGACGATGGGGACTCCCCCCCACCCCCACATCTCCAGCCGGGGCTTGGTGTCCT -CAGGTGCCACGTCCCTTCTGGGTGAGCCCCGTCCGGGCTGCGGCGGAGCAGGTGAGCAGAGAGGCCCCGGGCGCTCCAGG -ACGGTGGCCGAAGGCGCGCGGGTTCCGCAGGATGAAAGTGGTTTTGCGGAGCCCGCGGCTGGTACCGCGGAGAGAAGGAG -GCGGCGGAAGATCGTGCTCAGGAAATACGCCCTGGCGTTTGCAAAGGCGGTTCCTGCCGGCTTTACCTATCAGCAAATAC -GAACCACGTTGTATTCACATTAATTTTTCTGTTGTCGGTAGTGCTAAAGCAACTGGTCTGTGAAGTGGATGGCAGTCTGT -TTTGTTGGAAAATGGTAATTTTTATTATTTGAGGCCCAAACATAAACAAGTATTGTTTAAATACAGAAAAGATGATTTTA -TCCTCTAAAACTTAATTTTTAAATCGTTGAATGGGTACTGATTTGCATGGATTAGAAATAAAAATCCTGAAAAGGTTCTC -AGCGTAAAGTCTGCTTCCATCCCCCGCCTTCTCTGCGGGCCTCTATACCCCTCTGTGTGAGGCATTCTTACTGATGTTTT -GGAACTCTTTATACATTGAGGATGTGAGATGTGATTAGCTTCTTGCGTACCCTTAAGAATTTCCTAATGCGTGTGTAAAC -AAACATAAGTAGGTAGGCTCACTTTCACCCCCTCCCCCGTTTTTTAGACAAATAGTAGCATTATGGGTACAGGACTCCAC 
-AGTATGTGGAGAATCTCCCATATATTAGTACATGGAGAACGTCTCCATTTTTTTAAAGCTACATCGTATTTCATTGTACG -GATGTTTCTTATTGGTGGACACTTAAGTGGCTTCAGATTTTTTTGTCACAACAAATAATGCGCAAAAAGCCTTGCATAGA -TGTCATTTTGCACATGTCTTTGGAATCAATCCCTAGGAGTAACATGCTCAAAGACTATGTCCCTTTGTGATTTTATAGCT -GTTTTCGAATTATTCTACAAATGAGTTATAAATTTACTCCTCCAATATTTAAAAATGTTTCTATCTTCACACTTTAACTA -TTGTAAAAATTTTTTGACTTTTGCCAATCTAATGTGTTTTTTTATTATTATTAATAAGGTTGAGCGTCTATTTGTCCCCC -CACCCCCACCCCGCACTGGACTATCTAGTTAAAGTGTTTGCGCATTTATTTGGGTTGCTGATCTTTGTCCTTTTATTGAT -TTTTAGGGGCTTTTTATATATTAAGGATATAGGGTGTGATAGATGGTGAAATATATTTTTCCCATTGTCTATCTTGATTT -GTGAATATGCTGATGGCAGTTTCTTTCATGCAGAAGTTTTATTTTTCTGTAGTTTAATCTATAAATCCTTCATTTATCCA -AATCTGGCATTTGGATTTTCAGTTTTAATTAGAAGAGTTTCTTTAACTACAGAGTTGTAATGAGAATCACTTGTGGTTTC -TGTTAGTACTTTTATGATTTTATTTTTAATGCTTACATAATTGACCTATTCGGAATTTATTTCGATATACAGTGTGAGAT -ATGGGTGCAACTCTTTTATTATTTTGCTGGATAGCTACTCAGTTGTCCTAATGCCATTTATCAGTCAATTTTTTTATTAT -TTATTTGAGATGTCACCTTAAGTGTATAATACTTTCCTCACATATTTTGTGTCTACCTGTGCTTTCTATGTTTTTCTGTT -GTTCTGTATGTCCATTCGTGCGTGAGCCAGCACTTCGGTATCAAGAATATTATATGATTTAGGAACTAGTAGGGCTACCT -CCCTCTCTCAACTCTTTCTTTACAGAGTTTTTCTATTTTGTTTGTTTCTTTTTCTGTTTGAATTTTAGAATCAGATCGTC -TAATCCTCACCCCATAATAAAACAATAAGGCACGAAGATAATAAAAACCTGGATTTTTATTATTTTTATTGAGGTTGCAT -TAATTTATAAAAAAGTCAAGGAAATTGGCATTCTGGATATTGAATTTTCTTGTTGAATAGCATCATAAATCAACTTGTAC -TAGTTTTCTTGTATGATTTTTTGAGGATATTTAAAAATGTTTCTCCTATTGATCTTCACTTCAGTCTATTCCTAGATACT -TCCTCTTTGTTTTTGGCTAATTAAATGGAGCTGATTTCTTCCATTATATCTTCTAAAGCTATTGGTTTCTCTATATTAAT -TTTATAACCTGATATCTTACTAAATTTATTATTTGTAGAAGATTTTCCCTCATTATCTTTGGGGATGAGATGGGTGTATC -TTACCATGTTAAGAAAATATTTAATCTTTTTTTTTTTAATCAAAGTTTATAATGCTGAATTTTATTACTTTTTCAGCTTC -TGTAAAGACGGTCCTATATTTTTATTCCTGGATCTGTTGAAAGATAAAACAAATGACCTGTAAGCAGCTGTTTAAGCCAG -GCTTTTAGAAGTCTTTGTGTTTCACTTTGGCATATCCTCGCCGCTGTCTGTGGCAGGTGTCATTCAGAACCCAGGTCTCC -ACGATCACCTTCTGTTTTCTGACATACTCGTAGCTCAACAGAATGTTGAATTCATTCAATAAAATATAGTAGAAAAAACA -GAGCCGGTGCAGCCATGCCATTATTTCAAAGCTTACATTTTTGTCTCTGAATTTGGAAAATAATAGATATGTATTTCTTT 
-AACGAAGAAAGCAATGTACAGTATGCCATCTTAAATCTGCGTGTGTGGATGGGTGGGCAGAGGTTGGTGTGTAGATAGGT -AGCTTAGATGAAAATGAGGTTGGTAAATTTAAATTGACACGGTGCATACTTTTTACGCCCTTTTTAAAATGAAATAAGAA -TTAAGAAAGCAAGTAAGTAGAAAATCAACGTCCACAGATGCCTACTCTTAAGATGGACCTGCCGGGCCGGTCTCCGTGGC -AACGGCAGGGTGAGGGGGAAGGAGGACTGAGGAGAAGGTGTTTTTTTCTCTAAAACGCCTTCATTTCCTCAGCTTATTTA -CTGCCTTTCAGTTCTACAGAATCACATTATGATTCTGTGGTGACTGGGAACCAAGAGCAGAAAATCATTTCTACCCCAAG -GCACAGAGCAGGCACTTAGCATCAAAAGGAAGCAATCTGCAGGCTTTTGACAGCGTATTCACCTCCTGGGACTCGGGGTG -GGGGCACAGGGACACCCATCCCTGTGTCTGGAGCCGTAAATCTCTAGCTGTGCATCCAGTGCCCATTTTTGGTGTTACGT -GTGTCATGGAGGTGTCTTTAAATGCTCTGCAAACACTGACATCTGGGTTATTTATTTGTCTCCTCTGCGGAGAAGCTGGC -AATAAACACATCACCCCAAAGCTGGGGAAGGCGGGCAGGAATGACCACATGGAAGAGCTATTAAATTGGTGACATTTTTC -TTCCTACTGAGCCGGCGTCCATAAAACAACTGGACAGGGCCCACCTCCCCGCCAGACAGCAGTAACAGGCGGGTGACGGA -TTGGATGTGGCGGGTGGAGCTCGGCTCCTGGGTTCCCTCAGGGACTTCCGATGGTCGCTCACTTTAGAGTTCATCTCTGT -TTAAACTCCTCAGCGATTTCTTCCGCTGGCACAAGCTCTGAAGCTTGGGCGCCGCCGTCGTGCAGAAAGGGCCAGCCACA -TTGACGCGACGGGAAGCCCTCGGCACGGCGCGGAGCTGGGGCCACCCTGCCGCTGGTGTTGTAGGGCTCCCCGATGTTTC -TCAGAGGGCTTCCGGGGACCGCGGGCACACCGCCGCCCGGGAACCACCGGAAAGCCGCGTGTAGACGGGGCCGACGGCGA -GGCCGGGCAGGGCCACTGCCCACCAGGAAGTGGCGGCCGCAGCACCAGGGTCTCGGTCCACTAAGGTCATGTGTTCACAG -GAGCCTGGGAGGTGAGGGAACTGGGTGAGAACCTGCGGAAGCCACGGGGTGGGGGTGGGGGGCGGGGGTGCCCCGGGGGA -GGGTGGCCCAGTGAAGGGCGGAGGTGGGGACACAGGTTGGCTTCTTGGGACTTTTAAAGGGCTGCTTCATAGGGGCGCCG -TGGCCTGCCTGTGTGCGCGGGGGCTTGTGGGGCCACGCACCGCCGTCCCTCTGCGACCCAGCTCTGTCCCAGGCGCTGCA -GGGAGCGTGACTGTGCCCCGTGGGGCCTGCGCCCCCTGGGGACCACCTGCACGCAGTCTGCCTCGACCCGTGAGCTCGGC -CTCGAGACGGTGGTGGGCACTCCGGTCCAAGAGCTGAGTCCTGGCCTTTGGCTGGTTCCTGCAGGGACGATGTGGACTAG -ACATCCAGGCCCGGACGGGACAGCCTGGGCAGGTCACAGGGGCCCGGGGACGGTGTGGAGCCAGCGTTGTTACTTACGGT -GCAGAAAACAGCAAACTCGCCCTGCACGCGCTGCCATCAATTTGGCAGAGGCCGAGGAGGTGGTGACATGTTCATGTGTA -AAGAAGGGGTCTGAAAACACTCACGGCAAAATGGGACTAATCTCCCTGACAGGTCCATGCCCAGCCGGAGGGACGCTGCC -GGGTCACCCCCACCACACCCTGAGTCAGCGGGGTCGTCCCGGAACCGCAGGGGTCGGGGACCGCGAGGGGTGGCCCCGTG 
-CGGAGCGCCCTGTCCGAGCAGGACGAGGCTTCCAGACCTGAGGGACAGCCGCGCCCCGCCCCTCCGAGGCTCCCTGAGGT -GGAGCTGTGCGGGACAGCAGGTGGCCCCGTCTGGAAGGAGCTGCCGGAGCAGCAGCGACGCGGACAGTCGACTCCTGCCC -GAGTCCGCCTTTCTCCCCCGAGCGTGCCGTGTGCCCGGCAGCCCCTGCCCCGCAGTCATGTCCTTTACCTGGTGTCCTCA -GCAAACAGCCCTCCGTCACCGACACCATCAGTCTGTGCTGAACTGCGTATCCCCCGCCTTCCTGCCAGCTGCCCCCTCCC -CAGCTTGTGACACTCTGCCCTGGTGGGCTCAGTATCCTGCCCGCAGACGCCCCGTGGAGCCCGAGTTCCCACGATGTCTG -GCTGCTGTGTCTGGCGCCCTGTGCCTCGCGTGAAACCACTTTCTCCTGCGGAGGGTCCTGCTGGCACCCTCCCCTTGTCC -TCCGGCTCATGGGGACAGTGCCCCGGCTGCTCTGACATCGGTCACTCTTGGTGCCGGGCCCGTGCTGTTCTGGGATGGTG -GCGGGATGGGTCCTGGAATGTCACCACCGACCTATGGGAAGTTCAAAGTTTTGGGCCTCGGTTGCCTGTTGCTGATATGA -TCTTTTTATTCATCTCAGATTTTCCATTTTTATCAGAAAGGGGATAAAGTTGCCTCTCTTGGCTTCTCTATTGTGAAAAC -AAAGTAAATAGGATTAACATGTTTTAGCTTTTAGGAAGAATGGCATTTTCAAAATCTAAATAATTACTCTTAGGATGAAA -GTTTTAGAACTTAGTTTTCCTTAGATTTATTGGTTATCATTAAATTAAAAATTTATTTTCTCAGAAACTCTACCCAATGT -CTTGGATTCAAAGCTGGTTTCAGAAGCCAGAAGCCTATGTTTTGTGCTGAGAGTTCCCTGTAAACACCTGAGGACATTGG -ACTCTTTTTGGGGGGGAACGAAACGAGCGTATTGTTGTTATTTTACCAGAACAGGAGAAAAACGCGGGAAGGTTTGTCCG -GAACACGTGCTGGGCGAGGGAGAAGGGGTTGGGGCCTCAGCGCTGGGCGGGCAGAAGGAGGGGAGGCTGCGGGATCCCAC -ACGGCTTCGGGGGCAGAGCCTGGCGGGCTGCGGCCCACGTAAGCGGCACCATCCGGCTCCGTGCCGGCCCCCGGCCCGGC -AGCTGTCCCGCAGGATGCGCGGTGGCTGCCGCCCTGTCAGCCACCTCCTGCTCCCCCAGCTAGTGTGCTGACAGGCGTCC -CAGACTGGGGGTGCTCACGCCGCAGCCAGGGGTCTTCGGAGTCACGGGAGTGTCACAGACCGCCGTCCAGACCCCACCTC -CGGACGCCGAGTTCACGGCTCCGATCTCGTCTTAGGCCTTGATGCTGGTGGTTCTCGCTTTGTTTGGACATCAGTGTTTT -TTAAAACTCGCCAGGGACCCAAATATGAAGCCAGGTTCGACTCACTGACGACGTCGGGCATTTCGGGGCCCCTAGGGCCT -GGTCCTGGGACCGCACGTGACTCTGCAGCAAGAATGTCGTGTGGAGCCCCGTGCTGAGGTGGGAGGGGCAGGAGGGGTCA -GGTGGCCGGAATGACTGGAGTCCACCCCACGTGCGTGTGAAGGGGAGCCAGGGGGCAGCGAGCACCTATGGAGGGCAGAG -GACACGGTGTCATCATGTGGCTCGGCGGCACCGGCGGTGGCCCGGGCCAGGGTCACGGTGCAGACGGACCACACTCGAGA -CCGAACGCCAGAGGACCGGCCACGGTGGAATCGGAGGCCGAGGGAAACTGATCAGGGAAGGCGGGGGCATCCGGAGCCAT -GTCCACTGCATAACGGCAGCAGGAGCTGTGGGGGACGGCCCGGGCCCACTCACCGCGTCCGCTCACCGCCGAGTAGAGGT 
-GCAAGCAGGAGGCCTGCGGGACACGAGTGGTGTGGCCGAGCGCCTGGAAGTCAGCCTGCGACAGAGTCCGTGGTCACTCA -TCAGGCGAGAGGCTCAGGGAGGGGCCGCGCCGTCCCGGAGTCCCTGTGCCGATCACCGCGCCGTCCGGAGTCCCCGCGCC -GATCACTGCGCCGTCCAGAGTCCCCGTGCCGATCACCGCGCCGTCCGGAGTCCCCGTGCTGATTGCCGCATTGTCCCAGA -GTCCCTGTGCCGATCGCCACCGAGAGGTGGGAGCTGCTCCCACAGGTGCAGGAAGTGACACTGTGTTGCATCGCGTCAGG -AGAGCGGCCCCGCGGACCTCCGCGTCCTTCGCGGCCAAGGCCAGCCGTGGGCCTCCCAGGCCCTGCTCAGAGATCACATG -GTCCTAATTCGAAAGTAAAATTTTTGGAAAGAGATAAGCTTTAAACAATGATGAAATTTTTAAAAATCCCCCTAGATTTT -CCAATTGCAACATTCTTGTAAGTTTGTCCAAACTGAACTTTTCCTCCCCCTGCGTCATGGTTGGCTTCGGATGTGCTCGC -TTTGCTGGGTTCTGGGCAGGAGGTTTCGGGAGAGCTGGCTGGTCGCCGGGCTGGGCTGCGTCCCGCTAGATGTCCATGGG -CAGAGCAACCGTTAAAACGAACGCGGTCCATGCCCACCCACTCACGCGCCCCAGAGAGTGGAGCCCGAGGGCACAGCCCA -CAGACCGGCCCGGACCTCTGTCCCCGGCACGTGGGCTCCAGCTGAAGACACGGCCCCTGGGGCGTTGTGTCCTGGGTGCT -GCCACCTAGGACGCTCCGGCCCAGCCAAGGAGACCCTCCTGCCACCAGCGAGGCCGTGTGGACGACCGGGCGCGCCTTGC diff --git a/workflows/host-genome-generation/test/test_wdl.py b/workflows/host-genome-generation/test/test_wdl.py index 666e1ff93..7935098d5 100644 --- a/workflows/host-genome-generation/test/test_wdl.py +++ b/workflows/host-genome-generation/test/test_wdl.py @@ -7,10 +7,9 @@ class TestIndexGeneration(WDLTestCase): wdl = os.path.join(os.path.dirname(__file__), "..", "host_genome_generation.wdl") common_inputs = { - "input_fasta": os.path.join(os.path.dirname(__file__), "fixtures/input.fasta"), - "host_name": "test", - "ercc_fasta": os.path.join(os.path.dirname(__file__), "fixtures/ERCC.fasta"), - "ercc_gtf": os.path.join(os.path.dirname(__file__), "fixtures/ERCC.gtf"), + "genome_name": "test", + "genome_fasta_gz": os.path.join(os.path.dirname(__file__), "fixtures/input.fa.gz"), + "ERCC_fasta_gz": os.path.join(os.path.dirname(__file__), "fixtures/ERCC.fa.gz"), } def testIndexGeneration(self):