From 9040aa06a55410f7b45c4f8f7cadc1620d2d3860 Mon Sep 17 00:00:00 2001 From: maxulysse Date: Wed, 29 May 2024 18:50:18 +0200 Subject: [PATCH 1/2] fix usage of some reference files --- main.nf | 12 +++--------- nextflow.config | 9 +++++++++ nextflow_schema.json | 4 ++++ 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/main.nf b/main.nf index 6e638a85..78c33003 100644 --- a/main.nf +++ b/main.nf @@ -16,19 +16,13 @@ nextflow.enable.dsl = 2 GENOME PARAMETER VALUES ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ + params.fasta = getGenomeAttribute('fasta') -params.fasta_fai = getGenomeAttribute('fasta_fai') -params.dict = getGenomeAttribute('dict') +params.gene_bed = getGenomeAttribute('bed12') params.gtf = getGenomeAttribute('gtf') -params.gff = getGenomeAttribute('gff') -params.exon_bed = getGenomeAttribute('exon_bed') -params.star_index = getGenomeAttribute('star') -params.dbsnp = getGenomeAttribute('dbsnp') -params.dbsnp_tbi = getGenomeAttribute('dbsnp_tbi') -params.known_indels = getGenomeAttribute('known_indels') -params.known_indels_tbi = getGenomeAttribute('known_indels_tbi') params.snpeff_db = getGenomeAttribute('snpeff_db') params.snpeff_genome = getGenomeAttribute('snpeff_genome') +params.star_index = getGenomeAttribute('star') params.vep_cache_version = getGenomeAttribute('vep_cache_version') params.vep_genome = getGenomeAttribute('vep_genome') params.vep_species = getGenomeAttribute('vep_species') diff --git a/nextflow.config b/nextflow.config index eff8a185..6365f097 100644 --- a/nextflow.config +++ b/nextflow.config @@ -19,6 +19,15 @@ params { save_merged_fastq = false feature_type = 'exon' + dbsnp = null + dbsnp_tbi = null + dict = null + exon_bed = null + fasta_fai = null + gff = null + known_indels = null + known_indels_tbi = null + // Sequence read information read_length = 150 // Required for STAR to build index and align reads diff --git a/nextflow_schema.json b/nextflow_schema.json index b3de2e1a..8c9ac00f 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -91,6 +91,10 @@ "description": "Path to GFF3 annotation file.", "help_text": "This parameter must be specified if `--genome` or `--gtf` are not specified." }, + "gene_bed": { + "type": "string", + "description": "Path to BED file containing gene intervals. This will be created from the GTF file if not specified." + }, "exon_bed": { "type": "string", "description": "Path to BED file containing exon intervals. This will be created from the GTF file if not specified." From c0dbaa56c40cf61072b221706c31c798106eaca6 Mon Sep 17 00:00:00 2001 From: maxulysse Date: Mon, 10 Jun 2024 16:03:17 +0200 Subject: [PATCH 2/2] stash apply --- main.nf | 41 ++--- subworkflows/local/prepare_genome/main.nf | 193 +++++++++++++++++----- workflows/rnavar/main.nf | 6 +- 3 files changed, 174 insertions(+), 66 deletions(-) diff --git a/main.nf b/main.nf index 78c33003..cdaac0b5 100644 --- a/main.nf +++ b/main.nf @@ -57,29 +57,18 @@ workflow NFCORE_RNAVAR { ch_versions = Channel.empty() - // Initialize fasta file with meta map: - ch_fasta_raw = params.fasta ? Channel.fromPath(params.fasta).map{ it -> [ [id:it.baseName], it ] }.collect() : Channel.empty() - // Initialize file channels based on params, defined in the params.genomes[params.genome] scope - ch_dbsnp = params.dbsnp ? Channel.fromPath(params.dbsnp).collect() : Channel.value([]) - ch_known_indels = params.known_indels ? Channel.fromPath(params.known_indels).collect() : Channel.value([]) - ch_gff = params.gff ? Channel.fromPath(params.gff).collect() : Channel.empty() - ch_gtf_raw = params.gtf ? Channel.fromPath(params.gtf).map{ gtf -> [ [ id:gtf.baseName ], gtf ] }.collect() : Channel.empty() + ch_dbsnp = params.dbsnp ? Channel.fromPath(params.dbsnp).collect() : Channel.value([]) + ch_known_indels = params.known_indels ? Channel.fromPath(params.known_indels).collect() : Channel.value([]) - // Initialize variant annotation associated channels + // Initialize value channels based on params, defined in the params.genomes[params.genome] scope snpeff_db = params.snpeff_db ?: Channel.empty() vep_cache_version = params.vep_cache_version ?: Channel.empty() vep_genome = params.vep_genome ?: Channel.empty() vep_species = params.vep_species ?: Channel.empty() - seq_platform = params.seq_platform ?: [] - seq_center = params.seq_center ?: [] - - // Initialize value channels based on params, defined in the params.genomes[params.genome] scope - snpeff_db = params.snpeff_db ?: Channel.empty() - vep_cache_version = params.vep_cache_version ?: Channel.empty() - vep_genome = params.vep_genome ?: Channel.empty() - vep_species = params.vep_species ?: Channel.empty() + seq_platform = params.seq_platform ?: [] + seq_center = params.seq_center ?: [] vep_extra_files = [] @@ -95,16 +84,22 @@ workflow NFCORE_RNAVAR { vep_extra_files.add(file(params.spliceai_snv_tbi, checkIfExists: true)) } - PREPARE_GENOME( - ch_fasta_raw, - ch_gff, - ch_gtf_raw, - ch_dbsnp, - ch_known_indels, - params.feature_type) + PREPARE_GENOME ( + params.fasta, + params.gtf, + params.gff, + params.additional_fasta, + params.transcript_fasta, + params.gene_bed, + params.star_index, + params.gencode, + params.featurecounts_group_type, + params.skip_gtf_filter + ) ch_fasta = PREPARE_GENOME.out.fasta ch_star_index = PREPARE_GENOME.out.star_index + ch_gff = PREPARE_GENOME.out.gff ch_dict = params.dict ? Channel.fromPath(params.dict).map{ it -> [ [id:'dict'], it ] }.collect() : PREPARE_GENOME.out.dict ch_fasta_fai = params.fasta_fai ? Channel.fromPath(params.fasta_fai).map{ it -> [ [id:'fai'], it ] }.collect() diff --git a/subworkflows/local/prepare_genome/main.nf b/subworkflows/local/prepare_genome/main.nf index a568e1a8..4d979ac2 100755 --- a/subworkflows/local/prepare_genome/main.nf +++ b/subworkflows/local/prepare_genome/main.nf @@ -1,80 +1,193 @@ // -// Prepare reference genome files +// Uncompress and prepare reference genome files // include { BEDTOOLS_MERGE } from '../../../modules/nf-core/bedtools/merge/main' include { BEDTOOLS_SORT } from '../../../modules/nf-core/bedtools/sort/main' +include { CUSTOM_CATADDITIONALFASTA } from '../../../modules/nf-core/custom/catadditionalfasta' +include { CUSTOM_GETCHROMSIZES } from '../../../modules/nf-core/custom/getchromsizes' include { GATK4_CREATESEQUENCEDICTIONARY } from '../../../modules/nf-core/gatk4/createsequencedictionary/main' include { GFFREAD } from '../../../modules/nf-core/gffread/main' include { GTF2BED } from '../../../modules/local/gtf2bed' +include { GTF_FILTER } from '../../../modules/local/gtf_filter' +include { GUNZIP as GUNZIP_ADDITIONAL_FASTA } from '../../../modules/nf-core/gunzip' +include { GUNZIP as GUNZIP_FASTA } from '../../../modules/nf-core/gunzip' +include { GUNZIP as GUNZIP_GENE_BED } from '../../../modules/nf-core/gunzip' +include { GUNZIP as GUNZIP_GFF } from '../../../modules/nf-core/gunzip' +include { GUNZIP as GUNZIP_GTF } from '../../../modules/nf-core/gunzip' +include { GUNZIP as GUNZIP_TRANSCRIPT_FASTA } from '../../../modules/nf-core/gunzip' include { SAMTOOLS_FAIDX } from '../../../modules/nf-core/samtools/faidx/main' include { STAR_GENOMEGENERATE } from '../../../modules/nf-core/star/genomegenerate/main' -include { GUNZIP as GUNZIP_FASTA } from '../../../modules/nf-core/gunzip/main' -include { GUNZIP as GUNZIP_GTF } from '../../../modules/nf-core/gunzip/main' include { TABIX_TABIX as TABIX_DBSNP } from '../../../modules/nf-core/tabix/tabix/main' include { TABIX_TABIX as TABIX_KNOWN_INDELS } from '../../../modules/nf-core/tabix/tabix/main' +include { UNTAR as UNTAR_STAR_INDEX } from '../../../modules/nf-core/untar' workflow PREPARE_GENOME { take: - ch_fasta_raw // file: /path/to/genome.fasta - ch_gff // file: /path/to/genome.gff - ch_gtf_raw // file: /path/to/genome.gtf - ch_dbsnp - ch_known_indels - feature_type + fasta // file: /path/to/genome.fasta + gtf // file: /path/to/genome.gtf + gff // file: /path/to/genome.gff + additional_fasta // file: /path/to/additional.fasta + transcript_fasta // file: /path/to/transcript.fasta + gene_bed // file: /path/to/gene.bed + dbsnp // file: /path/to/dbsnp.vcf + known_indels // file: /path/to/known_indels.vcf + star_index // directory: /path/to/star/index/ + featurecounts_group_type // string: The attribute type used to group feature types in the GTF file when generating the biotype plot with featureCounts + gencode // boolean: whether the genome is from GENCODE + skip_gtf_filter // boolean: Skip filtering of GTF for valid scaffolds and/ or transcript IDs main: ch_versions = Channel.empty() - //Unzip reference genome files if needed - - if (params.fasta.endsWith('.gz')) { - GUNZIP_FASTA(ch_fasta_raw) + // + // Uncompress genome fasta file if required + // + if (fasta.endsWith('.gz')) { + ch_fasta = GUNZIP_FASTA ( [ [:], fasta ] ).gunzip.map { it[1] } + ch_versions = ch_versions.mix(GUNZIP_FASTA.out.versions) + } else { + ch_fasta = Channel.value(file(fasta)) + } - ch_fasta = GUNZIP_FASTA.out.gunzip + // + // Uncompress GTF annotation file or create from GFF3 if required + // + if (gtf || gff) { + if (gtf) { + if (gtf.endsWith('.gz')) { + ch_gtf = GUNZIP_GTF ( [ [:], gtf ] ).gunzip.map { it[1] } + ch_versions = ch_versions.mix(GUNZIP_GTF.out.versions) + } else { + ch_gtf = Channel.value(file(gtf)) + } + } else if (gff) { + if (gff.endsWith('.gz')) { + ch_gff = GUNZIP_GFF ( [ [:], gff ] ).gunzip.map { it[1] } + ch_versions = ch_versions.mix(GUNZIP_GFF.out.versions) + } else { + ch_gff = Channel.value(file(gff)) + } + ch_gtf = GFFREAD ( ch_gff ).gtf + ch_versions = ch_versions.mix(GFFREAD.out.versions) + } - } else { - ch_fasta = ch_fasta_raw + // Determine whether to filter the GTF or not + def filter_gtf = + (( + // Condition 1: Transcript FASTA file is not provided + !transcript_fasta + )) && + ( + // Condition 2: --skip_gtf_filter is not provided + !skip_gtf_filter + ) + if (filter_gtf) { + GTF_FILTER ( ch_fasta, ch_gtf ) + ch_gtf = GTF_FILTER.out.genome_gtf + ch_versions = ch_versions.mix(GTF_FILTER.out.versions) + } } - if (params.gtf.endsWith('.gz')) { - GUNZIP_GTF(ch_gtf_raw) + // + // Uncompress additional fasta file and concatenate with reference fasta and gtf files + // + def biotype = gencode ? "gene_type" : featurecounts_group_type + if (additional_fasta) { + if (additional_fasta.endsWith('.gz')) { + ch_add_fasta = GUNZIP_ADDITIONAL_FASTA ( [ [:], additional_fasta ] ).gunzip.map { it[1] } + ch_versions = ch_versions.mix(GUNZIP_ADDITIONAL_FASTA.out.versions) + } else { + ch_add_fasta = Channel.value(file(additional_fasta)) + } - ch_gtf = GUNZIP_GTF.out.gunzip + CUSTOM_CATADDITIONALFASTA ( + ch_fasta.combine(ch_gtf).map { fasta, gtf -> [ [:], fasta, gtf ] }, + ch_add_fasta.map { [ [:], it ] }, + biotype + ) + ch_fasta = CUSTOM_CATADDITIONALFASTA.out.fasta.map { it[1] }.first() + ch_gtf = CUSTOM_CATADDITIONALFASTA.out.gtf.map { it[1] }.first() + ch_versions = ch_versions.mix(CUSTOM_CATADDITIONALFASTA.out.versions) + } + + // + // Uncompress gene BED annotation file or create from GTF if required + // + if (gene_bed) { + if (gene_bed.endsWith('.gz')) { + ch_gene_bed = GUNZIP_GENE_BED ( [ [:], gene_bed ] ).gunzip.map { it[1] } + ch_versions = ch_versions.mix(GUNZIP_GENE_BED.out.versions) + } else { + ch_gene_bed = Channel.value(file(gene_bed)) + } + } else { + ch_gene_bed = GTF2BED ( ch_gtf ).bed + ch_versions = ch_versions.mix(GTF2BED.out.versions) + } + // + // Uncompress transcript fasta file / create if required + // + if (transcript_fasta) { + if (transcript_fasta.endsWith('.gz')) { + ch_transcript_fasta = GUNZIP_TRANSCRIPT_FASTA ( [ [:], transcript_fasta ] ).gunzip.map { it[1] } + ch_versions = ch_versions.mix(GUNZIP_TRANSCRIPT_FASTA.out.versions) + } else { + ch_transcript_fasta = Channel.value(file(transcript_fasta)) + } + if (gencode) { + PREPROCESS_TRANSCRIPTS_FASTA_GENCODE ( ch_transcript_fasta ) + ch_transcript_fasta = PREPROCESS_TRANSCRIPTS_FASTA_GENCODE.out.fasta + ch_versions = ch_versions.mix(PREPROCESS_TRANSCRIPTS_FASTA_GENCODE.out.versions) + } } else { - ch_gtf = ch_gtf_raw + ch_transcript_fasta = MAKE_TRANSCRIPTS_FASTA ( ch_fasta, ch_gtf ).transcript_fasta + ch_versions = ch_versions.mix(MAKE_TRANSCRIPTS_FASTA.out.versions) } + // + // Create chromosome sizes file + // + CUSTOM_GETCHROMSIZES ( ch_fasta.map { [ [:], it ] } ) + ch_fai = CUSTOM_GETCHROMSIZES.out.fai.map { it[1] } + ch_chrom_sizes = CUSTOM_GETCHROMSIZES.out.sizes.map { it[1] } + ch_versions = ch_versions.mix(CUSTOM_GETCHROMSIZES.out.versions) + GATK4_CREATESEQUENCEDICTIONARY(ch_fasta) - GFFREAD(ch_gff, ch_fasta) - SAMTOOLS_FAIDX(ch_fasta, [['id':'genome'], []]) TABIX_DBSNP(ch_dbsnp.flatten().map{ it -> [ [ id:it.baseName ], it ] }) TABIX_KNOWN_INDELS(ch_known_indels.flatten().map{ it -> [ [ id:it.baseName ], it ] } ) - ch_gtf = ch_gtf.mix(GFFREAD.out.gtf) - - GTF2BED(ch_gtf, feature_type) - STAR_GENOMEGENERATE(ch_fasta, ch_gtf) - ch_versions = ch_versions.mix(GATK4_CREATESEQUENCEDICTIONARY.out.versions) - ch_versions = ch_versions.mix(GFFREAD.out.versions) - ch_versions = ch_versions.mix(GTF2BED.out.versions) - ch_versions = ch_versions.mix(SAMTOOLS_FAIDX.out.versions) - ch_versions = ch_versions.mix(STAR_GENOMEGENERATE.out.versions) ch_versions = ch_versions.mix(TABIX_DBSNP.out.versions) ch_versions = ch_versions.mix(TABIX_KNOWN_INDELS.out.versions) + // + // Uncompress STAR index or generate from scratch if required + // + ch_star_index = Channel.empty() + if (star_index) { + if (star_index.endsWith('.tar.gz')) { + ch_star_index = UNTAR_STAR_INDEX ( [ [:], star_index ] ).untar.map { it[1] } + ch_versions = ch_versions.mix(UNTAR_STAR_INDEX.out.versions) + } else { + ch_star_index = Channel.value(file(star_index)) + } + } else { + ch_star_index = STAR_GENOMEGENERATE ( ch_fasta.map { [ [:], it ] }, ch_gtf.map { [ [:], it ] } ).index.map { it[1] } + ch_versions = ch_versions.mix(STAR_GENOMEGENERATE.out.versions) + } + emit: - dict = GATK4_CREATESEQUENCEDICTIONARY.out.dict // path: genome.fasta.dict - exon_bed = GTF2BED.out.bed.map{ bed -> [ [ id:bed.baseName ], bed ] }.collect() // path: exon.bed - fasta = ch_fasta - fasta_fai = SAMTOOLS_FAIDX.out.fai.map{ meta, fai -> [fai] } // path: genome.fasta.fai - gtf = ch_gtf // path: genome.gtf - star_index = STAR_GENOMEGENERATE.out.index // path: star/index/ + chrom_sizes = ch_chrom_sizes // channel: path(genome.sizes) dbsnp_tbi = TABIX_DBSNP.out.tbi.map{ meta, tbi -> [tbi] }.collect() // path: dbsnb.vcf.gz.tbi + dict = GATK4_CREATESEQUENCEDICTIONARY.out.dict // path: genome.fasta.dict + fai = ch_fai // channel: path(genome.fai) + fasta = ch_fasta // channel: path(genome.fasta) + gene_bed = ch_gene_bed // channel: path(gene.bed) + gtf = ch_gtf // channel: path(genome.gtf) known_indels_tbi = TABIX_KNOWN_INDELS.out.tbi.map{ meta, tbi -> [tbi] }.collect() // path: {known_indels*}.vcf.gz.tbi - versions = ch_versions // channel: [ versions.yml ] - // bedtools_sort = ch_bedtools_sort // path: sort.bed - // bedtools_merge = ch_bedtools_merge // path: merge.bed + star_index = ch_star_index // channel: path(star/index/) + transcript_fasta = ch_transcript_fasta // channel: path(transcript.fasta) + versions = ch_versions.ifEmpty(null) // channel: [ versions.yml ] } diff --git a/workflows/rnavar/main.nf b/workflows/rnavar/main.nf index ee776519..22454486 100755 --- a/workflows/rnavar/main.nf +++ b/workflows/rnavar/main.nf @@ -218,7 +218,7 @@ workflow RNAVAR { // Gather QC reports ch_reports = ch_reports.mix(ch_bqsr_table.map{ meta, table -> table}) - ch_versions = ch_versions.mix(GATK4_BASERECALIBRATOR.out.versions) + ch_versions = ch_versions.mix(GATK4_BASERECALIBRATOR.out.versions) ch_bam_applybqsr = ch_splitncigar_bam_bai.join(ch_bqsr_table) ch_bam_recalibrated_qc = Channel.empty() @@ -256,8 +256,8 @@ workflow RNAVAR { ch_dbsnp_for_haplotypecaller = [[id:'null'], []] ch_dbsnp_for_haplotypecaller_tbi = [[id:'null'], []] } else { - ch_dbsnp_for_haplotypecaller = ch_dbsnp.map{ vcf -> [[id:'dbsnp'], vcf] } - ch_dbsnp_for_haplotypecaller_tbi = ch_dbsnp_tbi.map{ tbi -> [[id:'dbsnp'], tbi] } + ch_dbsnp_for_haplotypecaller = ch_dbsnp.map{ it -> [[id:it.baseName], it] }, + ch_dbsnp_for_haplotypecaller_tbi = ch_dbsnp_tbi.map{ it -> [[id:it.baseName], it] } } ch_haplotypecaller_vcf = Channel.empty()