diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 82f7ebfd..4636b9f1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -30,6 +30,7 @@ jobs: TEST_PROFILE: - "test" - "test_sim" + - "test_quilt" steps: - name: Check out pipeline code uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4 diff --git a/CHANGELOG.md b/CHANGELOG.md index 4d437c83..b4144627 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ Initial release of nf-core/phaseimpute, created with the [nf-core](https://nf-co - Test impute and test sim works - [#19](https://github.com/nf-core/phaseimpute/pull/19) - Changed reference panel to accept a csv, update modules and subworkflows (glimpse1/2 and shapeit5) - [#20](https://github.com/nf-core/phaseimpute/pull/20) - Added automatic detection of vcf contigs for the reference panel and automatic renaming available +- [#26](https://github.com/nf-core/phaseimpute/pull/26) - Added QUILT method ### `Fixed` diff --git a/CITATIONS.md b/CITATIONS.md index 31f66a91..10c0d290 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -10,9 +10,21 @@ ## Pipeline tools -- [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) +- [QUILT](https://pubmed.ncbi.nlm.nih.gov/34083788/) - > Andrews, S. (2010). FastQC: A Quality Control Tool for High Throughput Sequence Data [Online]. + > Davies, R. W., Kucka, M., Su, D., Shi, S., Flanagan, M., Cunniff, C. M., ... & Myers, S. (2021). Rapid genotype imputation from sequence with reference panels. Nature genetics, 53(7), 1104-1111. + +- [GLIMPSE](https://www.nature.com/articles/s41588-020-00756-0) + + > Rubinacci, S., Ribeiro, D. M., Hofmeister, R. J., & Delaneau, O. (2021). Efficient phasing and imputation of low-coverage sequencing data using large reference panels. Nature Genetics, 53(1), 120-126. + +- [Shapeit](https://odelaneau.github.io/shapeit5/) + + > Hofmeister RJ, Ribeiro DM, Rubinacci S., Delaneau O. (2023). 
Accurate rare variant phasing of whole-genome and whole-exome sequencing data in the UK Biobank. Nature Genetics doi: https://doi.org/10.1038/s41588-023-01415-w + +- [bcftools](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3198575/) + + > Li, H. (2011). A statistical framework for SNP calling, mutation discovery, association mapping and population genetical parameter estimation from sequencing data. Bioinformatics, 27(21), 2987-2993. - [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/) diff --git a/README.md b/README.md index 7e2fb3a9..1ac75567 100644 --- a/README.md +++ b/README.md @@ -109,6 +109,14 @@ For further information or help, don't hesitate to get in touch on the [Slack `# +You can cite one of the main imputation methods ([`QUILT`](https://github.com/rwdavies/QUILT)) as follows: + +> **Rapid genotype imputation from sequence with reference panels.** +> +> Davies, R. W., Kucka, M., Su, D., Shi, S., Flanagan, M., Cunniff, C. M., Chan, Y. F., & Myers, S. +> +> _Nature genetics_ 2021 June 03. doi: [10.1038/s41588-021-00877-0](https://doi.org/10.1038/s41588-021-00877-0) + An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file. You can cite the `nf-core` publication as follows: diff --git a/conf/quilt_subworkflow.config b/conf/quilt_subworkflow.config new file mode 100644 index 00000000..6f237032 --- /dev/null +++ b/conf/quilt_subworkflow.config @@ -0,0 +1,135 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). 
+ ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. +---------------------------------------------------------------------------------------- +*/ + +process { + + withName: CUSTOM_DUMPSOFTWAREVERSIONS { + publishDir = [ + path: { "${params.outdir}/pipeline_info" }, + mode: params.publish_dir_mode, + pattern: '*_versions.yml' + ] + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:MAKE_CHUNKS:GLIMPSE_CHUNK' { + + ext.prefix = { "${meta.id}_${meta.chr}" } + + publishDir = [ + [ + path: { "${params.outdir}/quilt_impute/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}_chunk" }, + mode: params.publish_dir_mode, + ], + + + ] + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:MAKE_CHUNKS:BCFTOOLS_INDEX' { + cpus = 2 + memory = 400.MB + maxRetries = 2 + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:MAKE_CHUNKS:BCFTOOLS_INDEX_2' { + ext.args = '--tbi' + cpus = 2 + memory = 400.MB + maxRetries = 2 + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:MAKE_CHUNKS:BCFTOOLS_INDEX_3' { + ext.args = '--tbi' + cpus = 2 + memory = 400.MB + maxRetries = 2 + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:MAKE_CHUNKS:BCFTOOLS_VIEW' { + ext.args = '-v snps -Oz' + ext.prefix = { "${meta.id}_${meta.chr}_biallelic" } + cpus = 2 + memory = 400.MB + maxRetries = 2 + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:MAKE_CHUNKS:BCFTOOLS_NORM' { + ext.args = '-m +any --output-type z' + ext.prefix = { "${meta.id}_${meta.chr}_multiallelic" } + cpus = 2 + memory = 400.MB + maxRetries = 2 + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:MAKE_CHUNKS:BCFTOOLS_CONVERT' { + ext.args = '--haplegendsample test' + ext.prefix = { "${meta.id}_${meta.chr}_convert" } + cpus = 2 + memory = 400.MB + maxRetries = 2 + + publishDir = [ + [ + path: { "${params.outdir}/quilt_impute/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}/convert" }, + mode: params.publish_dir_mode, + ], + ] + } + + withName: 
'NFCORE_PHASEIMPUTE:PHASEIMPUTE:IMPUTE_QUILT:QUILT_QUILT' { + publishDir = [ + [ + path: { "${params.outdir}/quilt_impute/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, + mode: params.publish_dir_mode, + ], + ] + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:IMPUTE_QUILT:BCFTOOLS_INDEX' { + ext.args = {[ + "--tbi", + ].join(" ").trim()} + } + + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_CONCATENATE_BCFTOOLS:BCFTOOLS_CONCAT' { + ext.args = {[ + "--ligate", + "--output-type z", + ].join(" ").trim()} + + cpus = 2 + memory = 1.GB + maxRetries = 2 + + publishDir = [ + [ + path: { "${params.outdir}/quilt_impute/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}/concat" }, + mode: params.publish_dir_mode, + ], + ] + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_CONCATENATE_BCFTOOLS:BCFTOOLS_INDEX' { + ext.args = {[ + "--tbi", + ].join(" ").trim()} + + publishDir = [ + [ + path: { "${params.outdir}/quilt_impute/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}/concat" }, + mode: params.publish_dir_mode, + ], + ] + } + +} diff --git a/conf/test_quilt.config b/conf/test_quilt.config new file mode 100644 index 00000000..a7d04a00 --- /dev/null +++ b/conf/test_quilt.config @@ -0,0 +1,34 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/phaseimpute -profile test_quilt,docker --outdir ./results + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Minimal Quilt Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function using the tool QUILT' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '2.GB' + max_time = '1.h' + + // Input data + input = "${projectDir}/tests/csv/sample_bam.csv" + input_region = "${projectDir}/tests/csv/region.csv" + + // Genome references + fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/data/reference_genome/21_22/hs38DH.chr21_22.fa" + panel = "${projectDir}/tests/csv/panel.csv" + phased = true + + // Impute parameters + step = "impute" + tools = "quilt" +} diff --git a/docs/output.md b/docs/output.md index 3b82d32d..7c589a4a 100644 --- a/docs/output.md +++ b/docs/output.md @@ -12,37 +12,48 @@ The directories listed below will be created in the results directory after the -## Pipeline overview +## Pipeline overview: QUILT imputation mode -The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: -- [FastQC](#fastqc) - Raw read QC +- [Glimpse Chunk](#glimpse-chunk) - Create chunks of the reference panel +- [Remove Multiallelics](#multiallelics) - Remove multiallelic sites from the reference panel +- [Convert](#convert) - Convert reference panel to .hap and .legend files +- [QUILT](#quilt) - Perform imputation +- [Concatenate](#concatenate) - Concatenate all imputed chunks into a single VCF.
+- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution - [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution -### FastQC +### Glimpse Chunk -
-Output files +- `quilt_impute/glimpse_chunk/` + - `*.txt`: TXT file containing the chunks obtained from running Glimpse chunk. -- `fastqc/` - - `*_fastqc.html`: FastQC report containing quality metrics. - - `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images. +[Glimpse chunk](https://odelaneau.github.io/GLIMPSE/) defines the chunks in which imputation is run. For further reading and documentation see the [Glimpse documentation](https://odelaneau.github.io/GLIMPSE/glimpse1/commands.html). Once you have generated the chunks for your reference panel, you can skip the reference preparation step and directly submit this file for imputation. -
+### Convert + +- `quilt_impute/bcftools/convert/` + - `*.hap.gz`: a .hap file for the reference panel. + - `*.legend.gz`: a .legend file for the reference panel. + +[bcftools](https://samtools.github.io/bcftools/bcftools.html) converts the reference panel VCF files to .hap and .legend files. A .samples file is also generated. Once you have generated the hap and legend files for your reference panel, you can skip the reference preparation step and directly submit these files for imputation (to be developed). + +### QUILT -[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). +- `quilt_impute/quilt/` + - `quilt.*.vcf.gz`: Imputed VCF for a specific chunk. + - `quilt.*.vcf.gz.tbi`: Tabix index (TBI) of the imputed VCF for a specific chunk. -![MultiQC - FastQC sequence counts plot](images/mqc_fastqc_counts.png) +[quilt](https://github.com/rwdavies/QUILT) performs the imputation. This directory contains the imputed VCF for each of the chunks. -![MultiQC - FastQC mean quality scores plot](images/mqc_fastqc_quality.png) +### Concatenate -![MultiQC - FastQC adapter content plot](images/mqc_fastqc_adapter.png) +- `quilt_impute/bcftools/concat/` + - `.*.vcf.gz`: Imputed and ligated VCF for all the input samples. -:::note -The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality. -::: +[bcftools concat](https://samtools.github.io/bcftools/bcftools.html) produces a single VCF from the list of per-chunk imputed VCFs.
### MultiQC diff --git a/docs/usage.md b/docs/usage.md index ec9617d8..1fd90ac8 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -18,48 +18,73 @@ ## Samplesheet input -You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below. You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below. ```bash --input '[path to samplesheet file]' ``` -### Multiple runs of the same sample +### Structure + +The samplesheet can have as many columns as you desire, however, there is a strict requirement for at least 3 columns to match those defined in the table below. -The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 3 lanes: +A final samplesheet file may look something like the one below. This is for 6 samples. 
-```csv title="samplesheet.csv" -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz -CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz +```console +sample,bam,bai +SAMPLE1,AEG588A1.bam,AEG588A1.bai +SAMPLE2,AEG588A2.bam,AEG588A2.bai +SAMPLE3,AEG588A3.bam,AEG588A3.bai +SAMPLE4,AEG588A4.bam,AEG588A4.bai +SAMPLE5,AEG588A5.bam,AEG588A5.bai +SAMPLE6,AEG588A6.bam,AEG588A6.bai ``` -### Full samplesheet +| Column | Description | +| -------- | -------------------------------------------------------------------------------------------- | +| `sample` | Custom sample name. Spaces in sample names are automatically converted to underscores (`_`). | +| `bam` | Full path to a BAM file. File has to have the extension ".bam". | +| `bai` | Full path to the BAI index of the BAM file. File has to have the extension ".bai". | + +An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. -The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 3 columns to match those defined in the table below. +## Samplesheet reference panel -A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 6 samples, where `TREATMENT_REP3` has been sequenced twice. +You will need to create a samplesheet with information about the reference panel you would like to use. Use this parameter to specify its location. It has to be a comma-separated file with 2 columns, and a header row as shown in the examples below.
-```csv title="samplesheet.csv" -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP2,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz -CONTROL_REP3,AEG588A3_S3_L002_R1_001.fastq.gz,AEG588A3_S3_L002_R2_001.fastq.gz -TREATMENT_REP1,AEG588A4_S4_L003_R1_001.fastq.gz, -TREATMENT_REP2,AEG588A5_S5_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz, +```bash +--panel '[path to samplesheet file]' ``` -| Column | Description | -| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). | -| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | -| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | +### Structure -An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. +A final samplesheet file for the reference panel may look something like the one below. This is for 3 chromosomes. + +```console +chr,vcf +1,ALL.chr1.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz +2,ALL.chr2.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz +3,ALL.chr3.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz +``` + +| Column | Description | +| ------ | --------------------------------------------------------------------------------------------------------- | +| `chr` | Name of the chromosome. Use the prefix 'chr' if the panel uses the prefix. 
| +| `vcf` | Full path to a VCF file for that chromosome. File has to be gzipped and have the extension ".vcf.gz". | + +An [example samplesheet](../assets/samplesheet_reference.csv) has been provided with the pipeline. + +Remember to use the same reference genome for all the files. You can specify the [reference genome](https://nf-co.re/docs/usage/reference_genomes) using: + +```bash +--genome GRCh37 +``` + +or you can specify a custom genome using: + +```bash +--fasta Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa.gz +``` ## Running the pipeline @@ -67,7 +92,6 @@ The typical command for running the pipeline is as follows: ```bash nextflow run nf-core/phaseimpute --input ./samplesheet.csv --outdir ./results --genome GRCh37 -profile docker -nextflow run nf-core/phaseimpute --input ./samplesheet.csv --outdir ./results --genome GRCh37 -profile docker ``` This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. @@ -109,6 +133,18 @@ genome: 'GRCh37' You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch). +### Imputation modes + +You can choose different software to perform the imputation. + +#### QUILT + +The typical command for running the pipeline with this software is as follows: + +```bash +nextflow run nf-core/phaseimpute --input ./samplesheet.csv --panel ./samplesheet_reference.csv --step impute --tools quilt --outdir ./results --genome GRCh37 -profile docker +``` + ### Updating the pipeline When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since.
To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline: diff --git a/modules.json b/modules.json index bfc15ccf..708fac05 100644 --- a/modules.json +++ b/modules.json @@ -11,10 +11,26 @@ "installed_by": ["modules"], "patch": "modules/nf-core/bcftools/annotate/bcftools-annotate.diff" }, + "bcftools/concat": { + "branch": "master", + "git_sha": "44096c08ffdbc694f5f92ae174ea0f7ba0f37e09", + "installed_by": ["modules"], + "patch": "modules/nf-core/bcftools/concat/bcftools-concat.diff" + }, + "bcftools/convert": { + "branch": "master", + "git_sha": "44096c08ffdbc694f5f92ae174ea0f7ba0f37e09", + "installed_by": ["modules"] + }, "bcftools/index": { "branch": "master", "git_sha": "44096c08ffdbc694f5f92ae174ea0f7ba0f37e09", - "installed_by": ["multiple_impute_glimpse2", "vcf_impute_glimpse", "vcf_phase_shapeit5"] + "installed_by": [ + "modules", + "multiple_impute_glimpse2", + "vcf_impute_glimpse", + "vcf_phase_shapeit5" + ] }, "bcftools/mpileup": { "branch": "master", @@ -93,6 +109,11 @@ "git_sha": "b7ebe95761cd389603f9cc0e0dc384c0f663815a", "installed_by": ["modules"] }, + "quilt/quilt": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, "samtools/coverage": { "branch": "master", "git_sha": "38afbe42f7db7f19c7a89607c0a71c68f3be3131", diff --git a/modules/nf-core/bcftools/concat/bcftools-concat.diff b/modules/nf-core/bcftools/concat/bcftools-concat.diff new file mode 100644 index 00000000..256660aa --- /dev/null +++ b/modules/nf-core/bcftools/concat/bcftools-concat.diff @@ -0,0 +1,21 @@ +Changes in module 'nf-core/bcftools/concat' +--- modules/nf-core/bcftools/concat/main.nf ++++ modules/nf-core/bcftools/concat/main.nf +@@ -21,11 +21,14 @@ + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ ++ ++ ls -1v ${vcfs} > order_files.txt ++ + bcftools concat \\ + --output ${prefix}.vcf.gz \\ + $args \\ + 
--threads $task.cpus \\ +- ${vcfs} ++ -f order_files.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + +************************************************************ diff --git a/modules/nf-core/bcftools/concat/environment.yml b/modules/nf-core/bcftools/concat/environment.yml new file mode 100644 index 00000000..ff0200df --- /dev/null +++ b/modules/nf-core/bcftools/concat/environment.yml @@ -0,0 +1,7 @@ +name: bcftools_concat +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bcftools=1.18 diff --git a/modules/nf-core/bcftools/concat/main.nf b/modules/nf-core/bcftools/concat/main.nf new file mode 100644 index 00000000..e3281f46 --- /dev/null +++ b/modules/nf-core/bcftools/concat/main.nf @@ -0,0 +1,49 @@ +process BCFTOOLS_CONCAT { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bcftools:1.18--h8b25389_0': + 'biocontainers/bcftools:1.18--h8b25389_0' }" + + input: + tuple val(meta), path(vcfs), path(tbi) + + output: + tuple val(meta), path("*.gz"), emit: vcf + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + + ls -1v ${vcfs} > order_files.txt + + bcftools concat \\ + --output ${prefix}.vcf.gz \\ + $args \\ + --threads $task.cpus \\ + -f order_files.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.vcf.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') + END_VERSIONS + """ +} diff --git 
a/modules/nf-core/bcftools/concat/meta.yml b/modules/nf-core/bcftools/concat/meta.yml new file mode 100644 index 00000000..91cb54d5 --- /dev/null +++ b/modules/nf-core/bcftools/concat/meta.yml @@ -0,0 +1,51 @@ +name: bcftools_concat +description: Concatenate VCF files +keywords: + - variant calling + - concat + - bcftools + - VCF +tools: + - concat: + description: | + Concatenate VCF files. + homepage: http://samtools.github.io/bcftools/bcftools.html + documentation: http://www.htslib.org/doc/bcftools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - vcfs: + type: list + description: | + List containing 2 or more vcf files + e.g. [ 'file1.vcf', 'file2.vcf' ] + - tbi: + type: list + description: | + List containing 2 or more index files (optional) + e.g. [ 'file1.tbi', 'file2.tbi' ] +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - vcf: + type: file + description: VCF concatenated output file + pattern: "*.{vcf.gz}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@abhi18av" + - "@nvnieuwk" +maintainers: + - "@abhi18av" + - "@nvnieuwk" diff --git a/modules/nf-core/bcftools/concat/tests/main.nf.test b/modules/nf-core/bcftools/concat/tests/main.nf.test new file mode 100644 index 00000000..bf1a5f3f --- /dev/null +++ b/modules/nf-core/bcftools/concat/tests/main.nf.test @@ -0,0 +1,108 @@ +nextflow_process { + + name "Test Process BCFTOOLS_CONCAT" + script "../main.nf" + process "BCFTOOLS_CONCAT" + + tag "modules" + tag "modules_nfcore" + tag "bcftools" + tag "bcftools/concat" + + config "./nextflow.config" + + test("sarscov2 - [[vcf1, vcf2], [tbi1, tbi2]]") { + + when { + process { + """ + input[0] = [ + [ id:'test3' ], // meta map + [ + file(params.test_data['homo_sapiens']['illumina']['test_haplotc_cnn_vcf_gz'], checkIfExists: true), + file(params.test_data['homo_sapiens']['illumina']['test_genome_vcf_gz'], checkIfExists: true) + ], + [ + file(params.test_data['homo_sapiens']['illumina']['test_genome_vcf_gz_tbi'], checkIfExists: true), + file(params.test_data['homo_sapiens']['illumina']['test_haplotc_cnn_vcf_gz_tbi'], checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.vcf, + process.out.versions + ).match() } + ) + } + + } + + test("sarscov2 - [[vcf1, vcf2], []]") { + + when { + process { + """ + input[0] = [ + [ id:'test3' ], // meta map + [ + file(params.test_data['homo_sapiens']['illumina']['test_haplotc_cnn_vcf_gz'], checkIfExists: true), + file(params.test_data['homo_sapiens']['illumina']['test_genome_vcf_gz'], checkIfExists: true) + ], + [] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.vcf, + process.out.versions + ).match() } + ) + } + + } + + 
test("sarscov2 - [[vcf1, vcf2], [tbi1, tbi2]] - stub") { + + options "-stub" + when { + process { + """ + input[0] = [ + [ id:'test3' ], // meta map + [ + file(params.test_data['homo_sapiens']['illumina']['test_haplotc_cnn_vcf_gz'], checkIfExists: true), + file(params.test_data['homo_sapiens']['illumina']['test_genome_vcf_gz'], checkIfExists: true) + ], + [ + file(params.test_data['homo_sapiens']['illumina']['test_genome_vcf_gz_tbi'], checkIfExists: true), + file(params.test_data['homo_sapiens']['illumina']['test_haplotc_cnn_vcf_gz_tbi'], checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.vcf[0][1]).name, + process.out.versions + ).match() } + ) + } + + } + +} diff --git a/modules/nf-core/bcftools/concat/tests/main.nf.test.snap b/modules/nf-core/bcftools/concat/tests/main.nf.test.snap new file mode 100644 index 00000000..7344e6e3 --- /dev/null +++ b/modules/nf-core/bcftools/concat/tests/main.nf.test.snap @@ -0,0 +1,43 @@ +{ + "sarscov2 - [[vcf1, vcf2], []]": { + "content": [ + [ + [ + { + "id": "test3" + }, + "test3.vcf.gz:md5,4bcd0afd89f56c5d433f6b6abc44d0a6" + ] + ], + [ + "versions.yml:md5,24ae05eb858733b40fbd3f89743a6d09" + ] + ], + "timestamp": "2023-11-29T13:52:27.03724666" + }, + "sarscov2 - [[vcf1, vcf2], [tbi1, tbi2]]": { + "content": [ + [ + [ + { + "id": "test3" + }, + "test3.vcf.gz:md5,4bcd0afd89f56c5d433f6b6abc44d0a6" + ] + ], + [ + "versions.yml:md5,24ae05eb858733b40fbd3f89743a6d09" + ] + ], + "timestamp": "2023-11-29T13:52:21.468988293" + }, + "sarscov2 - [[vcf1, vcf2], [tbi1, tbi2]] - stub": { + "content": [ + "test3.vcf.gz", + [ + "versions.yml:md5,24ae05eb858733b40fbd3f89743a6d09" + ] + ], + "timestamp": "2023-11-29T13:41:04.716017811" + } +} \ No newline at end of file diff --git a/modules/nf-core/bcftools/concat/tests/nextflow.config b/modules/nf-core/bcftools/concat/tests/nextflow.config new file mode 100644 index 00000000..f3e1e98c --- /dev/null +++ 
b/modules/nf-core/bcftools/concat/tests/nextflow.config @@ -0,0 +1,3 @@ +process { + ext.args = "--no-version" +} \ No newline at end of file diff --git a/modules/nf-core/bcftools/concat/tests/tags.yml b/modules/nf-core/bcftools/concat/tests/tags.yml new file mode 100644 index 00000000..21710d4e --- /dev/null +++ b/modules/nf-core/bcftools/concat/tests/tags.yml @@ -0,0 +1,2 @@ +bcftools/concat: + - "modules/nf-core/bcftools/concat/**" diff --git a/modules/nf-core/bcftools/convert/environment.yml b/modules/nf-core/bcftools/convert/environment.yml new file mode 100644 index 00000000..53e12e07 --- /dev/null +++ b/modules/nf-core/bcftools/convert/environment.yml @@ -0,0 +1,7 @@ +name: bcftools_convert +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bcftools=1.18 diff --git a/modules/nf-core/bcftools/convert/main.nf b/modules/nf-core/bcftools/convert/main.nf new file mode 100644 index 00000000..c01c2b21 --- /dev/null +++ b/modules/nf-core/bcftools/convert/main.nf @@ -0,0 +1,73 @@ +process BCFTOOLS_CONVERT { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/bcftools:1.18--h8b25389_0': + 'biocontainers/bcftools:1.18--h8b25389_0' }" + + input: + tuple val(meta), path(input), path(input_index) + tuple val(meta2), path(fasta) + path(bed) + + output: + tuple val(meta), path("*.vcf.gz"), optional:true , emit: vcf_gz + tuple val(meta), path("*.vcf") , optional:true , emit: vcf + tuple val(meta), path("*.bcf.gz"), optional:true , emit: bcf_gz + tuple val(meta), path("*.bcf") , optional:true , emit: bcf + tuple val(meta), path("*.hap.gz"), optional:true , emit: hap + tuple val(meta), path("*.legend.gz"), optional:true , emit: legend + tuple val(meta), path("*.samples"), optional:true , emit: samples + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + def regions = bed ? "--regions-file $bed" : "" + def reference = fasta ? "--fasta-ref $fasta" : "" + def extension = args.contains("--output-type b") || args.contains("-Ob") ? "--output ${prefix}.bcf.gz" : + args.contains("--output-type u") || args.contains("-Ou") ? "--output ${prefix}.bcf" : + args.contains("--output-type z") || args.contains("-Oz") ? "--output ${prefix}.vcf.gz" : + args.contains("--output-type v") || args.contains("-Ov") ? "--output ${prefix}.vcf" : + args.contains("--haplegendsample") || args.contains("-h") ? "" : + "--output ${prefix}.vcf.gz" + + """ + bcftools convert \\ + $args \\ + $regions \\ + $extension \\ + --threads $task.cpus \\ + $reference \\ + $input + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + def extension = args.contains("--output-type b") || args.contains("-Ob") ? "bcf.gz" : + args.contains("--output-type u") || args.contains("-Ou") ? 
"bcf" : + args.contains("--output-type z") || args.contains("-Oz") ? "vcf.gz" : + args.contains("--output-type v") || args.contains("-Ov") ? "vcf" : + "vcf.gz" + """ + touch ${prefix}.${extension} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/bcftools/convert/meta.yml b/modules/nf-core/bcftools/convert/meta.yml new file mode 100644 index 00000000..2c89112f --- /dev/null +++ b/modules/nf-core/bcftools/convert/meta.yml @@ -0,0 +1,94 @@ +name: "bcftools_convert" +description: Converts certain output formats to VCF +keywords: + - bcftools + - convert + - vcf + - gvcf +tools: + - "bcftools": + description: "BCFtools is a set of utilities that manipulate variant calls in the Variant Call Format (VCF) and its binary counterpart BCF. All commands work transparently with both VCFs and BCFs, both uncompressed and BGZF-compressed. Most commands accept VCF, bgzipped VCF and BCF with filetype detected automatically even when streaming from a pipe. Indexed VCF and BCF will work in all situations. Un-indexed VCF and BCF and streams will work in most, but not all situations." + homepage: "https://samtools.github.io/bcftools/bcftools.html" + documentation: "https://samtools.github.io/bcftools/bcftools.html#convert" + tool_dev_url: "https://github.com/samtools/bcftools" + doi: "10.1093/gigascience/giab008" + licence: ["GPL"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: | + The input format. 
Each format needs a separate parameter to be specified in the `args`: + - GEN/SAMPLE file: `--gensample2vcf` + - gVCF file: `--gvcf2vcf` + - HAP/SAMPLE file: `--hapsample2vcf` + - HAP/LEGEND/SAMPLE file: `--haplegendsample2vcf` + - TSV file: `--tsv2vcf` + pattern: "*.{gen,sample,g.vcf,hap,legend}{.gz,}" + - input_index: + type: file + description: (Optional) The index for the input files, if needed + pattern: "*.bed" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fasta: + type: file + description: (Optional) The reference fasta, only needed for gVCF conversion + pattern: "*.{fa,fasta}" + - bed: + type: file + description: (Optional) The BED file containing the regions for the VCF file + pattern: "*.bed" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - vcf_gz: + type: file + description: VCF merged output file (bgzipped) => when `--output-type z` is used + pattern: "*.vcf.gz" + - vcf: + type: file + description: VCF merged output file => when `--output-type v` is used + pattern: "*.vcf" + - bcf_gz: + type: file + description: BCF merged output file (bgzipped) => when `--output-type b` is used + pattern: "*.bcf.gz" + - bcf: + type: file + description: BCF merged output file => when `--output-type u` is used + pattern: "*.bcf" + - hap: + type: file + description: hap format used by IMPUTE2 and SHAPEIT + pattern: "*.hap.gz" + - legend: + type: file + description: legend format used by IMPUTE2 and SHAPEIT + pattern: "*.legend.gz" + - sample: + type: file + description: sample format used by IMPUTE2 and SHAPEIT + pattern: "*.samples" +authors: + - "@nvnieuwk" + - "@ramprasadn" + - "@atrigila" +maintainers: + - "@nvnieuwk" + - "@ramprasadn" + - "@atrigila" diff --git a/modules/nf-core/quilt/quilt/environment.yml
b/modules/nf-core/quilt/quilt/environment.yml new file mode 100644 index 00000000..9872e819 --- /dev/null +++ b/modules/nf-core/quilt/quilt/environment.yml @@ -0,0 +1,7 @@ +name: quilt_quilt +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::r-quilt=1.0.5 diff --git a/modules/nf-core/quilt/quilt/main.nf b/modules/nf-core/quilt/quilt/main.nf new file mode 100644 index 00000000..3068ba7c --- /dev/null +++ b/modules/nf-core/quilt/quilt/main.nf @@ -0,0 +1,76 @@
+process QUILT_QUILT {
+    tag "$meta.id"
+    label 'process_single'
+
+    // Software environment: r-quilt 1.0.5 from conda, a Singularity image or a Docker image
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/r-quilt:1.0.5--r43h06b5641_0':
+        'biocontainers/r-quilt:1.0.5--r43h06b5641_0' }"
+
+    input:
+    tuple val(meta), path(bams), path(bais), path(bamlist), path(reference_haplotype_file), path(reference_legend_file), val(chr), val(regions_start), val(regions_end), val(ngen), val(buffer), path(genetic_map_file)
+    tuple val(meta2), path(posfile), path(phasefile)
+    tuple val(meta3), path(fasta)
+
+    output:
+    tuple val(meta), path("*.vcf.gz"), emit: vcf
+    tuple val(meta), path("*.vcf.gz.tbi"), emit: tbi, optional:true
+    tuple val(meta), path("RData", type: "dir"), emit: rdata, optional:true
+    tuple val(meta), path("plots", type: "dir"), emit: plots, optional:true
+    path "versions.yml", emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+
+    // Distinct file extensions found among the staged alignment files
+    def input_types = bams.collect { it.extension }.flatten().unique()
+
+    // Pick the QUILT list option matching the input type; it stays empty for
+    // anything other than an all-BAM or an all-CRAM input
+    def list_command = ""
+    if (input_types == ["bam"]) {
+        list_command = "--bamlist=${bamlist}"
+    } else if (input_types == ["cram"]) {
+        list_command = "--cramlist=${bamlist} --reference=${fasta}"
+    }
+
+    // Optional files only contribute their flag when a file was supplied
+    def genetic_map_file_command = genetic_map_file ? "--genetic_map_file=${genetic_map_file}" : ""
+    def posfile_command = posfile ? "--posfile=${posfile}" : ""
+    def phasefile_command = phasefile ? "--phasefile=${phasefile}" : ""
+
+    // Append --seed=1 unless the caller-supplied args already mention --seed
+    args = (args ==~ /.*--seed.*/) ? args : args + " --seed=1"
+
+    """
+
+
+    QUILT.R \\
+        $list_command \\
+        $genetic_map_file_command \\
+        $posfile_command \\
+        $phasefile_command \\
+        --chr=$chr \\
+        --regionStart=$regions_start \\
+        --regionEnd=$regions_end \\
+        --nGen=$ngen \\
+        --buffer=$buffer \\
+        --nCores=$task.cpus \\
+        --outputdir="." \\
+        --reference_haplotype_file=$reference_haplotype_file \\
+        --reference_legend_file=$reference_legend_file \\
+        $args
+
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        r-base: \$(Rscript -e "cat(strsplit(R.version[['version.string']], ' ')[[1]][3])")
+        r-quilt: \$(Rscript -e "cat(as.character(utils::packageVersion(\\"QUILT\\")))")
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/quilt/quilt/meta.yml b/modules/nf-core/quilt/quilt/meta.yml new file mode 100644 index 00000000..34c67a79 --- /dev/null +++ b/modules/nf-core/quilt/quilt/meta.yml @@ -0,0 +1,108 @@ +name: "quilt_quilt" +description: QUILT is an R and C++ program for rapid genotype imputation from low-coverage sequence using a large reference panel. +keywords: + - imputation + - low-coverage + - genotype + - genomics + - vcf +tools: + - "quilt": + description: "Read aware low coverage whole genome sequence imputation from a reference panel" + homepage: "https://github.com/rwdavies/quilt" + documentation: "https://github.com/rwdavies/quilt" + tool_dev_url: "https://github.com/rwdavies/quilt" + doi: "10.1038/s41588-021-00877-0" + licence: "['GPL v3']" +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bams: + type: file + description: (Mandatory) BAM/CRAM files + pattern: "*.{bam,cram,sam}" + - bais: + type: file + description: (Mandatory) BAM/CRAM index files + pattern: "*.{bai}" + - bamlist: + type: file + description: (Mandatory) "Path to file with bam file locations.
File is one row per entry, path to bam files. Bam index files should exist in the same directory as each bam, suffixed either .bam.bai or .bai. + pattern: "*.{txt}" + - reference_haplotype_file: + type: file + description: (Mandatory) Reference haplotype file in IMPUTE format (file with no header and no rownames, one row per SNP, one column per reference haplotype, space separated, values must be 0 or 1) + pattern: "*.{hap.gz}" + - reference_legend_file: + type: file + description: (Mandatory) Reference haplotype legend file in IMPUTE format (file with one row per SNP, and a header including position for the physical position in 1 based coordinates, a0 for the reference allele, and a1 for the alternate allele). + pattern: "*.{legend.gz}" + - chr: + type: string + description: (Mandatory) What chromosome to run. Should match BAM headers. + - regions_start: + type: integer + description: (Mandatory) When running imputation, where to start from. The 1-based position x is kept if regionStart <= x <= regionEnd. + - regions_end: + type: integer + description: (Mandatory) When running imputation, where to stop. + - buffer: + type: integer + description: Buffer of region to perform imputation over. So imputation is run from regionStart-buffer to regionEnd+buffer, and reported for regionStart to regionEnd, including the bases of regionStart and regionEnd. + - ngen: + type: integer + description: Number of generations since founding or mixing. Note that the algorithm is relatively robust to this. Use nGen = 4 * Ne / K if unsure. + - genetic_map_file: + type: file + description: (Optional) File with genetic map information, a file with 3 white-space delimited entries giving position (1-based), genetic rate map in cM/Mbp, and genetic map in cM. If no file included, rate is based on physical distance and expected rate (expRate). + pattern: "*.{txt.gz}" + - meta2: + type: map + description: | + Groovy Map containing sample information + e.g.
[ id:'test', single_end:false ] + - posfile: + type: file + description: (Optional) File with positions of where to impute, lining up one-to-one with genfile. File is tab separated with no header, one row per SNP, with col 1 = chromosome, col 2 = physical position (sorted from smallest to largest), col 3 = reference base, col 4 = alternate base. Bases are capitalized. + pattern: "*.{txt}" + - phasefile: + type: file + description: (Optional) File with truth phasing results. Supersedes genfile if both options given. File has a header row with a name for each sample, matching what is found in the bam file. Each subject is then a tab separated column, with 0 = ref and 1 = alt, separated by a vertical bar |, e.g. 0|0 or 0|1. Note therefore this file has one more row than posfile which has no header. + pattern: "*.{txt}" + - meta3: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: (Optional) File with reference genome. + pattern: "*.{txt.gz}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - vcf: + type: file + description: VCF file with both SNP annotation information and per-sample genotype information. + pattern: "*.{vcf.gz}" + - tbi: + type: file + description: TBI file of the VCF. + pattern: "*.{vcf.gz.tbi}" + - RData: + type: directory + description: Optional directory path to prepared RData file with reference objects (useful with --save_prepared_reference=TRUE).
+authors: + - "@atrigila" +maintainers: + - "@atrigila" diff --git a/nextflow.config b/nextflow.config index ba21f779..b459cd67 100644 --- a/nextflow.config +++ b/nextflow.config @@ -42,6 +42,10 @@ params { depth = 1 genotype = null + // QUILT + ngen = 100 + buffer = 10000 + // Boilerplate options outdir = null publish_dir_mode = 'copy' @@ -189,9 +193,10 @@ profiles { executor.cpus = 4 executor.memory = 8.GB } - test { includeConfig 'conf/test.config' } - test_full { includeConfig 'conf/test_full.config' } - test_sim { includeConfig 'conf/test_sim.config' } + test { includeConfig 'conf/test.config' } + test_full { includeConfig 'conf/test_full.config' } + test_sim { includeConfig 'conf/test_sim.config' } + test_quilt { includeConfig 'conf/test_quilt.config' } } // Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile @@ -261,6 +266,7 @@ manifest { // Load modules.config for DSL2 module specific options includeConfig 'conf/modules.config' +includeConfig 'conf/quilt_subworkflow.config' // Function to ensure that resource requirements don't go beyond // a maximum limit diff --git a/nextflow_schema.json b/nextflow_schema.json index b1552439..afe40f62 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -385,5 +385,15 @@ { "$ref": "#/definitions/generic_options" } - ] + ], + "properties": { + "ngen": { + "type": "integer", + "default": 100 + }, + "buffer": { + "type": "integer", + "default": 10000 + } + } } diff --git a/subworkflows/local/impute_quilt/impute_quilt.nf b/subworkflows/local/impute_quilt/impute_quilt.nf new file mode 100644 index 00000000..decc61f2 --- /dev/null +++ b/subworkflows/local/impute_quilt/impute_quilt.nf @@ -0,0 +1,62 @@
+include { QUILT_QUILT } from '../../../modules/nf-core/quilt/quilt/main'
+include { BCFTOOLS_INDEX } from '../../../modules/nf-core/bcftools/index/main'
+
+//
+// Run QUILT on the input alignments for every reference-panel chunk and index the imputed VCFs
+//
+workflow IMPUTE_QUILT {
+
+    take:
+    ch_hap_legend // channel: [ val(meta), hap, legend ]
+    ch_input      // channel: [ val(meta), bam, bai, bamlist ]
+    ch_chunks     // channel: [ val(meta), chr, start, end ]
+
+    main:
+
+    ch_versions = Channel.empty()
+
+    // Optional QUILT inputs are not wired to parameters here: empty placeholders are passed
+    posfile = []
+    phasefile = []
+    posfile_phasefile = [[id: null], posfile, phasefile]
+    genetic_map_file = []
+    fasta = [[id:'test'], []]
+
+    ngen = params.ngen
+    buffer = params.buffer
+
+    ch_bam_bamlist = ch_input
+
+    if (genetic_map_file.isEmpty()) {
+        // Pair hap/legend with the chunks sharing the same meta, then append ngen, buffer and an empty genetic map
+        ch_hap_chunks = ch_hap_legend.combine(ch_chunks, by:0).map { it + ngen + buffer + [[]] }
+    } else {
+        // Add ngen and buffer + genetic map file (untested; unreachable while genetic_map_file is hard-coded to [])
+        ch_hap_chunks = ch_hap_legend.join(ch_chunks, by:0).join(genetic_map_file)
+    }
+
+    // Cross the input with every chunk, then drop the fifth element (the meta coming from ch_hap_chunks)
+    ch_quilt = ch_bam_bamlist.combine(ch_hap_chunks)
+    ch_quilt_input = ch_quilt.map { it.take(4) + it.drop(5) }
+
+    // Add metamap with chromosome information
+    ch_quilt_input = ch_quilt_input
+        .map{ meta, bam, bai, bamlist, hap, legend, chr, start, end, ngen2, buffer2, genetic ->
+            return [['id': meta.id, 'chr': chr] , bam, bai, bamlist, hap, legend, chr, start, end, ngen2, buffer2, genetic]
+        }
+
+    // Run QUILT
+    QUILT_QUILT ( ch_quilt_input, posfile_phasefile, fasta )
+    ch_versions = ch_versions.mix(QUILT_QUILT.out.versions.first())
+
+    // Index imputed VCF
+    BCFTOOLS_INDEX(QUILT_QUILT.out.vcf)
+    ch_versions = ch_versions.mix(BCFTOOLS_INDEX.out.versions.first())
+
+    // Join VCFs and TBIs
+    ch_vcf_tbi = QUILT_QUILT.out.vcf.join(BCFTOOLS_INDEX.out.tbi)
+
+    emit:
+    ch_vcf_tbi             // channel: [ meta, vcf, tbi ]
+    versions = ch_versions // channel: [ versions.yml ]
+}
diff --git a/subworkflows/local/make_chunks/make_chunks.nf b/subworkflows/local/make_chunks/make_chunks.nf new file mode 100644 index 00000000..c0fe7924 --- /dev/null +++ b/subworkflows/local/make_chunks/make_chunks.nf @@ -0,0 +1,67 @@ +include { BCFTOOLS_INDEX } from '../../../modules/nf-core/bcftools/index/main' +include { BCFTOOLS_INDEX as BCFTOOLS_INDEX_2} from '../../../modules/nf-core/bcftools/index/main' +include { BCFTOOLS_INDEX as BCFTOOLS_INDEX_3} from '../../../modules/nf-core/bcftools/index/main' +include { GLIMPSE_CHUNK } from '../../../modules/nf-core/glimpse/chunk/main' +include { BCFTOOLS_CONVERT }
from '../../../modules/nf-core/bcftools/convert/main'
+include { BCFTOOLS_NORM } from '../../../modules/nf-core/bcftools/norm/main'
+include { BCFTOOLS_VIEW } from '../../../modules/nf-core/bcftools/view/main'
+
+// Split the reference panel into chunks and convert it to biallelic hap/legend files
+workflow MAKE_CHUNKS {
+
+    take:
+    ch_reference // channel: [ val(meta), vcf, csi ]
+    ch_fasta_fai // channel: [ val(meta), fasta, fai ]
+
+    main:
+
+    ch_versions = Channel.empty()
+
+    // Make chunks
+    ch_vcf_csi_chr = ch_reference.map{meta, vcf, csi -> [meta, vcf, csi, meta.chr]}
+    GLIMPSE_CHUNK(ch_vcf_csi_chr)
+
+    // Rearrange chunks into channel
+    ch_chunks = GLIMPSE_CHUNK.out.chunk_chr
+        .splitText()
+        .map { metamap, line ->
+            def fields = line.split("\t")
+            def startEnd = fields[2].split(':')[1].split('-')
+            [metamap, metamap.chr, startEnd[0], startEnd[1]]
+        }
+
+    ch_fasta = ch_fasta_fai.map { meta, fasta, fai -> [meta, fasta] }
+
+    // Join duplicated biallelic sites into multiallelic records
+    BCFTOOLS_NORM(ch_reference, ch_fasta)
+
+    // Index multiallelic VCF
+    BCFTOOLS_INDEX_2(BCFTOOLS_NORM.out.vcf)
+
+    // Join multiallelic VCF and TBI
+    ch_multiallelic_vcf_tbi = BCFTOOLS_NORM.out.vcf.join(BCFTOOLS_INDEX_2.out.tbi)
+
+    // Remove all multiallelic records:
+    BCFTOOLS_VIEW(ch_multiallelic_vcf_tbi, [], [], [])
+
+    // Index biallelic VCF
+    BCFTOOLS_INDEX_3(BCFTOOLS_VIEW.out.vcf)
+
+    // Join biallelic VCF and TBI
+    ch_biallelic_vcf_tbi = BCFTOOLS_VIEW.out.vcf.join(BCFTOOLS_INDEX_3.out.tbi)
+
+    // Convert VCF to Hap and Legend files
+    BCFTOOLS_CONVERT(ch_biallelic_vcf_tbi, ch_fasta, [])
+
+    // Output hap and legend files
+    ch_hap_legend = BCFTOOLS_CONVERT.out.hap.join(BCFTOOLS_CONVERT.out.legend)
+
+    // Collect one versions.yml per module so the emitted versions channel is no longer empty
+    ch_versions = ch_versions.mix(GLIMPSE_CHUNK.out.versions.first(), BCFTOOLS_NORM.out.versions.first(), BCFTOOLS_VIEW.out.versions.first(),
+        BCFTOOLS_INDEX_2.out.versions.first(), BCFTOOLS_INDEX_3.out.versions.first(), BCFTOOLS_CONVERT.out.versions.first())
+
+    emit:
+    ch_chunks = ch_chunks // channel: [ val(meta), chr, start, end ]
+    ch_hap_legend = ch_hap_legend // channel: [ val(meta), hap, legend ]
+    versions = ch_versions // channel: [ versions.yml ]
+}
diff --git a/subworkflows/local/utils_nfcore_phaseimpute_pipeline/main.nf b/subworkflows/local/utils_nfcore_phaseimpute_pipeline/main.nf index
4ce4b840..bbfbf258 100644 --- a/subworkflows/local/utils_nfcore_phaseimpute_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_phaseimpute_pipeline/main.nf @@ -134,21 +134,21 @@ workflow PIPELINE_INITIALISATION { // // Create channel from region input // - if (params.input_region.endsWith(".csv")) { + if (params.input_region == null){ + // #TODO Add support for string input + GET_REGION ( + "all", + ch_ref_gen + ) + ch_versions = ch_versions.mix(GET_REGION.out.versions) + ch_regions = GET_REGION.out.regions + } else if (params.input_region.endsWith(".csv")) { println "Region file provided as input is a csv file" ch_regions = Channel.fromSamplesheet("input_region") .map{ chr, start, end -> [["chr": chr], chr + ":" + start + "-" + end]} .map{ metaC, region -> [metaC + ["region": region], region]} } else { error "Region file provided is of another format than CSV (not yet supported). Please separate your reference genome by chromosome and use the samplesheet format." - /* #TODO Wait for `oneOf()` to be supported in the nextflow_schema.json - GET_REGION ( - params.input_region, - ch_ref_gen - ) - ch_versions = ch_versions.mix(GET_REGION.out.versions.first()) - ch_regions = GET_REGION.out.regions - */ } // diff --git a/subworkflows/local/vcf_concatenate_bcftools/vcf_concatenate_bcftools.nf b/subworkflows/local/vcf_concatenate_bcftools/vcf_concatenate_bcftools.nf new file mode 100644 index 00000000..6653d765 --- /dev/null +++ b/subworkflows/local/vcf_concatenate_bcftools/vcf_concatenate_bcftools.nf @@ -0,0 +1,38 @@
+include { BCFTOOLS_CONCAT } from '../../../modules/nf-core/bcftools/concat/main'
+include { BCFTOOLS_INDEX } from '../../../modules/nf-core/bcftools/index/main'
+
+//
+// Concatenate the imputed VCFs that share a meta id into a single VCF and index it
+//
+workflow VCF_CONCATENATE_BCFTOOLS {
+
+    take:
+    ch_vcf_tbi // channel: [ val(meta), vcf, tbi ]
+
+    main:
+
+    ch_versions = Channel.empty()
+
+    // Remove chromosome from meta so that every file of one id gets the same grouping key
+    ch_vcf_tbi_grouped = ch_vcf_tbi.map{ meta, vcf, tbi ->
+        return [['id' : meta.id], vcf, tbi]
+    }
+    // Group by ID
+    ch_vcf_tbi_grouped = ch_vcf_tbi_grouped.groupTuple( by:[0] )
+
+    // Ligate and concatenate chunks
+    BCFTOOLS_CONCAT(ch_vcf_tbi_grouped)
+    ch_versions = ch_versions.mix(BCFTOOLS_CONCAT.out.versions.first())
+
+    // Index concatenated VCF
+    BCFTOOLS_INDEX(BCFTOOLS_CONCAT.out.vcf)
+    ch_versions = ch_versions.mix(BCFTOOLS_INDEX.out.versions.first())
+
+    // Join VCFs and TBIs
+    ch_imputed_vcf_tbi = BCFTOOLS_CONCAT.out.vcf.join(BCFTOOLS_INDEX.out.tbi)
+
+    emit:
+    ch_imputed_vcf_tbi     // channel: [ meta, vcf, tbi ]
+    versions = ch_versions // channel: [ versions.yml ]
+
+}
diff --git a/tests/csv/panel_full.csv b/tests/csv/panel_full.csv index c86f442b..782b4a78 100644 --- a/tests/csv/panel_full.csv +++ b/tests/csv/panel_full.csv @@ -1,23 +1,23 @@ panel,chr,vcf,index -1000G_phased,chr1,s3://1000genomes/release/20130502/ALL.chr1.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz,s3://1000genomes/release/20130502/ALL.chr1.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi -1000G_phased,chr2,s3://1000genomes/release/20130502/ALL.chr2.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz,s3://1000genomes/release/20130502/ALL.chr2.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi -1000G_phased,chr3,s3://1000genomes/release/20130502/ALL.chr3.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz,s3://1000genomes/release/20130502/ALL.chr3.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi -1000G_phased,chr4,s3://1000genomes/release/20130502/ALL.chr4.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz,s3://1000genomes/release/20130502/ALL.chr4.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi -1000G_phased,chr5,s3://1000genomes/release/20130502/ALL.chr5.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz,s3://1000genomes/release/20130502/ALL.chr5.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi -1000G_phased,chr6,s3://1000genomes/release/20130502/ALL.chr6.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz,s3://1000genomes/release/20130502/ALL.chr6.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi
-1000G_phased,chr7,s3://1000genomes/release/20130502/ALL.chr7.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz,s3://1000genomes/release/20130502/ALL.chr7.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi -1000G_phased,chr8,s3://1000genomes/release/20130502/ALL.chr8.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz,s3://1000genomes/release/20130502/ALL.chr8.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi -1000G_phased,chr9,s3://1000genomes/release/20130502/ALL.chr9.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz,s3://1000genomes/release/20130502/ALL.chr9.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi -1000G_phased,chr10,s3://1000genomes/release/20130502/ALL.chr10.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz,s3://1000genomes/release/20130502/ALL.chr10.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi -1000G_phased,chr11,s3://1000genomes/release/20130502/ALL.chr11.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz,s3://1000genomes/release/20130502/ALL.chr11.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi -1000G_phased,chr12,s3://1000genomes/release/20130502/ALL.chr12.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz,s3://1000genomes/release/20130502/ALL.chr12.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi -1000G_phased,chr13,s3://1000genomes/release/20130502/ALL.chr13.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz,s3://1000genomes/release/20130502/ALL.chr13.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi -1000G_phased,chr14,s3://1000genomes/release/20130502/ALL.chr14.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz,s3://1000genomes/release/20130502/ALL.chr14.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi 
-1000G_phased,chr15,s3://1000genomes/release/20130502/ALL.chr15.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz,s3://1000genomes/release/20130502/ALL.chr15.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi -1000G_phased,chr16,s3://1000genomes/release/20130502/ALL.chr16.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz,s3://1000genomes/release/20130502/ALL.chr16.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi -1000G_phased,chr17,s3://1000genomes/release/20130502/ALL.chr17.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz,s3://1000genomes/release/20130502/ALL.chr17.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi -1000G_phased,chr18,s3://1000genomes/release/20130502/ALL.chr18.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz,s3://1000genomes/release/20130502/ALL.chr18.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi -1000G_phased,chr19,s3://1000genomes/release/20130502/ALL.chr19.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz,s3://1000genomes/release/20130502/ALL.chr19.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi -1000G_phased,chr20,s3://1000genomes/release/20130502/ALL.chr20.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz,s3://1000genomes/release/20130502/ALL.chr20.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi -1000G_phased,chr21,s3://1000genomes/release/20130502/ALL.chr21.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz,s3://1000genomes/release/20130502/ALL.chr21.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi -1000G_phased,chr22,s3://1000genomes/release/20130502/ALL.chr22.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz,s3://1000genomes/release/20130502/ALL.chr22.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi 
+1000GP.s.norel,chr1,http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr1.filtered.shapeit2-duohmm-phased.vcf.gz,http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr1.filtered.shapeit2-duohmm-phased.vcf.gz.tbi +1000GP.s.norel,chr2,http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr2.filtered.shapeit2-duohmm-phased.vcf.gz,http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr2.filtered.shapeit2-duohmm-phased.vcf.gz.tbi +1000GP.s.norel,chr3,http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr3.filtered.shapeit2-duohmm-phased.vcf.gz,http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr3.filtered.shapeit2-duohmm-phased.vcf.gz.tbi +1000GP.s.norel,chr4,http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr4.filtered.shapeit2-duohmm-phased.vcf.gz,http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr4.filtered.shapeit2-duohmm-phased.vcf.gz.tbi 
+1000GP.s.norel,chr5,http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr5.filtered.shapeit2-duohmm-phased.vcf.gz,http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr5.filtered.shapeit2-duohmm-phased.vcf.gz.tbi +1000GP.s.norel,chr6,http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr6.filtered.shapeit2-duohmm-phased.vcf.gz,http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr6.filtered.shapeit2-duohmm-phased.vcf.gz.tbi +1000GP.s.norel,chr7,http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr7.filtered.shapeit2-duohmm-phased.vcf.gz,http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr7.filtered.shapeit2-duohmm-phased.vcf.gz.tbi +1000GP.s.norel,chr8,http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr8.filtered.shapeit2-duohmm-phased.vcf.gz,http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr8.filtered.shapeit2-duohmm-phased.vcf.gz.tbi 
+1000GP.s.norel,chr9,http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr9.filtered.shapeit2-duohmm-phased.vcf.gz,http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr9.filtered.shapeit2-duohmm-phased.vcf.gz.tbi +1000GP.s.norel,chr10,http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr10.filtered.shapeit2-duohmm-phased.vcf.gz,http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr10.filtered.shapeit2-duohmm-phased.vcf.gz.tbi +1000GP.s.norel,chr11,http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr11.filtered.shapeit2-duohmm-phased.vcf.gz,http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr11.filtered.shapeit2-duohmm-phased.vcf.gz.tbi +1000GP.s.norel,chr12,http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr12.filtered.shapeit2-duohmm-phased.vcf.gz,http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr12.filtered.shapeit2-duohmm-phased.vcf.gz.tbi 
+1000GP.s.norel,chr13,http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr13.filtered.shapeit2-duohmm-phased.vcf.gz,http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr13.filtered.shapeit2-duohmm-phased.vcf.gz.tbi +1000GP.s.norel,chr14,http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr14.filtered.shapeit2-duohmm-phased.vcf.gz,http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr14.filtered.shapeit2-duohmm-phased.vcf.gz.tbi +1000GP.s.norel,chr15,http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr15.filtered.shapeit2-duohmm-phased.vcf.gz,http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr15.filtered.shapeit2-duohmm-phased.vcf.gz.tbi +1000GP.s.norel,chr16,http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr16.filtered.shapeit2-duohmm-phased.vcf.gz,http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr16.filtered.shapeit2-duohmm-phased.vcf.gz.tbi 
+1000GP.s.norel,chr17,http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr17.filtered.shapeit2-duohmm-phased.vcf.gz,http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr17.filtered.shapeit2-duohmm-phased.vcf.gz.tbi +1000GP.s.norel,chr18,http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr18.filtered.shapeit2-duohmm-phased.vcf.gz,http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr18.filtered.shapeit2-duohmm-phased.vcf.gz.tbi +1000GP.s.norel,chr19,http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr19.filtered.shapeit2-duohmm-phased.vcf.gz,http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr19.filtered.shapeit2-duohmm-phased.vcf.gz.tbi +1000GP.s.norel,chr20,http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr20.filtered.shapeit2-duohmm-phased.vcf.gz,http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr20.filtered.shapeit2-duohmm-phased.vcf.gz.tbi 
+1000GP.s.norel,chr21,http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr21.filtered.shapeit2-duohmm-phased.vcf.gz,http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr21.filtered.shapeit2-duohmm-phased.vcf.gz.tbi +1000GP.s.norel,chr22,http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr22.filtered.shapeit2-duohmm-phased.vcf.gz,http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr22.filtered.shapeit2-duohmm-phased.vcf.gz.tbi diff --git a/workflows/phaseimpute/main.nf b/workflows/phaseimpute/main.nf index 8535f860..d07a1b8b 100644 --- a/workflows/phaseimpute/main.nf +++ b/workflows/phaseimpute/main.nf @@ -25,6 +25,11 @@ include { VCF_IMPUTE_GLIMPSE } from '../../subworkflows/nf-core/vcf_imp include { VCF_CHR_CHECK } from '../../subworkflows/local/vcf_chr_check' include { GET_PANEL } from '../../subworkflows/local/get_panel' + +include { MAKE_CHUNKS } from '../../subworkflows/local/make_chunks/make_chunks' +include { IMPUTE_QUILT } from '../../subworkflows/local/impute_quilt/impute_quilt' +include { VCF_CONCATENATE_BCFTOOLS } from '../../subworkflows/local/vcf_concatenate_bcftools/vcf_concatenate_bcftools' + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ RUN MAIN WORKFLOW @@ -136,8 +141,32 @@ workflow PHASEIMPUTE { // Glimpse2 subworkflow } if (params.tools.contains("quilt")) { - error "Quilt not yet implemented" + print("Impute with quilt") + // Quilt subworkflow + + // Create chunks from reference VCF + MAKE_CHUNKS(ch_panel, ch_fasta) + + // Make bamlist from bam input + ch_bamlist = ch_input + .map { it[1].tokenize('/').last() } + 
.collectFile( name: "bamlist.txt", newLine: true, sort: true ) + + // Create input QUILT + ch_input_quilt = ch_input + .map { meta, bam, bai -> [["id": "all_samples"], bam, bai] } + .groupTuple () + .combine ( ch_bamlist ) + .collect () + + // Impute BAMs with QUILT + IMPUTE_QUILT(MAKE_CHUNKS.out.ch_hap_legend, ch_input_quilt, MAKE_CHUNKS.out.ch_chunks) + + // Concatenate results + VCF_CONCATENATE_BCFTOOLS(IMPUTE_QUILT.out.ch_vcf_tbi) + + } }