Merge pull request #115 from nf-core/113-enable-liftover-option-for-test-vcfs

113 enable liftover option for test vcfs
nf-core · Dec 5, 2024 · e26667d · e26667d
2 parents 68b165f + f7c496c
commit e26667d
Show file tree

Hide file tree

Showing 20 changed files with 421 additions and 333 deletions.
diff --git a/assets/schema_input.json b/assets/schema_input.json
@@ -154,6 +154,12 @@
                 "enum": ["sc", "cts", "d"],
                 "minLength": 1,
                 "default": null
+            },
+            "liftover": {
+                "type": "boolean",
+                "description": "Liftover option for test VCFs; to activate, also set params.liftover='test'",
+                "meta": ["liftover"],
+                "default": false
             }
         },
         "required": ["test_vcf", "caller", "id"]

diff --git a/conf/tests/liftover_hg37.config → conf/tests/liftover_test.config b/conf/tests/liftover_hg37.config → conf/tests/liftover_test.config
@@ -20,22 +20,23 @@ params {
     max_time              = '8.h'
 
     // Input data
-    input                = 'https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/samplesheet_sv_hg37.csv'
+    input                = "https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/samplesheet_sv_liftover.csv"
     outdir               = 'results'
     genome               = 'GRCh37'
 
     // Processes
     analysis             = 'germline'
     variant_type         = "structural"
     method               = 'truvari'
-    preprocess           = "normalization,deduplication,filter_contigs"
+    preprocess           = "filter_contigs"
     min_sv_size          = 30
 
     truth_id             = "HG002"
     truth_vcf            = "https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg38/truth/HG002_GRCh38_difficult_medical_gene_SV_benchmark_v0.01.chr21.vcf.gz"
+    regions_bed          = "https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg38/truth/HG002_GRCh38_difficult_medical_gene_SV_benchmark_v01.ch21.bed"
 
     //liftover files
-    liftover              = true
+    liftover              = "test"
     chain                 = "http://ftp.ensembl.org/pub/assembly_mapping/homo_sapiens/GRCh38_to_GRCh37.chain.gz"
     rename_chr            = "https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/grch38_grch37.txt"
 }
diff --git a/conf/tests/liftover_hg38.config → conf/tests/liftover_truth.config b/conf/tests/liftover_hg38.config → conf/tests/liftover_truth.config
@@ -35,7 +35,7 @@ params {
     regions_bed          = "https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/truth/HG002_GRCh37_1_22_v4.2.1_highconf.bed"
 
     //liftover files
-    liftover             = true
+    liftover             = "truth"
     chain                = "https://raw.githubusercontent.com/broadinstitute/gatk/master/scripts/funcotator/data_sources/gnomAD/b37ToHg38.over.chain"
     rename_chr           = "https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/grch37_grch38.txt"
 }
diff --git a/docs/truth.md b/docs/truth.md
@@ -1,7 +1,5 @@
 # nf-core/variantbenchmarking: Truth files
 
-## :warning: Please read this documentation on the nf-core website: [https://nf-co.re/variantbenchmarking/truth](https://nf-co.re/variantbenchmarking/truth)
-
 ## Defining Truth VCF and High confidence BED files
 
 This pipeline requires a set of Truth VCF, as a baseline for comparisons, and a high confidence bed files, to restrict analysis to regions. Although, those sets can be anything depending on the type of the analysis, for benchmarking of human genomes there are golden set of samples provided by [Genome in a Bottle project](https://www.nist.gov/programs-projects/genome-bottle) and [SEQC2 consortium](https://sites.google.com/view/seqc2/home/data-analysis/high-confidence-somatic-snv-and-indel-v1-2).

diff --git a/docs/usage.md b/docs/usage.md
@@ -41,14 +41,28 @@ An [example samplesheet](../assets/samplesheet.csv) has been provided with the p
 
 ## Truth samples
 
-Please find the detailed information about truth samples [here](https://nf-co.re/variantbenchmarking/truth).
+Please find the detailed information about truth samples [here](../docs/truth.md).
 
 ## Lifting over truth sets
 
-This workflow comes with a liftover option for truth sets. In order to activate liftover use `--liftover true`.
+This workflow comes with a liftover option for truth sets. In order to activate liftover use `--liftover "truth"`.
 
 - `--chain`: This workflow uses picard tools for lifting over and a chain file has to be provided specific to the input truth vcf. Some examples can be found [here](https://genome.ucsc.edu/goldenPath/help/chain.html)
 - `--rename_chr`: Renaming chromosomes is required after liftover process. Some examples can be found under `assets/rename_contigs` directory.
+- `--dictionary`: A `.dict` file is required to run the liftover process. If no dictionary file is provided, picard createsequencedictionary will create one and use it.
+
+## Lifting over test sets
+
+Lifting over test samples is also possible with this pipeline. To lift over at least one of the samples, first use `--liftover "test"`, then add a `liftover` column to the samplesheet:
+
+```csv title="samplesheet.csv"
+id,test_vcf,caller,liftover
+test1,test1.vcf.gz,delly,true
+test2,test2.vcf,gatk,false
+test3,test3.vcf.gz,cnvkit,true
+```
+
+Please note that you must still provide the `--chain` and `--rename_chr` files, and that lifting over truth and test samples simultaneously is not possible.
 
 ## Standardization and normalization parameters
 
@@ -234,10 +248,10 @@ If `-profile` is not specified, the pipeline will run locally and expect all sof
 - `test_full`
   - A profile with a complete configuration for full size of sample testing
   - Includes links to test data so needs no other parameters
-- `liftover_hg37`
-  - A profile with a complete configuration for using liftover of HG002 hg38 truth set to hg37
+- `liftover_test`
+  - A profile with a complete configuration for using liftover of HG002 hg38 test set to hg37
   - Includes links to test data so needs no other parameters
-- `liftover_hg38`
+- `liftover_truth`
   - A profile with a complete configuration for using liftover of HG002 hg37 truth set to hg38
   - Includes links to test data so needs no other parameters
 - `germline_small`

diff --git a/nextflow.config b/nextflow.config
@@ -48,7 +48,7 @@ params {
     dictionary                 = null
     rename_chr                 = null
     chain                      = null
-    liftover                   = false
+    liftover                   = ""
 
     // MultiQC options
     multiqc_config             = null
@@ -193,8 +193,8 @@ profiles {
     somatic_snv    { includeConfig 'conf/tests/somatic_snv.config'   }
     somatic_indel  { includeConfig 'conf/tests/somatic_indel.config' }
     somatic_sv     { includeConfig 'conf/tests/somatic_sv.config'    }
-    liftover_hg37  { includeConfig 'conf/tests/liftover_hg37.config' }
-    liftover_hg38  { includeConfig 'conf/tests/liftover_hg38.config' }
+    liftover_test  { includeConfig 'conf/tests/liftover_test.config' }
+    liftover_truth { includeConfig 'conf/tests/liftover_truth.config' }
 
 }
 

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -197,18 +197,19 @@
                     "default": "s3://ngi-igenomes/igenomes/"
                 },
                 "liftover": {
-                    "type": "boolean",
-                    "description": "Run liftover workflow",
+                    "type": "string",
+                    "description": "Run liftover workflow: test,truth",
                     "fa_icon": "fas fa-ban",
                     "hidden": true,
-                    "help_text": "Makes the use of liftover subworkflow, hg37 truth sets will liftover to hg38 and visa versa. Has to be either combined with itruth.config or --chain and --rename_chr."
+                    "pattern": "^((test|truth)?,?)*(?<!,)$",
+                    "help_text": "Enables the liftover subworkflow: hg37 truth or test sets will be lifted over to hg38 and vice versa. Has to be combined with --chain and --rename_chr."
                 },
                 "chain": {
                     "type": "string",
                     "format": "file-path",
                     "exists": true,
                     "pattern": "^\\S+\\.(chain|bed)?(\\.gz)?$",
-                    "description": "Path to the chain file sey required for liftover.",
+                    "description": "Path to the chain file required for liftover.",
                     "help_text": "This parameter is *mandatory* if `--liftover` is true",
                     "fa_icon": "fas fa-file-csv"
                 },

diff --git a/subworkflows/local/liftover_vcfs_truth.nf → subworkflows/local/liftover_vcfs.nf b/subworkflows/local/liftover_vcfs_truth.nf → subworkflows/local/liftover_vcfs.nf
@@ -1,5 +1,5 @@
 //
-// LIFTOVER_VCFS_TRUTH: SUBWORKFLOW TO LIFTOVER TRUTH VCFS HG37 TO HG38 OR HG38 TO HG37
+// LIFTOVER_VCFS: SUBWORKFLOW TO LIFTOVER VCFS HG37 TO HG38 OR HG38 TO HG37
 //
 
 include { PICARD_CREATESEQUENCEDICTIONARY } from '../../modules/nf-core/picard/createsequencedictionary'
@@ -11,10 +11,10 @@ include { SORT_BED                        } from '../../modules/local/custom/sor
 include { BEDTOOLS_MERGE                  } from '../../modules/nf-core/bedtools/merge'
 
 
-workflow LIFTOVER_VCFS_TRUTH {
+workflow LIFTOVER_VCFS {
     take:
-    truth_ch        // channel: [val(meta), vcf]
-    high_conf_ch    // channel: [bed]
+    ch_vcf          // channel: [val(meta), vcf]
+    ch_bed          // channel: [bed]
     fasta           // reference channel [val(meta), ref.fa]
     chain           // chain channel [val(meta), chain.gz]
     rename_chr      // reference channel [val(meta), chrlist.txt]
@@ -35,7 +35,7 @@ workflow LIFTOVER_VCFS_TRUTH {
 
     // Use picard liftovervcf tool to convert vcfs
     PICARD_LIFTOVERVCF(
-        truth_ch,
+        ch_vcf,
         dictionary,
         fasta,
         chain
@@ -56,9 +56,9 @@ workflow LIFTOVER_VCFS_TRUTH {
     )
     vcf_ch = BCFTOOLS_RENAME_CHR.out.vcf
 
-    // liftover high confidence file if given
+    // liftover high confidence bed file if given
     UCSC_LIFTOVER(
-        high_conf_ch.map{file -> tuple([id: params.truth_id], file)},
+        ch_bed.map{file -> tuple([id: params.truth_id], file)},
         chain.map{_meta, file -> file}
     )
     versions = versions.mix(UCSC_LIFTOVER.out.versions.first())

diff --git a/subworkflows/local/prepare_vcfs_test.nf b/subworkflows/local/prepare_vcfs_test.nf
@@ -2,13 +2,14 @@
 // PREPARE_VCFS: SUBWORKFLOW TO PREPARE INPUT VCFS
 //
 
-include { VCF_REHEADER_SAMPLENAME     } from '../local/vcf_reheader_samplename'
-include { VCF_VARIANT_DEDUPLICATION   } from '../local/vcf_variant_deduplication'
-include { VCF_VARIANT_FILTERING       } from '../local/vcf_variant_filtering'
-include { SPLIT_SMALL_VARIANTS_TEST   } from '../local/split_small_variants_test'
-include { BCFTOOLS_NORM               } from '../../modules/nf-core/bcftools/norm'
-include { TABIX_BGZIPTABIX            } from '../../modules/nf-core/tabix/bgziptabix'
-include { TABIX_TABIX                 } from '../../modules/nf-core/tabix/tabix'
+include { VCF_REHEADER_SAMPLENAME      } from '../local/vcf_reheader_samplename'
+include { VCF_VARIANT_DEDUPLICATION    } from '../local/vcf_variant_deduplication'
+include { VCF_VARIANT_FILTERING        } from '../local/vcf_variant_filtering'
+include { SPLIT_SMALL_VARIANTS_TEST    } from '../local/split_small_variants_test'
+include { BCFTOOLS_NORM                } from '../../modules/nf-core/bcftools/norm'
+include { TABIX_BGZIPTABIX             } from '../../modules/nf-core/tabix/bgziptabix'
+include { TABIX_TABIX                  } from '../../modules/nf-core/tabix/tabix'
+include { LIFTOVER_VCFS                } from '../local/liftover_vcfs'
 include { BCFTOOLS_VIEW as BCFTOOLS_VIEW_CONTIGS } from '../../modules/nf-core/bcftools/view'
 
 
@@ -17,14 +18,35 @@ workflow PREPARE_VCFS_TEST {
     test_ch     // channel: [val(meta), vcf]
     fasta       // reference channel [val(meta), ref.fa]
     fai         // reference channel [val(meta), ref.fa.fai]
+    chain       // reference channel [val(meta), chain.gz]
+    rename_chr  // reference channel [val(meta), chrlist.txt]
+    dictionary  // reference channel [val(meta), genome.dict]
 
     main:
 
     versions = Channel.empty()
 
+    test_ch.branch{
+        def meta = it[0]
+        liftover: meta.liftover
+        other: true}.set{vcf}
+
+    vcf_ch = Channel.empty()
+
+    LIFTOVER_VCFS(
+        vcf.liftover,
+        Channel.empty(),
+        fasta,
+        chain,
+        rename_chr,
+        dictionary
+    )
+    versions = versions.mix(LIFTOVER_VCFS.out.versions.first())
+    vcf_ch = vcf_ch.mix(LIFTOVER_VCFS.out.vcf_ch,vcf.other)
+
     // Add "query" to test sample
     VCF_REHEADER_SAMPLENAME(
-        test_ch,
+        vcf_ch,
         fai
     )
     versions = versions.mix(VCF_REHEADER_SAMPLENAME.out.versions.first())

diff --git a/subworkflows/local/prepare_vcfs_truth.nf b/subworkflows/local/prepare_vcfs_truth.nf
@@ -7,7 +7,7 @@ include { BCFTOOLS_NORM              } from '../../modules/nf-core/bcftools/norm
 include { TABIX_TABIX                } from '../../modules/nf-core/tabix/tabix'
 include { VCF_REHEADER_SAMPLENAME    } from '../local/vcf_reheader_samplename'
 include { VCF_VARIANT_DEDUPLICATION  } from '../local/vcf_variant_deduplication'
-include { LIFTOVER_VCFS_TRUTH        } from '../local/liftover_vcfs_truth'
+include { LIFTOVER_VCFS              } from '../local/liftover_vcfs'
 
 
 workflow PREPARE_VCFS_TRUTH {
@@ -25,19 +25,19 @@ workflow PREPARE_VCFS_TRUTH {
     versions = Channel.empty()
 
     // if liftover option is set convert truth files
-    if (params.liftover){
+    if (params.liftover.contains("truth")){
 
-        LIFTOVER_VCFS_TRUTH(
+        LIFTOVER_VCFS(
             truth_ch,
             high_conf_ch,
             fasta,
             chain,
             rename_chr,
             dictionary
         )
-        versions = versions.mix(LIFTOVER_VCFS_TRUTH.out.versions.first())
-        truth_ch = LIFTOVER_VCFS_TRUTH.out.vcf_ch
-        high_conf_ch = LIFTOVER_VCFS_TRUTH.out.bed_ch.map{ _meta, bed -> [bed]}
+        versions = versions.mix(LIFTOVER_VCFS.out.versions.first())
+        truth_ch = LIFTOVER_VCFS.out.vcf_ch
+        high_conf_ch = LIFTOVER_VCFS.out.bed_ch.map{ _meta, bed -> [bed]}
     }
 
     // Reheader sample name for truth file - using meta.caller

diff --git a/tests/germline_small.nf.test.snap b/tests/germline_small.nf.test.snap
@@ -1,7 +1,7 @@
 {
     "-stub": {
         "content": [
-            76,
+            77,
             {
                 "BCFTOOLS_DEDUP": {
                     "bcftools": 1.18
@@ -33,6 +33,9 @@
                 "MERGE_REPORTS": {
                     "python": "3.8.6"
                 },
+                "PICARD_CREATESEQUENCEDICTIONARY": {
+                    "picard": "3.2.0-1-g3948afb6b"
+                },
                 "PLOTS": {
                     "r-base": "4.3.1"
                 },
@@ -62,6 +65,8 @@
                 "pipeline_info",
                 "pipeline_info/nf_core_pipeline_software_mqc_versions.yml",
                 "references",
+                "references/dictionary",
+                "references/dictionary/genome.dict",
                 "references/rtgtools",
                 "references/rtgtools/genome.sdf",
                 "small",
@@ -231,11 +236,11 @@
             "nf-test": "0.9.0",
             "nextflow": "24.10.2"
         },
-        "timestamp": "2024-11-28T16:47:47.024615694"
+        "timestamp": "2024-12-04T10:12:17.779754389"
     },
     "Params: --analysis 'germline' --variant_type 'small' --method 'happy,rtgtools'": {
         "content": [
-            76,
+            77,
             {
                 "BCFTOOLS_DEDUP": {
                     "bcftools": 1.18
@@ -267,6 +272,9 @@
                 "MERGE_REPORTS": {
                     "python": "3.8.6"
                 },
+                "PICARD_CREATESEQUENCEDICTIONARY": {
+                    "picard": "3.2.0-1-g3948afb6b"
+                },
                 "PLOTS": {
                     "r-base": "4.3.1"
                 },
@@ -296,6 +304,8 @@
                 "pipeline_info",
                 "pipeline_info/nf_core_pipeline_software_mqc_versions.yml",
                 "references",
+                "references/dictionary",
+                "references/dictionary/genome.dict",
                 "references/rtgtools",
                 "references/rtgtools/genome.sdf",
                 "references/rtgtools/genome.sdf/done",
@@ -575,6 +585,6 @@
             "nf-test": "0.9.0",
             "nextflow": "24.10.2"
         },
-        "timestamp": "2024-11-28T16:45:26.990755841"
+        "timestamp": "2024-12-04T10:10:09.338722844"
     }
 }