Merge pull request #234 from nschcolnicov/samplesheet_validator

Samplesheet validator
nf-core · Aug 12, 2024 · 119e8bc · 119e8bc
2 parents 8353819 + c95aa4b
commit 119e8bc
Show file tree

Hide file tree

Showing 31 changed files with 424 additions and 34 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -18,6 +18,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [#220](https://github.com/nf-core/demultiplex/pull/220) Added kraken2.
 - [#221](https://github.com/nf-core/demultiplex/pull/221) Added checkqc_config to pipeline schema.
 - [#225](https://github.com/nf-core/demultiplex/pull/225) Added test profile for multi-lane samples, updated handling of such samples and adapter trimming.
+- [#234](https://github.com/nf-core/demultiplex/pull/234) Added module for samplesheet validation.
 - [#236](https://github.com/nf-core/demultiplex/pull/236) Add samplesheet generation.
 
 ### `Changed`

diff --git a/CITATIONS.md b/CITATIONS.md
@@ -22,6 +22,8 @@
 
 - [CheckQC](https://github.com/Molmed/checkQC)
 
+- [samshee](https://github.com/lit-regensburg/samshee)
+
 ## Software packaging/containerisation tools
 
 - [Anaconda](https://anaconda.com)

diff --git a/README.md b/README.md
@@ -47,6 +47,7 @@ On release, automated continuous integration tests run the pipeline on a full-si
 4. [Falco](#falco) - Raw read QC
 5. [md5sum](#md5sum) - Creates an MD5 (128-bit) checksum of every fastq.
 6. [MultiQC](#multiqc) - aggregate report, describing results of the whole pipeline
+7. [samshee](#samshee) - Validates Illumina v2 samplesheets.
 
 ![subway map](docs/demultiplex.png)
 

diff --git a/bin/validate_samplesheet.py b/bin/validate_samplesheet.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python3
+
+from samshee.samplesheetv2 import read_samplesheetv2
+from samshee.validation import illuminasamplesheetv2schema, illuminasamplesheetv2logic, validate
+import json
+import sys
+
+def validate_samplesheet(filename, custom_schema_file=None):
+    # Load the custom schema if provided
+    if custom_schema_file:
+        with open(custom_schema_file, 'r') as f:
+            custom_schema = json.load(f)
+        custom_validator = lambda doc: validate(doc, custom_schema)
+    else:
+        custom_validator = None
+
+    # Prepare the list of validators
+    validators = [illuminasamplesheetv2schema, illuminasamplesheetv2logic]
+    if custom_validator:
+        validators.append(custom_validator)
+    # Read and validate the sample sheet
+    try:
+        sheet = read_samplesheetv2(filename, validation=validators)
+        print(f"Validation successful for {filename}")
+    except Exception as e:
+        print(f"Validation failed: {e}")
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2 or len(sys.argv) > 3:
+        print("Usage: validate_samplesheet.py <SampleSheet.csv> [custom_schema.json]")
+        sys.exit(1)
+    samplesheet_file = sys.argv[1]
+    schema_file = sys.argv[2] if len(sys.argv) == 3 else None
+
+    validate_samplesheet(samplesheet_file, schema_file)
diff --git a/conf/test.config b/conf/test.config
@@ -22,6 +22,7 @@ params {
     // Input data
     input = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/1.3.0/flowcell_input.csv'
     demultiplexer = 'bclconvert'
+    skip_tools = 'samshee'
 
 }
 

diff --git a/conf/test_bases2fastq.config b/conf/test_bases2fastq.config
@@ -20,6 +20,6 @@ params {
     max_time   = '6.h'
 
     // Input data
-    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/1.3.0/b2fq-samplesheet.csv'
+    input         = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/1.3.0/b2fq-samplesheet.csv'
     demultiplexer = 'bases2fastq'
 }
diff --git a/conf/test_bcl2fastq.config b/conf/test_bcl2fastq.config
@@ -20,9 +20,9 @@ params {
     max_time   = '6.h'
 
     // Input data
-    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/1.3.0/flowcell_input.csv'
+    input         = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/1.3.0/flowcell_input.csv'
     demultiplexer = 'bcl2fastq'
-    skip_tools    = "checkqc"
+    skip_tools    = "checkqc,samshee"
 
 }
 

diff --git a/conf/test_checkqc.config b/conf/test_checkqc.config
@@ -16,9 +16,9 @@ params {
 
 
     // Input data
-    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/1.3.0/samplesheet_full.csv'
+    input           = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/1.3.0/samplesheet_full.csv'
     demultiplexer   = 'bcl2fastq'
-    skip_tools      = "fastp,falco,md5sum,multiqc"
+    skip_tools      = "fastp,falco,md5sum,multiqc,samshee"
     checkqc_config  = "${projectDir}/assets/checkqc_config.yaml"
 
 }

diff --git a/conf/test_fqtk.config b/conf/test_fqtk.config
@@ -20,6 +20,6 @@ params {
     max_time   = '1.h'
 
     // Input data
-    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/1.3.0/fqtk-samplesheet.csv'
+    input         = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/1.3.0/fqtk-samplesheet.csv'
     demultiplexer = 'fqtk'
 }
diff --git a/conf/test_full.config b/conf/test_full.config
@@ -13,6 +13,9 @@
 params {
     config_profile_name        = 'Full test profile'
     config_profile_description = 'Full test dataset to check pipeline function'
-    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/1.3.0/samplesheet_full.csv'
+
+    // Input data
+    input         = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/1.3.0/samplesheet_full.csv'
     demultiplexer = 'bcl2fastq'
+    skip_tools    = 'samshee'
 }
diff --git a/conf/test_kraken.config b/conf/test_kraken.config
@@ -13,8 +13,11 @@
 params {
     config_profile_name        = 'Test full kraken profile'
     config_profile_description = 'Full test dataset to check pipeline function with kraken'
-    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/1.3.0/samplesheet_full.csv'
+
+    // Input data
+    input         = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/1.3.0/samplesheet_full.csv'
     demultiplexer = 'bcl2fastq'
     kraken_db     = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/sarscov2/genome/db/kraken2.tar.gz'
+    skip_tools    = 'samshee'
 }
 
diff --git a/conf/test_mkfastq.config b/conf/test_mkfastq.config
@@ -20,6 +20,7 @@ params {
     max_time   = '1.h'
 
     // Input data
-    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/1.3.0/mkfastq-samplesheet.csv'
+    input         = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/1.3.0/mkfastq-samplesheet.csv'
     demultiplexer = 'mkfastq'
+    skip_tools    = 'samshee'
 }
diff --git a/conf/test_pe.config b/conf/test_pe.config
@@ -13,7 +13,9 @@
 params {
     config_profile_name        = 'Paired end test profile'
     config_profile_description = 'Paired end test dataset to check pipeline function'
-    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/1.3.0/pe_samplesheet.csv'
+
+    // Input data
+    input         = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/1.3.0/pe_samplesheet.csv'
     demultiplexer = 'bcl2fastq'
-    skip_tools    = "checkqc"
+    skip_tools    = "checkqc,samshee"
 }
diff --git a/conf/test_sgdemux.config b/conf/test_sgdemux.config
@@ -20,6 +20,6 @@ params {
     max_time   = '1.h'
 
     // Input data
-    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/1.3.0/sgdemux-samplesheet.csv'
+    input         = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/1.3.0/sgdemux-samplesheet.csv'
     demultiplexer = 'sgdemux'
 }
diff --git a/conf/test_two_lanes.config b/conf/test_two_lanes.config
@@ -15,7 +15,7 @@ params {
     config_profile_description = 'Minimal test dataset to check pipeline function with multiple lanes'
 
     // Input data
-    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/two_lane_samplesheet.csv'
+    input         = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/two_lane_samplesheet.csv'
     demultiplexer = 'bclconvert'
     skip_tools    = "checkqc"
 }
diff --git a/conf/test_uncompressed.config b/conf/test_uncompressed.config
@@ -20,8 +20,9 @@ params {
     max_time   = '6.h'
 
     // Input data
-    input = 'https://github.com/nf-core/test-datasets/raw/demultiplex/samplesheet/1.3.0/uncompressed-samplesheet.csv'
+    input         = 'https://github.com/nf-core/test-datasets/raw/demultiplex/samplesheet/1.3.0/uncompressed-samplesheet.csv'
     demultiplexer = 'bclconvert'
+    skip_tools    = 'samshee'
 
 }
 

diff --git a/docs/usage.md b/docs/usage.md
@@ -221,6 +221,10 @@ To learn how to provide additional arguments to a particular tool of the pipelin
 
 The trimming process in our demultiplexing pipeline has been updated to ensure compatibility with 10x Genomics recommendations. By default, trimming in the pipeline is performed using fastp, which reliably auto-detects and removes adapter sequences without the need for storing adapter sequences. As users can also supply adapter sequences in a samplesheet and thereby triggering trimming in any `bcl2fastq` or `bclconvert` subworkflows, we have added a new parameter, `remove_adapter`, which is set to true by default. When `remove_adapter` is true, the pipeline automatically removes any adapter sequences listed in the `[Settings]` section of the Illumina sample sheet, replacing them with an empty string in order to not provoke this behaviour. This approach aligns with 10x Genomics' guidelines, as they advise against pre-processing FASTQ reads before inputting them into their software pipelines. If the `remove_adapter` setting is true but no adapter is removed, a warning will be displayed; however, this does not necessarily indicate an error, as some sample sheets may already lack these adapter sequences. Users can disable this behavior by setting `--remove_adapter false` in the command line, though this is not recommended.
 
+## samshee (Samplesheet validator)
+
+samshee validates Illumina v2 Sample Sheets. In addition to the default checks provided by the tool, users can apply custom validation rules through the `--validator_schema` parameter, which accepts the path to a JSON schema validator file. Provide this file to enforce additional rules that reflect specific requirements or standards relevant to your sequencing workflow. For more information about the tool or how to write the schema JSON file, please refer to [Samshee on GitHub](https://github.com/lit-regensburg/samshee).
+
 ### nf-core/configs
 
 In most cases, you will only need to create a custom config as a one-off but if you and others within your organisation are likely to be running nf-core pipelines regularly and need to use the same settings regularly it may be a good idea to request that your custom config file is uploaded to the `nf-core/configs` git repository. Before you do this please can you test that the config file works with your pipeline of choice using the `-c` parameter. You can then create a pull request to the `nf-core/configs` repository with the addition of your config file, associated documentation file (see examples in [`nf-core/configs/docs`](https://github.com/nf-core/configs/tree/master/docs)), and amending [`nfcore_custom.config`](https://github.com/nf-core/configs/blob/master/nfcore_custom.config) to include your custom profile.

diff --git a/modules/local/samshee/README.md b/modules/local/samshee/README.md
@@ -0,0 +1,84 @@
+# Guide to Writing a `validation.json` Schema File
+
+## Introduction
+
+A JSON schema defines the structure and constraints of JSON data. This guide will help you create a `validation.json` schema file for use with Samshee to perform additional checks on Illumina® Sample Sheet v2 files.
+
+## JSON Schema Basics
+
+JSON Schema is a powerful tool for validating the structure of JSON data. It allows you to specify required fields, data types, and constraints. Here are some common components:
+
+- **`$schema`**: Declares the JSON Schema version being used.
+- **`type`**: Specifies the data type (e.g., `object`, `array`, `string`, `number`).
+- **`properties`**: Defines the properties of an object and their constraints.
+- **`required`**: Lists properties that must be present in the object.
+- **`items`**: Specifies the schema for items in an array.
+
+## Example Schema
+
+Here’s an example of a `validation.json` schema file for an Illumina® Sample Sheet:
+
+```json
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "type": "object",
+  "properties": {
+    "Header": {
+      "type": "object",
+      "properties": {
+        "InvestigatorName": {
+          "type": "string"
+        },
+        "ExperimentName": {
+          "type": "string"
+        }
+      },
+      "required": ["InvestigatorName", "ExperimentName"]
+    },
+    "Reads": {
+      "type": "object",
+      "properties": {
+        "Read1": {
+          "type": "integer",
+          "minimum": 1
+        },
+        "Read2": {
+          "type": "integer",
+          "minimum": 1
+        }
+      },
+      "required": ["Read1", "Read2"]
+    },
+    "BCLConvert": {
+      "type": "object",
+      "properties": {
+        "Index": {
+          "type": "string",
+          "pattern": "^[ACGT]{8}$"
+        }
+      }
+    }
+  },
+  "required": ["Header", "Reads"]
+}
+```
+
+### Explanation of the Example
+
+- **`$schema`**: Specifies the JSON Schema version (draft-07).
+- **`type`**: Defines the main type as `object`.
+- **`properties`**: Lists the properties of the object:
+  - **`Header`**: An object with required `InvestigatorName` and `ExperimentName` fields.
+  - **`Reads`**: An object with required `Read1` and `Read2` fields that must be integers greater than or equal to 1.
+  - **`BCLConvert`**: An object with an optional `Index` field that must be a string matching a pattern for 8-base indices.
+- **`required`**: Lists required properties at the top level.
+
+### Tips for Writing JSON Schemas
+
+1. **Start Simple**: Begin with basic constraints and gradually add complexity.
+2. **Use Online Validators**: Validate your schema using online tools to ensure it adheres to the JSON Schema specification.
+3. **Refer to Schema Documentation**: Consult the [JSON Schema documentation](https://json-schema.org/) for detailed guidance.
+
+### Conclusion
+
+By defining a JSON schema, you can enforce specific rules and ensure that your Illumina® Sample Sheet v2 files meet your required structure and constraints. Use this guide to create and validate your `validation.json` schema files effectively.
diff --git a/modules/local/samshee/environment.yml b/modules/local/samshee/environment.yml
@@ -0,0 +1,8 @@
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - python>=3.9
+  - pip
+  - pip: # FIXME https://github.com/nf-core/modules/issues/5814
+      - samshee==0.1.12