Test resources: replace cellranger reference with ARC variant (#864)

openpipelines-bio · Aug 22, 2024 · 8ab10e4 · 8ab10e4
1 parent 5b0a66d
commit 8ab10e4
Show file tree

Hide file tree

Showing 8 changed files with 96 additions and 35 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -32,6 +32,8 @@
 
 * `reference/cellranger_mkgtf`: Rename `reference/mkgtf` to `reference/cellranger_mkgtf` (PR #846).
 
+* `reference/build_cellranger_arc_reference`: a default value of "output" is now specified for the argument `--genome`, in line with the `reference/build_cellranger_reference` component. Additionally, providing a value for `--organism` is no longer required and its default value of `Homo_sapiens` has been removed (PR #864).
+
 ## NEW FUNCTIONALITY
 
 * `process_samples`, `process_batches` and `rna_multisample` workflows: added functionality to scale the log-normalized 
@@ -77,6 +79,10 @@
 
 * `dimred/densmap` component: Added a densMAP dimensionality reduction component (PR #748).
 
+* `workflows/ingestion/make_reference`: add possibility to build CellRanger ARC references. Added `--motifs_file`, `--non_nuclear_contigs` and `--output_cellranger_arc` arguments (PR #864).
+
+* Test resources (reference_gencodev41_chr1): switch reference genome for CellRanger to ARC variant (PR #864).
+
 ## MINOR CHANGES
 
 * `neighbors/find_neighbors` component: Modified to include results of KNN in the output file (PR #748).

diff --git a/resources_test_scripts/cellranger_atac_tiny_bcl.sh b/resources_test_scripts/cellranger_atac_tiny_bcl.sh
@@ -53,26 +53,6 @@ if [ ! -f "${OUT}/bcl/sample_sheet.csv" ]; then
     --output "${OUT}/bcl/sample_sheet.csv"
 fi
 
-# Download JASPAR files for reference building
-# Source of the code below: https://support.10xgenomics.com/single-cell-atac/software/release-notes/references#GRCh38-2020-A-2.0.0
-motifs_url="https://jaspar.elixir.no/download/data/2024/CORE/JASPAR2024_CORE_non-redundant_pfms_jaspar.txt"
-motifs_in="${REFERENCE_DIR}/JASPAR2024_CORE_non-redundant_pfms_jaspar.txt"
-
-if [ ! -f "$motifs_in" ]; then
-    curl -sS "$motifs_url" > "$motifs_in"
-fi
-
-# Change motif headers so the human-readable motif name precedes the motif
-# identifier. So ">MA0004.1    Arnt" -> ">Arnt_MA0004.1".
-motifs_modified="${REFERENCE_DIR}/$(basename "$motifs_in").modified"
-awk '{
-    if ( substr($1, 1, 1) == ">" ) {
-        print ">" $2 "_" substr($1,2)
-    } else {
-        print
-    }
-}' "$motifs_in" > "$motifs_modified"
-
 if [ ! -d "${OUT}/fastqs" ]; then
   mkdir -p "$OUT/fastqs"
 

diff --git a/resources_test_scripts/ref_gencodev41_chr1.sh b/resources_test_scripts/ref_gencodev41_chr1.sh
@@ -14,21 +14,48 @@ mkdir -p "$OUT"
 
 wget "https://assets.thermofisher.com/TFS-Assets/LSG/manuals/ERCC92.zip" -O "$OUT/ERCC92.zip"
 
+# Download JASPAR files for reference building
+# Source of the code below: https://support.10xgenomics.com/single-cell-atac/software/release-notes/references#GRCh38-2020-A-2.0.0
+motifs_url="https://jaspar.elixir.no/download/data/2024/CORE/JASPAR2024_CORE_non-redundant_pfms_jaspar.txt"
+motifs_in="${OUT}/JASPAR2024_CORE_non-redundant_pfms_jaspar.txt"
+
+if [ ! -f "$motifs_in" ]; then  # fetch once; -f/-o + cleanup keep a failed download from being cached
+    curl -sSf "$motifs_url" -o "$motifs_in" || { rm -f "$motifs_in"; exit 1; }
+fi
+
+# Change motif headers so the human-readable motif name precedes the motif
+# identifier. So ">MA0004.1    Arnt" -> ">Arnt_MA0004.1".
+motifs_modified="${OUT}/$(basename "$motifs_in").modified"
+awk '{
+    if ( substr($1, 1, 1) == ">" ) {
+        print ">" $2 "_" substr($1,2)
+    } else {
+        print
+    }
+}' "$motifs_in" > "$motifs_modified"
+
+
+cat > /tmp/params.yaml << HERE
+param_list:
+  - id: "$ID"
+    genome_fasta: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh38.primary_assembly.genome.fa.gz"
+    transcriptome_gtf: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.annotation.gtf.gz"
+    target: ["bd_rhapsody", "cellranger_arc"] 
+    output_fasta: "reference.fa.gz"
+    output_gtf: "reference.gtf.gz"
+    non_nuclear_contigs: null
+    output_cellranger_arc: "reference_cellranger.tar.gz"
+    output_bd_rhapsody: "reference_bd_rhapsody.tar.gz"
+    bdrhap_extra_star_params: "--genomeSAindexNbases 12 --genomeSAsparseD 2"
+    motifs_file: "$motifs_modified"
+    subset_regex: "chr1"
+HERE
+
 nextflow \
   run . \
   -main-script target/nextflow/workflows/ingestion/make_reference/main.nf \
   -profile docker \
-  --id "$ID" \
   -c ./src/workflows/utils/labels_ci.config \
-  --genome_fasta "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh38.primary_assembly.genome.fa.gz" \
-  --transcriptome_gtf "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.annotation.gtf.gz" \
-  --target "cellranger;bd_rhapsody" \
-  --output_fasta "reference.fa.gz" \
-  --output_gtf "reference.gtf.gz" \
-  --output_cellranger "reference_cellranger.tar.gz" \
-  --output_bd_rhapsody "reference_bd_rhapsody.tar.gz" \
-  --output_state state.yaml \
-  --bdrhap_extra_star_params '--genomeSAindexNbases 12 --genomeSAsparseD 2' \
-  --subset_regex "chr1" \
+  -params-file /tmp/params.yaml \
   --publish_dir $OUT \
   -resume
diff --git a/src/reference/build_cellranger_arc_reference/config.vsh.yaml b/src/reference/build_cellranger_arc_reference/config.vsh.yaml
@@ -38,12 +38,17 @@ functionality:
     - type: string
       name: --genome
       required: true
+      default: "output"
       description: Name of the genome. This will be the name of the intermediate output folder
       example: GRCh38
     - type: string
       name: --organism
-      default: Homo_sapiens
+      required: false
       description: Name of the organism. This is displayed in the web summary but is otherwise not used in the analysis.
+    - type: string
+      name: --subset_regex
+      description: Will subset the reference chromosomes using the given regex.
+      example: (ERCC-00002|chr1)
   resources:
     - type: bash_script
       path: script.sh

diff --git a/src/reference/build_cellranger_arc_reference/script.sh b/src/reference/build_cellranger_arc_reference/script.sh
@@ -58,7 +58,7 @@ else
 fi
 
 echo """{
-    organism: \"${par_organism}\"
+    ${par_organism:+organism: \"$par_organism\"}
     genome: [\"${par_genome}\"]
     input_fasta: [\""${tmpdir}/genome.fa"\"]
     input_gtf: [\""${par_annotation_gtf}\""]

diff --git a/src/reference/make_reference/script.sh b/src/reference/make_reference/script.sh
@@ -18,6 +18,11 @@ function clean_up {
 }
 trap clean_up EXIT
 
+echo "> Getting path of fasta file"
+par_genome_fasta=$(realpath "$par_genome_fasta")  # quoted: path may contain spaces or glob characters
+echo "> Getting path of annotation file"
+par_transcriptome_gtf=$(realpath "$par_transcriptome_gtf")  # quoted for the same reason
+
 echo "> Processing genome sequence"
 genome_fasta="$tmpdir/genome_sequence.fa"
 # if genome is gzipped, extract. otherwise not

diff --git a/src/workflows/ingestion/make_reference/config.vsh.yaml b/src/workflows/ingestion/make_reference/config.vsh.yaml
@@ -85,11 +85,27 @@ functionality:
           description: Additional parameters to pass to STAR when building the genome index. Specify exactly like how you would on the command line.
           example: --limitGenomeGenerateRAM 48000 --genomeSAindexNbases 11
           required: false
+    - name: "Cellranger ARC options"
+      arguments:
+        - name: "--motifs_file"
+          type: file
+          direction: input
+          description: Path to file containing transcription factor motifs in JASPAR format.
+        - name: "--non_nuclear_contigs"
+          multiple: true
+          required: false
+          type: string
+          description: |
+            Name(s) of contig(s) that do not have any chromatin structure, for example, 
+            mitochondria or plastids. These contigs are excluded from peak calling since
+            the entire contig will be "open" due to a lack of chromatin structure.
+            Leave empty if there are no such contigs.
+
     - name: Outputs
       arguments:
         - type: string
           name: --target
-          choices: [ cellranger, bd_rhapsody, star ]
+          choices: [ cellranger, cellranger_arc, bd_rhapsody, star ]
           description: Which reference indices to generate. 
           multiple: true
           default: [ star ]
@@ -108,6 +124,11 @@ functionality:
           direction: output
           description: Output index
           example: cellranger_index.tar.gz
+        - type: file
+          name: --output_cellranger_arc
+          direction: output
+          description: Output index
+          example: cellranger_index_arc.tar.gz
         - type: file
           name: --output_bd_rhapsody
           direction: output
@@ -130,6 +151,7 @@ functionality:
     - name: reference/build_bdrhap_reference
     - name: reference/build_star_reference
     - name: reference/build_cellranger_reference
+    - name: reference/build_cellranger_arc_reference
   resources:
     - type: nextflow_script
       path: main.nf

diff --git a/src/workflows/ingestion/make_reference/main.nf b/src/workflows/ingestion/make_reference/main.nf
@@ -26,6 +26,21 @@ workflow run_wf {
         "output_gtf": "output_gtf"
       ]
     )
+    | build_cellranger_arc_reference.run(
+      runIf: { id, state ->
+        state.target.contains("cellranger_arc")
+      },
+      fromState: [
+        "genome_fasta": "output_fasta",
+        "annotation_gtf": "output_gtf",
+        "output": "output_cellranger_arc",
+        "motifs_file": "motifs_file",
+        "non_nuclear_contigs": "non_nuclear_contigs",
+      ],
+      toState: [
+        "output_cellranger_arc": "output"
+      ],
+    )
     | build_cellranger_reference.run(
       runIf: { id, state ->
         state.target.contains("cellranger")
@@ -72,7 +87,8 @@ workflow run_wf {
       "output_gtf",
       "output_cellranger",
       "output_star",
-      "output_bd_rhapsody"
+      "output_bd_rhapsody",
+      "output_cellranger_arc",
     ])
   emit:
   output_ch