Skip to content

Commit

Permalink
Add CellRanger ATAC count (#762)
Browse files Browse the repository at this point in the history
  • Loading branch information
VladimirShitov authored Aug 23, 2024
1 parent 8ab10e4 commit ab45cd1
Show file tree
Hide file tree
Showing 6 changed files with 257 additions and 2 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@

## NEW FUNCTIONALITY

* Added `demux/cellranger_atac_mkfastq` component: demultiplex raw sequencing data for ATAC experiments (PR #726).

* `process_samples`, `process_batches` and `rna_multisample` workflows: added functionality to scale the log-normalized
gene expression data to unit variance and zero mean. The scaled data will be output to a different layer and the
representation with reduced dimensions will be created and stored in addition to the non-scaled data (PR #733).
Expand Down Expand Up @@ -85,6 +87,8 @@

## MINOR CHANGES

* `resources_test_scripts/cellranger_atac_tiny_bcl.sh` script: generate counts from fastq files using CellRanger atac count (PR #726).

* `neighbors/find_neighbors` component: Modified to include results of KNN in the output file (PR #748).
2 new optional arguments added to set .obsm slots to save KNN results into:
- `obsm_knn_indices`
Expand Down
13 changes: 12 additions & 1 deletion resources_test_scripts/cellranger_atac_tiny_bcl.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ function clean_up {
}
trap clean_up EXIT

viash ns build -q "download_file|cellranger_atac_mkfastq" -p docker --setup cb
viash ns build -q "download_file|cellranger_atac_mkfastq|build_cellranger_arc_reference|cellranger_atac_count" -p docker --setup cb


# download bcl data
if [ ! -f "${OUT}/bcl/sample_sheet.csv" ]; then
Expand Down Expand Up @@ -61,3 +62,13 @@ if [ ! -d "${OUT}/fastqs" ]; then
--csv "${OUT}/bcl/layout.csv" \
--output "${OUT}/fastqs"
fi

# Create count matrices
if [ ! -d "${OUT}/counts" ]; then
mkdir -p "$OUT/counts"

target/docker/mapping/cellranger_atac_count/cellranger_atac_count \
--input "${OUT}/fastqs/HJN3KBCX2/test_sample/" \
--reference "${REFERENCE_DIR}/reference_cellranger.tar.gz" \
--output "${OUT}/counts"
fi
3 changes: 2 additions & 1 deletion resources_test_scripts/scgpt.sh
Original file line number Diff line number Diff line change
Expand Up @@ -107,4 +107,5 @@ viash run src/scgpt/embedding/config.vsh.yaml -p docker -- \
echo "> Removing unnecessary files in test resources dir"
find "${test_resources_dir}" -type f \( ! -name "Kim2020_*" -o ! -name "*.h5mu" \) -delete

echo "> scGPT test resources are ready!"
echo "> scGPT test resources are ready!"

76 changes: 76 additions & 0 deletions src/mapping/cellranger_atac_count/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
functionality:
name: cellranger_atac_count
namespace: mapping
description: Align fastq files using Cell Ranger ATAC count.
authors:
- __merge__: /src/authors/vladimir_shitov.yaml
roles: [ author ]
argument_groups:
- name: Inputs
arguments:
- type: file
name: --input
required: true
multiple: true
example: [ "sample_S1_L001_R1_001.fastq.gz", "sample_S1_L001_R2_001.fastq.gz" ]
description: The fastq.gz files to align. Can also be a single directory containing fastq.gz files.
- type: file
name: --reference
required: true
description: The path to Cell Ranger reference tar.gz file. Can also be a directory.
example: reference.tar.gz
- name: Outputs
arguments:
- type: file
name: --output
direction: output
description: The folder to store the alignment results.
example: "/path/to/output"
required: true
- name: Arguments
arguments:
- type: string
name: --description
description: Sample description to embed in output files
default: ""
- type: integer
name: --force_cells
description: "Define the top N barcodes with the most fragments overlapping peaks as cells and override the cell calling algorithm. N must be a positive integer <= 20,000. Use this option if the number of cells estimated by Cell Ranger ATAC is not consistent with the barcode rank plot"
max: 20000
- type: file
name: --peaks
description: "Override peak caller: specify peaks to use in downstream analyses from supplied 3-column BED file. The supplied peaks file must be sorted by position and not contain overlapping peaks; comment lines beginning with # are allowed"
- type: string
name: --dim_reduce
description: "Dimensionality reduction mode for clustering"
choices: [ lsa, pca, plsa ]
default: lsa
- type: double
name: --subsample_rate
description: "Downsample to preserve this fraction of reads"
example: 0.1
- type: string
multiple: true
name: --lanes
description: bcl2fastq option. Semicolon-delimited series of lanes to demultiplex. Use this if you have a sample sheet for an entire flow cell but only want to generate a few lanes for further 10x Genomics analysis.
example: 1,3
resources:
- type: bash_script
path: script.sh
test_resources:
- type: python_script
path: test.py
- path: /resources_test/cellranger_atac_tiny_bcl
- path: /src/utils/setup_logger.py
- path: /resources_test/reference_gencodev41_chr1
platforms:
- type: docker
image: ghcr.io/data-intuitive/cellranger_atac:2.1
setup:
- type: docker
run: |
DEBIAN_FRONTEND=noninteractive apt update \
&& apt upgrade -y && apt install -y procps && rm -rf /var/lib/apt/lists/*
- type: nextflow
directives:
label: [ highmem, highcpu ]
80 changes: 80 additions & 0 deletions src/mapping/cellranger_atac_count/script.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
#!/bin/bash

set -eo pipefail

## VIASH START
par_input='resources_test/cellranger_atac_tiny_bcl/fastqs/'
par_reference='resources_test/reference_gencodev41_chr1/'
par_output='resources_test/cellranger_atac_tiny_bcl/bam'
## VIASH END

# just to make sure paths are absolute
par_reference=`realpath $par_reference`
par_output=`realpath $par_output`

echo "Creating temporary directory"
tmpdir=$(mktemp -d "$meta_temp_dir/$meta_functionality_name-XXXXXXXX")
function clean_up {
rm -rf "$tmpdir"
}
trap clean_up EXIT

# process inputs
# for every fastq file found, make a symlink into the tempdir
echo "Locating fastqs"
fastq_dir="$tmpdir/fastqs"
mkdir -p "$fastq_dir"
IFS=";"
for var in $par_input; do
unset IFS
abs_path=`realpath $var`
if [ -d "$abs_path" ]; then
find "$abs_path" -name *.fastq.gz -exec ln -s {} "$fastq_dir" \;
else
ln -s "$abs_path" "$fastq_dir"
fi
done

echo "fastq_dir content: $(ls $fastq_dir)"

echo "Processing reference"
# process reference
if file $par_reference | grep -q 'gzip compressed data'; then
echo "Untarring genome"
reference_dir="$tmpdir/fastqs"
mkdir -p "$reference_dir"
tar -xvf "$par_reference" -C "$reference_dir" --strip-components=1
par_reference="$reference_dir"
fi

# cd into tempdir
cd "$tmpdir"

if [ ! -z "$meta_memory_gb" ]; then
# always keep 2gb for the OS itself
memory_gb=`python -c "print(int('$meta_memory_gb') - 2)"`
fi

echo "Running cellranger-atac count"

id=myoutput
cellranger-atac count \
--id "$id" \
--fastqs "$fastq_dir" \
--reference "$par_reference" \
--dim-reduce "$par_dim_reduce" \
--description "$par_description" \
${par_lanes:+--lanes=${par_lanes[*]}} \
${par_force_cells:+--force-cells=$par_force_cells} \
${par_subsample_rate:+--subsample-rate=$par_subsample_rate} \
${memory_gb:+--localmem=$memory_gb} \
${meta_cpus:+--localcores=$meta_cpus} \
${par_lanes:+--lanes=${par_lanes[*]}}

echo "Copying output"
if [ -d "$id/outs/" ]; then
if [ ! -d "$par_output" ]; then
mkdir -p "$par_output"
fi
mv "$id/outs/"* "$par_output"
fi
83 changes: 83 additions & 0 deletions src/mapping/cellranger_atac_count/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import subprocess
from os import path
import sys
from itertools import zip_longest, chain

## VIASH START
meta = {
"functionality_name": "cellranger_atac_count",
"resources_dir": "resources_test"
}
## VIASH END

sys.path.append(meta["resources_dir"])
# START TEMPORARY WORKAROUND setup_logger
# reason: resources aren't available when using Nextflow fusion
# from setup_logger import setup_logger
def setup_logger():
import logging
from sys import stdout

logger = logging.getLogger()
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler(stdout)
logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
console_handler.setFormatter(logFormatter)
logger.addHandler(console_handler)

return logger
# END TEMPORARY WORKAROUND setup_logger
logger = setup_logger()

logger.info("> Running command with folder")
input = meta["resources_dir"] + "/cellranger_atac_tiny_bcl/fastqs/HJN3KBCX2/test_sample/"
reference = meta["resources_dir"] + "/reference_gencodev41_chr1/reference_cellranger.tar.gz"
output = "test_output"
cmd_pars = [
meta["executable"],
"--input", input,
"--reference", reference,
"--output", output
]
if meta.get("cpus"):
cmd_pars.extend(["---cpus", str(meta["cpus"])])
if meta.get("memory_gb"):
cmd_pars.extend(["---memory", f"{meta['memory_gb']}gb"])
try:
out = subprocess.check_output(cmd_pars).decode("utf-8")
except subprocess.CalledProcessError as e:
logger.error(e.output)
raise e
logger.info("> Check if file exists")
assert path.exists(output + "/filtered_peak_bc_matrix.h5"), "No output was created."
assert path.exists(output + "/fragments.tsv.gz"), "No fragments file was created."


logger.info("> Running command with fastq files")
# test_sample_S1_L001_R2_001.fastq.gz test_sample_S1_L001_R1_001.fastq.gz test_sample_S1_L001_R3_001.fastq.gz
input_files = [
input + "test_sample_S1_L001_I1_001.fastq.gz",
input + "test_sample_S1_L001_R1_001.fastq.gz",
input + "test_sample_S1_L001_R2_001.fastq.gz",
input + "test_sample_S1_L001_R3_001.fastq.gz",
]
output = "test_output2"


cmd_pars = [
meta["executable"],
*chain.from_iterable([("--input", input_file) for input_file in input_files]),
"--reference", reference,
"--output", output
]
if meta.get("cpus"):
cmd_pars.extend(["---cpus", str(meta["cpus"])])
if meta.get("memory_gb"):
cmd_pars.extend(["---memory", f"{meta['memory_gb']}gb"])
out = subprocess.check_output(cmd_pars).decode("utf-8")

logger.info("> Check if file exists")
assert path.exists(output + "/filtered_peak_bc_matrix.h5"), "No output was created."
assert path.exists(output + "/fragments.tsv.gz"), "No fragments file was created."

logger.info("> Completed Successfully!")

0 comments on commit ab45cd1

Please sign in to comment.