config.yaml

# config.yaml - configuration file for the Tourmaline Snakemake workflow
# Compatable with qiime2-2023.5
# User MUST edit these parameters before running their own data.
# Detailed instructions: https://github.com/aomlomics/tourmaline/wiki.

# METADATA FILE

# Metadata file must be named as follows (or use symbolic link):
# - 00-data/metadata.tsv

# Standardization is recommended through use of MIxS/MIMARKS (https://gensc.org/mixs/) column headers, for example:
# - submitted_to_insdc: {boolean}
# - investigation_type: [eukaryote|bacteria_archaea|plasmid|virus|organelle|metagenome|metatranscriptome|mimarks-survey|mimarks-specimen|misag|mimag|miuvig]
# - project_name: {text}
# - experimental_factor (text or EFO and/or OBI): {termLabel} {[termID]}|{text}
# - lat_lon (decimal degrees): {float} {float}

# FASTQ DATA (choose one option)

# Option 1 - if you want to start from manifest files and import per-sample fastq sequences,
# name your manifest file(s) as follows (or use symbolic links):
# - paired-end sequences: 00-data/manifest_pe.csv
# - single-end sequences: 00-data/manifest_se.csv

# Option 2 - if your fastq sequences are already archived in qza format,
# name your qza file(s) as follows (or use symbolic links):
# - paired-end sequences: 01-imported/fastq_pe.qza
# - single-end sequences: 01-imported/fastq_se.qza

# REFERENCE DATABASE (choose one option)

# Option 1 - if your reference database is not yet imported, 
# name your reference fna and tsv files as follows (or use symbolic links):
# - reference sequences: 00-data/refseqs.fna
# - reference taxonomy: 00-data/reftax.tsv

# Option 2 - if your reference database is already archived in qza format,
# name your your qza files as follows (or use symbolic links):
# - reference sequences: 01-imported/refseqs.qza
# - reference taxonomy: 01-imported/reftax.qza

# Option 3 - if you will use a pre-build taxonomic classifer in qza format,
# no refseqs or reftax files are required in 00-data or 01-imported.
# Rather, put the classifier in the following location:
# - classifier: 01-imported/classifier.qza

# Provide a descriptive name for the reference database used, no spaces (e.g., original file name, source and version).
# Note that changing database name here is just for documentation purposes and has no effect on which file is used, 
# which is determined by the file symbolically liked according to Option 1, 2 or 3 above.

database_name: silva-138-99-515-806_q2-2021.2

# DENOISE

# DADA2 PAIRED-END
# For more info run: qiime dada2 denoise-paired --help
# The trunc and trim values refer to absolute nucleotide positions so the order they are applied in does not matter.

dada2pe_trunc_len_f: 240
dada2pe_trunc_len_r: 190
dada2pe_trim_left_f: 0
dada2pe_trim_left_r: 0
dada2pe_max_ee_f: 2
dada2pe_max_ee_r: 2
dada2pe_trunc_q: 2
dada2pe_pooling_method: independent
dada2pe_chimera_method: consensus
dada2pe_min_fold_parent_over_abundance: 1
dada2pe_n_reads_learn: 1000000
dada2pe_hashed_feature_ids: --p-hashed-feature-ids

# DADA2 SINGLE-END
# For more info run: qiime dada2 denoise-single --help
# The trunc and trim values refer to absolute nucleotide positions so the order they are applied in does not matter.

dada2se_trunc_len: 240
dada2se_trim_left: 0
dada2se_max_ee: 2
dada2se_trunc_q: 2
dada2se_pooling_method: independent
dada2se_chimera_method: consensus
dada2se_min_fold_parent_over_abundance: 1
dada2se_n_reads_learn: 1000000
dada2se_hashed_feature_ids: --p-hashed-feature-ids

# DEBLUR SINGLE-END
# For more info run: qiime deblur denoise-other --help

deblur_trim_length: 240
deblur_sample_stats: --p-sample-stats
deblur_mean_error: 0.005
deblur_indel_prob: 0.01
deblur_indel_max: 3
deblur_min_reads: 10
deblur_min_size: 2
deblur_hashed_feature_ids: --p-hashed-feature-ids

# TAXONOMIC CLASSIFICATION

# Taxonomic classification of the representative sequences.
# - method: choose from: naive-bayes, consensus-blast, consensus-vsearch
# - parameters: add arbitrary parameters or at least one parameter; see "qiime feature-classifier COMMAND --help"
# For more info run: qiime feature-classifier [fit-classifier-naive-bayes|classify-sklearn|classify-consensus-blast|classify-consensus-vsearch] --help

classify_method: consensus-vsearch
classify_parameters: --verbose

# Table of feature counts per sample collapsed by chosen taxonomic level.
# taxa_level - taxonomic level at which the features should be collapsed. Default 7
# For more info run: qiime taxa collapse --help

classify_taxalevel: 7

# MULTIPLE SEQUENCE ALIGNMENT

# Multiple sequence alignment of the representative sequences.
# - method: choose from: muscle v5, clustalo, mafft
# - muscle_iters: number of refinement iterations (integer, default 100)
# For more info run: muscle, clustalo --help, qiime alignment [mafft|mask] --help

alignment_method: muscle
alignment_muscle_iters: 50

# OUTLIER DETECTION

# Representative sequence outlier detection using odseq.
# - distance_metric: choose from: linear, affine
# - bootstrap_replicates: number of bootstrap replicates (integer)
# - threshold: probability to be at right of the bootstrap scores distribution (float)
# For more info see: https://www.bioconductor.org/packages/release/bioc/html/odseq.html

odseq_distance_metric: linear
odseq_bootstrap_replicates: 100
odseq_threshold: 0.025

# SUBSAMPLING (RAREFACTION)

# Subsample (rarefy) data to have an even number of observations per sample.
# - core_sampling_depth: subsampling depth for core diversity analyses
# - alpha_max_depth: maximum subsampling depth for alpha diversity rarefaction analysis

core_sampling_depth: 500
alpha_max_depth: 500

# BETA GROUP SIGNIFICANCE

# Statistical test for difference between samples grouped by a metadata variable (column).
# - column: metadata column to test beta group significance; this column must appear in the metadata file 00-data/metadata.tsv
# - method: choose from: permanova, anosim, permdisp
# - pairwise: choose from: --p-no-pairwise, --p-pairwise (can be very slow)

beta_group_column: region
beta_group_method: permanova
beta_group_pairwise: --p-pairwise

# DEICODE BETA DIVERSITY

# Robust Aitchison PCA (and biplot ordination) with automatically estimated underlying low-rank structure.
# Parameters are set to recommended defaults:
# - min_sample_count: minimum sum cutoff of samples across all features
# - min_feature_count: minimum sum cufoff of features across all samples
# - min_feature_frequency: minimum percentage of samples a feature must appear with value greater than zero
# - max_iterations: number of iterations to optimize the solution
# - num_features: number of most important features (arrows) to display in the biplot ordination
# For more info run: qiime deicode auto-rpca --help

deicode_min_sample_count: 500
deicode_min_feature_count: 10
deicode_min_feature_frequency: 0
deicode_max_iterations: 5
deicode_num_features: 5

# REPORT THEME

# Report theme for html version.
# Choose from: github, gothic, newsprint, night, pixyll, whitey.

report_theme: github

# FILTERING

# Filtering is implemented by filtering commands only, eg "snakemake dada2_pe_report_filtered", and is applied to each of
# representative sequences, taxonomy, and feature table.

# FILTER SAMPLES BY ID
# The file of sample IDs to be removed must have the sample IDs in the 1st column with a header line. Any number of additional columns can be added.
# The file must be named 00-data/samples_to_filter_{method}.tsv, where method is dada2-pe, dada2-se, or deblur-se.
# For more info run: qiime feature-table filter-samples --help
# TO SKIP FILTERING BY SAMPLE ID: provide a file with only headers (default) or don't run filtering commands.

# FILTER SAMPLES BY METADATA
# Samples can be removed or retained based on their metadata, using SQLite WHERE-clause syntax inside double quotes.
# For more info run: qiime feature-table filter-samples --help
# TO SKIP FILTERING BY SAMPLE METADATA: provide "none" (default) or don't run filtering commands.
# EXAMPLE: "[region]='Open Water'"
metadata_filter: none

# FILTER BY FEATURE ID
# The file of feature IDs to be filtered must have a header line with two columns: 1. "featureid", 2. anything.
# The file must be named 00-data/repseqs_to_filter_{method}.tsv, where method is dada2-pe, dada2-se, or deblur-se.
# If filtering outliers, just copy 02-output-{method}-{filter}/02-alignment-tree/repseqs_to_filter_outliers.tsv to the above filename.
# Add any additional feature IDs to be filtered (no duplicates allowed).
# For more info run: qiime feature-table [filter-seqs|filter-features] --help
# TO SKIP FILTERING BY FEATURE ID: provide only nonsense feature IDs in the above files (default) or don't run filtering commands.

# FILTER BY TAXONOMY
# Features with taxonomy containing these terms will be filtered.
# Separate terms with commas (e.g., mitochondria,chloroplast,eukaryota,unassigned).
# Terms are not case-sensitive.
# For more info run: qiime taxa filter-seqs --help
# TO SKIP FILTERING BY TAXONOMY: provide a nonsense term or don't run filtering commands.

exclude_terms: eukaryota,archaea,mitochondria,chloroplast,unassigned

# FILTER BY LENGTH
# Set minimum and maximum sequence lengths to filter representative sequences by.
# Limits are inclusive, ie, greater than or equal to minimum, less than or equal to maximum.
# For more info run: qiime feature-table filter-seqs --help
# TO SKIP FILTERING BY LENGTH: set values to extreme values, eg (0, 10000) or don't run filtering commands.

repseq_min_length: 0
repseq_max_length: 260

# FILTER BY ABUNDANCE & PREVALENCE
# Set minimum abundance/prevalence limits for filtering.
# Values are floats range(0,1)
# Limit is inclusive, ie, greater than or equal to minimum. Samples with frequency of 0 after filtering will also be removed.
# For more info run: qiime feature-table filter-features-conditionally --help
# TO SKIP FILTERING BY ABUNDANCE/PREVALENCE: set values to 0 or don't run filtering commands.

repseq_min_abundance: 0.01
repseq_min_prevalence: 0.1

# THREADS

# Max number of threads for individual rules.
# Threads used will be the lower of this and snakemake parameter --cores.
# Parameter other_threads is used for all other rules, regardless if they can use multiple threads, 
# because it prevents multiple rules from running simultaneously with --cores >1.

dada2pe_threads: 8
dada2se_threads: 8
deblur_threads: 8
alignment_threads: 8
feature_classifier_threads: 8
phylogeny_fasttree_threads: 8
diversity_core_metrics_phylogenetic_threads: 8
other_threads: 8