forked from aomlomics/tourmaline
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconfig.yaml
248 lines (192 loc) · 10.3 KB
/
config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
# config.yaml - configuration file for the Tourmaline Snakemake workflow
# Compatible with qiime2-2023.5
# User MUST edit these parameters before running their own data.
# Detailed instructions: https://github.com/aomlomics/tourmaline/wiki.
# METADATA FILE
# Metadata file must be named as follows (or use symbolic link):
# - 00-data/metadata.tsv
# Standardization is recommended through use of MIxS/MIMARKS (https://gensc.org/mixs/) column headers, for example:
# - submitted_to_insdc: {boolean}
# - investigation_type: [eukaryote|bacteria_archaea|plasmid|virus|organelle|metagenome|metatranscriptome|mimarks-survey|mimarks-specimen|misag|mimag|miuvig]
# - project_name: {text}
# - experimental_factor (text or EFO and/or OBI): {termLabel} {[termID]}|{text}
# - lat_lon (decimal degrees): {float} {float}
# FASTQ DATA (choose one option)
# Option 1 - if you want to start from manifest files and import per-sample fastq sequences,
# name your manifest file(s) as follows (or use symbolic links):
# - paired-end sequences: 00-data/manifest_pe.csv
# - single-end sequences: 00-data/manifest_se.csv
# Option 2 - if your fastq sequences are already archived in qza format,
# name your qza file(s) as follows (or use symbolic links):
# - paired-end sequences: 01-imported/fastq_pe.qza
# - single-end sequences: 01-imported/fastq_se.qza
# REFERENCE DATABASE (choose one option)
# Option 1 - if your reference database is not yet imported,
# name your reference fna and tsv files as follows (or use symbolic links):
# - reference sequences: 00-data/refseqs.fna
# - reference taxonomy: 00-data/reftax.tsv
# Option 2 - if your reference database is already archived in qza format,
# name your qza files as follows (or use symbolic links):
# - reference sequences: 01-imported/refseqs.qza
# - reference taxonomy: 01-imported/reftax.qza
# Option 3 - if you will use a pre-built taxonomic classifier in qza format,
# no refseqs or reftax files are required in 00-data or 01-imported.
# Rather, put the classifier in the following location:
# - classifier: 01-imported/classifier.qza
# REFERENCE DATABASE NAME
# A descriptive, space-free label for the reference database in use (e.g., original file name, source and version).
# This label is for documentation purposes only: changing it has no effect on which file is used.
# The file used is the one symbolically linked according to Option 1, 2 or 3 above.
database_name: 'silva-138-99-515-806_q2-2021.2'
# DENOISE
# DADA2 PAIRED-END
# Option details are listed by: qiime dada2 denoise-paired --help
# Truncation and trimming values are absolute nucleotide positions, so the order in which they are applied is irrelevant.
dada2pe_trunc_len_f: 240
dada2pe_trunc_len_r: 190
dada2pe_trim_left_f: 0
dada2pe_trim_left_r: 0
dada2pe_max_ee_f: 2
dada2pe_max_ee_r: 2
dada2pe_trunc_q: 2
dada2pe_pooling_method: 'independent'
dada2pe_chimera_method: 'consensus'
dada2pe_min_fold_parent_over_abundance: 1
dada2pe_n_reads_learn: 1000000
# The value is the literal command-line flag text, quoted so it is unambiguously a string.
# NOTE(review): presumably inserted verbatim into the qiime command - confirm in the Snakefile.
dada2pe_hashed_feature_ids: '--p-hashed-feature-ids'
# DADA2 SINGLE-END
# Option details are listed by: qiime dada2 denoise-single --help
# Truncation and trimming values are absolute nucleotide positions, so the order in which they are applied is irrelevant.
dada2se_trunc_len: 240
dada2se_trim_left: 0
dada2se_max_ee: 2
dada2se_trunc_q: 2
dada2se_pooling_method: 'independent'
dada2se_chimera_method: 'consensus'
dada2se_min_fold_parent_over_abundance: 1
dada2se_n_reads_learn: 1000000
# The value is the literal command-line flag text, quoted so it is unambiguously a string.
# NOTE(review): presumably inserted verbatim into the qiime command - confirm in the Snakefile.
dada2se_hashed_feature_ids: '--p-hashed-feature-ids'
# DEBLUR SINGLE-END
# Option details are listed by: qiime deblur denoise-other --help
deblur_trim_length: 240
# Flag-style values are written as the literal command-line flag text, quoted so they are unambiguously strings.
deblur_sample_stats: '--p-sample-stats'
deblur_mean_error: 0.005
deblur_indel_prob: 0.01
deblur_indel_max: 3
deblur_min_reads: 10
deblur_min_size: 2
deblur_hashed_feature_ids: '--p-hashed-feature-ids'
# TAXONOMIC CLASSIFICATION
# Assigns taxonomy to the representative sequences.
# - classify_method: one of naive-bayes, consensus-blast, consensus-vsearch
# - classify_parameters: any additional parameters to pass; provide at least one
#   (see "qiime feature-classifier COMMAND --help")
# For more info run: qiime feature-classifier [fit-classifier-naive-bayes|classify-sklearn|classify-consensus-blast|classify-consensus-vsearch] --help
classify_method: 'consensus-vsearch'
classify_parameters: '--verbose'
# The table of feature counts per sample is collapsed at the taxonomic level chosen here.
# - classify_taxalevel: taxonomic level at which the features should be collapsed (default 7)
# For more info run: qiime taxa collapse --help
classify_taxalevel: 7
# MULTIPLE SEQUENCE ALIGNMENT
# Aligns the representative sequences.
# - alignment_method: one of muscle (v5), clustalo, mafft
# - alignment_muscle_iters: number of refinement iterations (integer, default 100)
# For more info run: muscle, clustalo --help, qiime alignment [mafft|mask] --help
alignment_method: 'muscle'
alignment_muscle_iters: 50
# OUTLIER DETECTION
# Detects outliers among the representative sequences using odseq.
# - odseq_distance_metric: one of linear, affine
# - odseq_bootstrap_replicates: number of bootstrap replicates (integer)
# - odseq_threshold: probability to be at right of the bootstrap scores distribution (float)
# For more info see: https://www.bioconductor.org/packages/release/bioc/html/odseq.html
odseq_distance_metric: 'linear'
odseq_bootstrap_replicates: 100
odseq_threshold: 0.025
# SUBSAMPLING (RAREFACTION)
# Subsample (rarefy) the data so that every sample has the same number of observations.
# - core_sampling_depth: subsampling depth for core diversity analyses
# - alpha_max_depth: maximum subsampling depth for alpha diversity rarefaction analysis
core_sampling_depth: 500
alpha_max_depth: 500
# BETA GROUP SIGNIFICANCE
# Statistical test for a difference between samples grouped by a metadata variable (column).
# - beta_group_column: metadata column to test; it must appear in the metadata file 00-data/metadata.tsv
# - beta_group_method: one of permanova, anosim, permdisp
# - beta_group_pairwise: one of --p-no-pairwise, --p-pairwise (pairwise can be very slow)
beta_group_column: 'region'
beta_group_method: 'permanova'
beta_group_pairwise: '--p-pairwise'
# DEICODE BETA DIVERSITY
# Robust Aitchison PCA (and biplot ordination) with automatically estimated underlying low-rank structure.
# Parameters are set to recommended defaults:
# - deicode_min_sample_count: minimum sum cutoff of samples across all features
# - deicode_min_feature_count: minimum sum cutoff of features across all samples
# - deicode_min_feature_frequency: minimum percentage of samples in which a feature must appear with a value greater than zero
# - deicode_max_iterations: number of iterations to optimize the solution
# - deicode_num_features: number of most important features (arrows) to display in the biplot ordination
# For more info run: qiime deicode auto-rpca --help
deicode_min_sample_count: 500
deicode_min_feature_count: 10
deicode_min_feature_frequency: 0
deicode_max_iterations: 5
deicode_num_features: 5
# REPORT THEME
# Theme applied to the html version of the report.
# Available themes: github, gothic, newsprint, night, pixyll, whitey.
report_theme: 'github'
# FILTERING
# Filtering happens only when a filtering command is run, e.g., "snakemake dada2_pe_report_filtered".
# It is applied to each of the representative sequences, the taxonomy, and the feature table.
# FILTER SAMPLES BY ID
# List the sample IDs to remove in the 1st column of a file that has a header line; any number of additional columns is allowed.
# The file must be named 00-data/samples_to_filter_{method}.tsv, where method is dada2-pe, dada2-se, or deblur-se.
# For more info run: qiime feature-table filter-samples --help
# TO SKIP FILTERING BY SAMPLE ID: provide a file with only headers (default) or don't run filtering commands.
# FILTER SAMPLES BY METADATA
# Samples can be removed or retained based on their metadata, using SQLite WHERE-clause syntax inside double quotes.
# For more info run: qiime feature-table filter-samples --help
# TO SKIP FILTERING BY SAMPLE METADATA: provide "none" (default) or don't run filtering commands.
# EXAMPLE: "[region]='Open Water'"
# The value below is the string "none" (not a YAML null), quoted to make that explicit.
metadata_filter: 'none'
# FILTER BY FEATURE ID
# The file of feature IDs to filter must have a header line with two columns: 1. "featureid", 2. anything.
# The file must be named 00-data/repseqs_to_filter_{method}.tsv, where method is dada2-pe, dada2-se, or deblur-se.
# To filter outliers, copy 02-output-{method}-{filter}/02-alignment-tree/repseqs_to_filter_outliers.tsv to the above filename,
# then add any additional feature IDs to be filtered (no duplicates allowed).
# For more info run: qiime feature-table [filter-seqs|filter-features] --help
# TO SKIP FILTERING BY FEATURE ID: provide only nonsense feature IDs in the above files (default) or don't run filtering commands.
# FILTER BY TAXONOMY
# Features whose taxonomy contains any of these terms will be filtered.
# Terms are comma-separated (e.g., mitochondria,chloroplast,eukaryota,unassigned) and are not case-sensitive.
# For more info run: qiime taxa filter-seqs --help
# TO SKIP FILTERING BY TAXONOMY: provide a nonsense term or don't run filtering commands.
exclude_terms: 'eukaryota,archaea,mitochondria,chloroplast,unassigned'
# FILTER BY LENGTH
# Minimum and maximum sequence lengths used to filter the representative sequences.
# Limits are inclusive, i.e., greater than or equal to the minimum and less than or equal to the maximum.
# For more info run: qiime feature-table filter-seqs --help
# TO SKIP FILTERING BY LENGTH: set extreme values, e.g., (0, 10000), or don't run filtering commands.
repseq_min_length: 0
repseq_max_length: 260
# FILTER BY ABUNDANCE & PREVALENCE
# Minimum abundance and prevalence limits used for filtering.
# Values are floats between 0 and 1.
# The limit is inclusive, i.e., greater than or equal to the minimum. Samples with a frequency of 0 after filtering will also be removed.
# For more info run: qiime feature-table filter-features-conditionally --help
# TO SKIP FILTERING BY ABUNDANCE/PREVALENCE: set values to 0 or don't run filtering commands.
repseq_min_abundance: 0.01
repseq_min_prevalence: 0.1
# THREADS
# Maximum number of threads for individual rules.
# The number of threads actually used is the lower of this value and the snakemake parameter --cores.
# The other_threads parameter applies to all remaining rules, regardless of whether they can use multiple threads,
# because it prevents multiple rules from running simultaneously with --cores >1.
dada2pe_threads: 8
dada2se_threads: 8
deblur_threads: 8
alignment_threads: 8
feature_classifier_threads: 8
phylogeny_fasttree_threads: 8
diversity_core_metrics_phylogenetic_threads: 8
other_threads: 8