-
Notifications
You must be signed in to change notification settings - Fork 0
/
config.yaml
209 lines (175 loc) · 5.87 KB
/
config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
%YAML 1.1
---
###################################################################
#### _______ _ _____ ####
#### /\ |__ __| | | /\ / ____| ####
#### / \ | | | | / \ | (___ ####
#### / /\ \ | | | | / /\ \ \___ \ ####
#### / ____ \ | | | |____ / ____ \ ____) | ####
#### /_/ \_\ |_| |______| /_/ \_\ |_____/ ####
#### ####
###################################################################
# For more details about the config values see:
# https://metagenome-atlas.rtfd.io
########################
# Execution parameters
########################
# Threads and memory (GB) for most jobs, especially those from BBtools,
# which are memory demanding.
threads: 8
mem: 60
# Threads and memory for jobs needing a high amount of memory,
# e.g. GTDB-tk, checkm or assembly.
large_mem: 100
large_threads: 8
assembly_threads: 8
assembly_memory: 250
# Runtime in hours; only used for cluster execution.
# The three entries below are children of `runtime` and must stay indented.
runtime:
  default: 5
  assembly: 48
  long: 24
# Local directory for temp files, useful for cluster execution without a
# shared file system.
tmpdir: /tmp
# Directory where databases are downloaded with 'atlas download'.
database_dir: /srv/beegfs/scratch/users/k/kiesers/Atlas/databases
########################
# Quality control
########################
data_type: metagenome  # metagenome or metatranscriptome
interleaved_fastqs: false
# Remove (PCR)-duplicated reads using clumpify.
deduplicate: true
duplicates_only_optical: false
duplicates_allow_substitutions: 2
# Used to trim adapters from reads and read ends.
preprocess_adapters: /srv/beegfs/scratch/users/k/kiesers/Atlas/databases/adapters.fa
preprocess_minimum_base_quality: 10
preprocess_minimum_passing_read_length: 51
# 0.05 requires at least 5 percent of each nucleotide per sequence.
preprocess_minimum_base_frequency: 0.05
preprocess_adapter_min_k: 8
preprocess_allowable_kmer_mismatches: 1
preprocess_reference_kmer_match_length: 27
# Error correction where PE reads overlap.
error_correction_overlapping_pairs: true
# Contamination references: add one indented entry per reference, as
#   key: /path/to/fasta
contaminant_references:
  PhiX: /srv/beegfs/scratch/users/k/kiesers/Atlas/databases/phiX174_virus.fa
contaminant_max_indel: 20
contaminant_min_ratio: 0.65
contaminant_kmer_length: 13
contaminant_minimum_hits: 1
contaminant_ambiguous: best
########################
# Pre-assembly-processing
########################
error_correction_before_assembly: true
# Join R1 and R2 at their overlap; reads that stay unjoined are still utilized.
merge_pairs_before_assembly: true
merging_k: 62
merging_extend2: 40  # extend reads while merging to this many nucleotides
# Iterations are performed until extend2 x iterations
merging_flags: 'ecct iterations=5'
########################
# Assembly
########################
# Assembler to use: megahit OR spades
assembler: spades
# Megahit
#-----------
megahit_min_count: 2  # 2 is for metagenomes, 3 for genomes with 30x coverage
megahit_k_min: 21
megahit_k_max: 121
megahit_k_step: 20
# Quoted so the comma-separated pair is unambiguously a string.
megahit_merge_level: '20,0.98'
megahit_prune_level: 2
megahit_low_local_ratio: 0.2
megahit_preset: default  # ['default','meta-large','meta-sensitive']
# Spades
#------------
spades_skip_BayesHammer: true
# false: use contigs
spades_use_scaffolds: false
# Comma-separated list of k-mer sizes to be used (all values must be odd,
# less than 128 and listed in ascending order).
spades_k: auto
# One of: meta, normal, rna. Single-end libraries don't work for metaspades.
spades_preset: meta
spades_extra: ""
# One of: none, "pacbio", "nanopore", "sanger", "trusted-contigs",
# "untrusted-contigs"
longread_type: none
# Preprocessed long reads can be defined in the sample table with 'longreads';
# for more info see the spades manual.
# Filtering
#------------
# Filter out assembled noise; this is more important for assemblies from megahit.
filter_contigs: true
prefilter_minimum_contig_length: 200
contig_trim_bp: 0  # trim contig tips
# Require contigs to have read support.
minimum_average_coverage: 1
minimum_percent_covered_bases: 20
minimum_mapped_reads: 0
minimum_contig_length: 300  # after filtering
########################
# Quantification
########################
# Mapping reads to contigs
# --------------------------
maximum_counted_map_sites: 10
contig_max_distance_between_pairs: 1000
contig_map_paired_only: true
contig_min_id: 0.9
########################
# Binning
########################
final_binner: DASTool  # [DASTool or one of the binners, e.g. maxbin]
# If DASTool is used as final_binner, the predictions of these binners are used.
binner:
  - metabat
  - maxbin
# Per-binner settings. Each binner has its own mapping, so the two
# `min_contig_length` keys below do not collide.
metabat:
  sensitivity: sensitive
  min_contig_length: 1500  # metabat needs >=1500
maxbin:
  max_iteration: 50
  prob_threshold: 0.9
  min_contig_length: 1000
DASTool:
  search_engine: diamond
  # Score threshold until selection algorithm will keep selecting bins [0..1].
  score_threshold: 0.5
# `filter` and `score` are separate sub-mappings, so their `length`,
# `completeness` and `contamination` keys do not collide.
# NOTE(review): nesting reconstructed from key grouping; confirm against the
# ATLAS default config.
genome_dereplication:
  ANI: 0.95
  overlap: 0.6
  opt_parameters: ''
  filter:
    noFilter: false
    length: 5000
    completeness: 50
    contamination: 10
  score:
    completeness: 1
    contamination: 5
    N50: 0.5
    length: 0
rename_mags_contigs: true  # Rename contigs of representative MAGs
########################
# Annotations
#######################
# Annotation steps to run; commented-out entries are disabled.
annotations:
  - gtdb_tree
  - gtdb_taxonomy
  - genes
  # - checkm_taxonomy
  # - checkm_tree
########################
# Gene catalog
#######################
# NOTE(review): nesting reconstructed from key grouping; confirm that
# `SubsetSize` belongs to `genecatalog` and `eggNOG_use_virtual_disk` is
# top-level against the ATLAS default config.
genecatalog:
  # [contigs, genomes] Predict genes from all contigs or only from the
  # representative genomes.
  source: contigs
  # [cd-hit-est or mmseqs or linclust] see mmseqs for more details
  clustermethod: linclust
  minlength_nt: 100
  # Min id for gene clustering for the main gene catalog used for annotation.
  minid: 0.95
  coverage: 0.9
  extra: ''
  SubsetSize: 500000
eggNOG_use_virtual_disk: true