config.yaml

%YAML 1.1
---
###################################################################
####                 _______   _                    _____      ####
####         /\     |__   __| | |          /\      / ____|     ####
####        /  \       | |    | |         /  \    | (___       ####
####       / /\ \      | |    | |        / /\ \    \___ \      ####
####      / ____ \     | |    | |____   / ____ \   ____) |     ####
####     /_/    \_\    |_|    |______| /_/    \_\ |_____/      ####
####                                                           ####
###################################################################

#  For more details about the config values see:
#  https://metagenome-atlas.rtfd.io


########################
# Execution parameters
########################
# Threads and memory (GB) for most jobs, especially those from BBtools,
# which are memory-demanding.
threads: 8
mem: 60

# Threads and memory for jobs needing a large amount of memory,
# e.g. GTDB-tk, checkm or assembly.
# NOTE(review): these memory values are presumably in GB as well, like `mem` — confirm.
large_mem: 100
large_threads: 8
assembly_threads: 8
assembly_memory: 250

# Runtime limits, in hours, for the `default`, `assembly` and `long` job
# categories. Only used for cluster execution.
runtime: # in hours
  default: 5
  assembly: 48
  long: 24

# Local directory for temporary files; useful for cluster execution
# without a shared file system.
tmpdir: /tmp
# Directory where databases are downloaded by 'atlas download'.
database_dir: /srv/beegfs/scratch/users/k/kiesers/Atlas/databases

########################
# Quality control
########################
# Kind of sequencing data being processed.
data_type: metagenome # metagenome or metatranscriptome
# NOTE(review): presumably true when each input fastq holds both mates of a
# pair interleaved — confirm against the atlas docs.
interleaved_fastqs: false

# Remove (PCR-)duplicated reads using clumpify.
deduplicate: true
duplicates_only_optical: false
duplicates_allow_substitutions: 2

# Adapter sequences used to trim adapters from reads and read ends.
preprocess_adapters: /srv/beegfs/scratch/users/k/kiesers/Atlas/databases/adapters.fa
preprocess_minimum_base_quality: 10
preprocess_minimum_passing_read_length: 51
# 0.05 requires each nucleotide to make up at least 5 percent of a sequence.
preprocess_minimum_base_frequency: 0.05
preprocess_adapter_min_k: 8
preprocess_allowable_kmer_mismatches: 1
preprocess_reference_kmer_match_length: 27
# Error correction where paired-end (PE) reads overlap.
error_correction_overlapping_pairs: true
# Contamination references; more can be added as extra entries of the
# form `name: /path/to/fasta` under `contaminant_references`.
contaminant_references:
  PhiX: /srv/beegfs/scratch/users/k/kiesers/Atlas/databases/phiX174_virus.fa
# NOTE(review): the settings below presumably tune how reads are matched
# against the contaminant references — confirm against the atlas docs.
contaminant_max_indel: 20
contaminant_min_ratio: 0.65
contaminant_kmer_length: 13
contaminant_minimum_hits: 1
contaminant_ambiguous: best


########################
# Pre-assembly-processing
########################

error_correction_before_assembly: true

# Join R1 and R2 where they overlap; reads that cannot be joined are still used.
merge_pairs_before_assembly: true
merging_k: 62
# While merging, extend reads by this many nucleotides.
merging_extend2: 40
# Extra merging flags. Extension is repeated over several iterations, up to
# (extend2 x iterations) nucleotides in total.
# NOTE(review): wording reconstructed from the original comment — confirm.
merging_flags: ecct iterations=5

########################
# Assembly
########################
# Assembler to use: megahit OR spades.
assembler: spades

# Megahit
#-----------
# Settings for the megahit assembler.
# 2 is for metagenomes, 3 for genomes with 30x coverage.
megahit_min_count: 2
megahit_k_min: 21
megahit_k_max: 121
megahit_k_step: 20
# Two comma-separated values. Quoted so it is always loaded as the literal
# string '20,0.98': some YAML loaders accept ',' as a digit separator and
# would otherwise resolve the plain scalar as a number.
megahit_merge_level: '20,0.98'
megahit_prune_level: 2
megahit_low_local_ratio: 0.2
# One of: ['default', 'meta-large', 'meta-sensitive']
megahit_preset: default

# Spades
#------------
# Settings for the spades assembler.
spades_skip_BayesHammer: true
spades_use_scaffolds: false # false: use contigs
# Comma-separated list of k-mer sizes to be used (all values must be odd,
# less than 128 and listed in ascending order).
spades_k: auto
spades_preset: meta    # [meta, normal, rna]; single-end libraries don't work with metaspades
spades_extra: ''
# The plain scalar `none` below is the string "none", not a YAML null.
longread_type: none # [none, "pacbio", "nanopore", "sanger", "trusted-contigs", "untrusted-contigs"]
# Preprocessed long reads can be defined in the sample table in the
# 'longreads' column; for more info see the spades manual.

# Filtering
#------------
# Filter out assembled noise.
# This is more important for assemblies from megahit.
filter_contigs: true
prefilter_minimum_contig_length: 200
# Trim contig tips by this many base pairs.
contig_trim_bp: 0
# Require contigs to have read support.
minimum_average_coverage: 1
minimum_percent_covered_bases: 20
minimum_mapped_reads: 0
# Minimum contig length applied after filtering.
minimum_contig_length: 300


########################
# Quantification
########################

# Mapping reads to contigs
#--------------------------
# NOTE(review): contig_min_id is presumably the minimum alignment identity
# as a fraction (0.9 = 90 %) — confirm against the atlas docs.
contig_min_id: 0.9
contig_map_paired_only: true
contig_max_distance_between_pairs: 1000
maximum_counted_map_sites: 10

########################
# Binning
########################

final_binner: DASTool             # [DASTool or one of the binners, e.g. maxbin]

binner:                           # If DASTool is used as final_binner, it uses the predictions of these binners
- metabat
- maxbin


# Settings for the metabat binner.
metabat:
  sensitivity: sensitive
  # NOTE(review): the original comment said "metabat needs >1500" while the
  # value is exactly 1500; the bound is presumably inclusive (>= 1500) — confirm.
  min_contig_length: 1500 # metabat needs >= 1500

# Settings for the maxbin binner.
maxbin:
  max_iteration: 50
  prob_threshold: 0.9
  min_contig_length: 1000

# Settings for DASTool.
DASTool:
  search_engine: diamond
  score_threshold: 0.5              # Score threshold until which the selection algorithm keeps selecting bins [0..1].

# Settings for dereplicating genomes.
# NOTE(review): `filter` presumably holds the quality thresholds a genome
# must meet, and `score` the per-criterion weights used to rank genomes —
# confirm against the atlas docs.
genome_dereplication:
  ANI: 0.95
  overlap: 0.6
  opt_parameters: ''
  filter:
    noFilter: false
    length: 5000
    completeness: 50
    contamination: 10
  score:
    completeness: 1
    contamination: 5
    N50: 0.5
    length: 0

rename_mags_contigs: true        # Rename contigs of representative MAGs

########################
# Annotations
########################

# Annotations to produce. To enable a commented-out entry, uncomment it and
# keep its dash in the same column as the entries above; a dash that is
# indented further is not read as a new list item.
annotations:
- gtdb_tree
- gtdb_taxonomy
- genes
# - checkm_taxonomy
# - checkm_tree

########################
# Gene catalog
########################
genecatalog:
  source: contigs               # [contigs, genomes] Predict genes from all contigs or only from the representative genomes
  clustermethod: linclust       # [cd-hit-est, mmseqs or linclust]; see the mmseqs documentation for more details
  minlength_nt: 100
  minid: 0.95                   # Minimum identity for gene clustering of the main gene catalog used for annotation
  coverage: 0.9
  extra: ''
  SubsetSize: 500000


# NOTE(review): presumably makes the eggNOG annotation step work from a
# virtual (in-memory) disk — confirm against the atlas docs.
eggNOG_use_virtual_disk: true