-
Notifications
You must be signed in to change notification settings - Fork 1
/
config.yaml
34 lines (31 loc) · 1.87 KB
/
config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# General settings.
verbose: true
seed: 94305  # random seed
# Simulation settings.
# NOTE(review): nesting below was reconstructed from the key names and
# comments (the flat form repeats train1/train2/val and gens at one level) --
# confirm against the code that reads this file.
simulation:
  # Whether or not to run the simulation (can be skipped if previously done).
  run: true
  # Where to store the simulated data. The default is
  # <output_folder>/generated_data/. If run is false, this is where the
  # simulation data will be sought.
  path: null
  splits:
    # How to split the data for training the model.
    ratios:
      train1: 0.8
      train2: 0.15
      # Set to 0 in order to not perform any validation after training the
      # model.
      val: 0.05
    gens:
      train1: [0, 2, 4, 6, 8, 12, 16, 24]
      train2: [0, 2, 4, 6, 8, 12, 16, 24]
      val: [2, 4, 6, 8, 12, 16, 24]
  # Which generations to simulate. The exact choice is not critical, but some
  # accuracy can be squeezed out if it better represents the query data.
  gens: [0, 2, 4, 6, 8, 12, 16, 24]
  # Set it lower if memory is an issue.
  r_admixed: 0.6
  # Removes simulated data once it has been used (to reduce disk space usage).
  rm_data: false
# Model settings.
# NOTE(review): children are nested under `model` on the assumption that the
# flattened source lost its indentation -- confirm against the code that
# reads this file.
model:
  # Model name for storing model + analysis.
  name: model
  # If null, the default is used (best, fast, large are alternative options).
  inference: null
  # Size of each base window in centiMorgans. Use larger windows if SNP
  # density is lower (for example for genotype data).
  window_size_cM: 0.2
  # How many windows the smoothers see.
  smooth_size: 75
  # Context for each window will be context_ratio * window_size on each side.
  context_ratio: 0.5
  # Retrain the base models once smoother data has been obtained.
  retrain_base: true
  # Calibrate the output probabilities.
  calibrate: false
  # Number of CPUs to use; default is maximum.
  n_cores: 8
# Inference output settings.
# NOTE(review): children are nested under `inference` on the assumption that
# the flattened source lost its indentation -- confirm against the code that
# reads this file.
inference:
  # Generate files for each individual that show the run-length encoding of
  # their ancestry segments.
  bed_file_output: false
  # Output ancestry inference for each marker of the query file.
  snp_level_inference: false
  # Visualize chromosome paintings (beta version).
  visualize_inference: false