-
Notifications
You must be signed in to change notification settings - Fork 17
/
Copy pathSnakefile
342 lines (309 loc) · 12.9 KB
/
Snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
from os.path import join
# configuration file
if 'config' not in vars() or not config or 'ntrees' not in config:
configfile: "config.yaml"
# output directory
output_dir=config["output_dir"]
# MAF file containing mutations
mutations=config["mutations"]
# pre-trained classifier
trained_classifier=config["trained_classifier"]
# flag for CV
cv="--cv"
# number of trees in RF
ntrees=config['ntrees']
ntrees2=5*ntrees
# indication of whether to drop silent mutations
drop_silent="no"
if "drop_silent" in config:
drop_silent = config['drop_silent']
if drop_silent == "yes":
drop_silent_flag = "--drop-silent"
else:
drop_silent_flag = ""
# params for simulations
num_iter=10
ids=list(map(str, range(1, num_iter+1)))
# minimum recurrent missense
min_recur=3
###################################
# Top-level rules
###################################
rule all:
input: join(output_dir, "output/results/r_random_forest_prediction.txt")
# same rule is "all", but semantically more meaningful
rule predict:
"""
Predict on a pan-cancer set of somatic mutations from multiple cancer types.
This command will simultaneous train 20/20+ and make predictions using
gene hold-out cross-validation. The predict command uses the following parameters:
Input
-----
mutations : MAF file
MAF file containing mutations. Please see http://probabilistic2020.readthedocs.io/en/latest/tutorial.html#mutations for details on file format.
Output
------
output_dir : directory
Path of directory to save output. The results are save in the
"output/results/r_random_forest_prediction.txt" file.
"""
input: join(output_dir, "output/results/r_random_forest_prediction.txt")
# top-level rule to only train the 20/20+ random forest
rule train:
"""
Train a 20/20+ model to predict cancer driver genes. The trained model can
be used for subsequent prediction. The train command uses the following parameters:
Input
-----
mutations : MAF file
MAF file containing mutations. Please see http://probabilistic2020.readthedocs.io/en/latest/tutorial.html#mutations for details on file format.
Output
------
output_dir : directory
Path to file directory to save output. The saved model file from
20/20+ will be named 2020plus.Rdata by default.
"""
input: join(output_dir, "2020plus.Rdata")
# use an already trained 20/20+ random forest to predict new data
rule pretrained_predict:
"""
Predict cancer driver genes using a pre-trained 20/20+ model from the "train" command. The pretrained_predict command uses the following parameters:
Input
-----
mutations : MAF file
MAF file containing mutations. Please see http://probabilistic2020.readthedocs.io/en/latest/tutorial.html#mutations for details on file format.
trained_classifier : .Rdata file
File path of saved R workspace containing the trained 20/20+ model.
Output
------
output_dir : directory
File path of directory to save output. The results are save in the
"pretrained_output/results/r_random_forest_prediction.txt" file.
"""
input: join(output_dir, "pretrained_output/results/r_random_forest_prediction.txt")
rule help:
"""
Print list of all targets with help.
"""
run:
print('Input and output parameters are specified via the command line or in the config.yaml file. If done via the command line, e.g., the "trained_classifier" option would be specified by the following argument:\n\n--config trained_classifier="data/2020plus_100k.Rdata"\n\nMultiple options can follow after the --config flag.\n')
myhelp = ['predict', 'train', 'pretrained_predict', 'help']
for myrule in workflow.rules:
if myrule.name in myhelp:
print('='*len(myrule.name))
print(myrule.name)
print('='*len(myrule.name))
print(myrule.docstring)
print('See "snakemake --help" for additional snakemake command line help documentation.\n')
###################################
# Code for calculating empirical null
# distribution based on simulations
###################################
# Simulate MAF files for subsequent running by oncogene/tsg test
rule simMaf:
input:
MUTATIONS=mutations
params:
min_recur=min_recur,
data_dir=config["data_dir"],
dropsilent=drop_silent_flag
output:
join(output_dir, "simulated_summary/chasm_sim_maf{iter,[0-9]+}.txt")
shell:
"mut_annotate --log-level=INFO {params.dropsilent} "
" -b {params.data_dir}/snvboxGenes.bed -i {params.data_dir}/snvboxGenes.fa -c 1.5 "
" -m {input.MUTATIONS} -p 0 -n 1 --maf --seed=$(({wildcards.iter}*42)) "
" -r {params.min_recur} --unique -o {output}"
# calculate summarized features for the simulated mutations
rule simSummary:
input:
MUTATIONS=mutations
params:
min_recur=min_recur,
data_dir=config["data_dir"],
dropsilent=drop_silent_flag
output:
join(output_dir, "simulated_summary/chasm_sim_summary{iter}.txt")
shell:
"mut_annotate --log-level=INFO {params.dropsilent} "
" -b {params.data_dir}/snvboxGenes.bed -i {params.data_dir}/snvboxGenes.fa "
" -c 1.5 -m {input.MUTATIONS} -p 0 -n 1 --summary --seed=$(({wildcards.iter}*42)) "
" --score-dir={params.data_dir}/scores "
" --unique -r {params.min_recur} -o {output}"
# run probabilistic2020 tsg statistical test on simulated MAF
rule simTsg:
input:
join(output_dir, "simulated_summary/chasm_sim_maf{iter}.txt")
params:
num_sim=config["NUMSIMULATIONS"],
data_dir=config["data_dir"]
threads: 10
output:
join(output_dir, "simulated_summary/tsg_sim{iter}.txt")
shell:
"probabilistic2020 --log-level=INFO tsg "
" -c 1.5 -n {params.num_sim} -b {params.data_dir}/snvboxGenes.bed "
" -m {input} -i {params.data_dir}/snvboxGenes.fa -p {threads} -d 1 "
" -o {output} "
# run probabilistic2020 oncogene statistical test on simulated MAF
rule simOg:
input:
mutations=join(output_dir, "simulated_summary/chasm_sim_maf{iter}.txt")
params:
min_recur=min_recur,
num_sim=config["NUMSIMULATIONS"],
data_dir=config["data_dir"]
threads: 10
output:
join(output_dir, "simulated_summary/oncogene_sim{iter}.txt")
shell:
"probabilistic2020 --log-level=INFO oncogene "
" -c 1.5 -n {params.num_sim} -b {params.data_dir}/snvboxGenes.bed "
" -m {input.mutations} -i {params.data_dir}/snvboxGenes.fa -p {threads} "
" --score-dir={params.data_dir}/scores -r {params.min_recur} "
" -o {output}"
# Combine the results from simOg, simTsg, and simSummary
rule simFeatures:
input:
summary=join(output_dir, "simulated_summary/chasm_sim_summary{iter}.txt"),
og=join(output_dir, "simulated_summary/oncogene_sim{iter}.txt"),
tsg=join(output_dir, "simulated_summary/tsg_sim{iter}.txt")
params:
data_dir=config["data_dir"]
output:
join(output_dir, "simulated_summary/simulated_features{iter}.txt")
shell:
"python `which 2020plus.py` features "
" -s {input.summary} --tsg-test {input.tsg} -og-test {input.og} "
" -o {output}"
# final processing of the simulation results
rule finishSim:
input:
expand(join(output_dir, "simulated_summary/simulated_features{iter}.txt"), iter=ids)
output:
join(output_dir, "simulated_summary/simulated_features.txt")
shell:
'cat {input} | awk -F"\t" \'{{OFS="\t"}} NR == 1 || !/^gene/\' - > ' + output_dir + '/simulated_summary/tmp_simulated_features.txt ; '
'cat '+output_dir+'/simulated_summary/tmp_simulated_features.txt | awk -F"\t" \'{{OFS="\t"}}{{if(NR != 1) printf (NR"\t"); if(NR!=1) for(i=2; i<NF; i++) printf ($i"\t"); if(NR != 1) print $i; if(NR==1) print $0}}\' - > {output}'
###################################
# Code for calculating results on
# actually observed mutations
###################################
# calculate summarized features for the observed mutations
rule summary:
input:
mutations=mutations
params:
min_recur=min_recur,
data_dir=config["data_dir"]
output:
join(output_dir, "summary.txt")
shell:
"mut_annotate --log-level=INFO "
" -b {params.data_dir}/snvboxGenes.bed -i {params.data_dir}/snvboxGenes.fa "
" -c 1.5 -m {input.mutations} -p 0 -n 0 --summary "
" --score-dir={params.data_dir}/scores "
" --unique -r {params.min_recur} -o {output}"
# run probabilistic2020 tsg statistical test on MAF
rule tsg:
input:
mutations
params:
num_sim=config["NUMSIMULATIONS"],
data_dir=config["data_dir"]
threads: 10
output:
join(output_dir, "tsg.txt")
shell:
"probabilistic2020 -v --log-level=INFO tsg "
" -c 1.5 -n {params.num_sim} -b {params.data_dir}/snvboxGenes.bed "
" -m {input} -i {params.data_dir}/snvboxGenes.fa -p {threads} -d 1 "
" -o {output} "
# run probabilistic2020 oncogene statistical test on MAF
rule og:
input:
mutations=mutations
params:
min_recur=min_recur,
num_sim=config["NUMSIMULATIONS"],
data_dir=config["data_dir"]
threads: 10
output:
join(output_dir, "oncogene.txt")
shell:
"probabilistic2020 -v --log-level=INFO oncogene "
" -c 1.5 -n {params.num_sim} -b {params.data_dir}/snvboxGenes.bed "
" -m {input.mutations} -i {params.data_dir}/snvboxGenes.fa -p {threads} "
" --unique --score-dir={params.data_dir}/scores -r {params.min_recur} "
" -o {output}"
# Combine the results from og, tsg, and summary
rule features:
input:
summary=join(output_dir, "summary.txt"),
og=join(output_dir, "oncogene.txt"),
tsg=join(output_dir, "tsg.txt")
params:
data_dir=config["data_dir"]
output:
join(output_dir, "features.txt")
shell:
"python `which 2020plus.py` features "
" -s {input.summary} --tsg-test {input.tsg} -og-test {input.og} "
" -o {output}"
# perform prediction by random forest
# in this case the data is pan-cancer
# and so a cross-validation loop is performed
rule cv_predict:
input:
features=join(output_dir, "features.txt"),
sim_features=join(output_dir, "simulated_summary/simulated_features.txt"),
params:
ntrees=ntrees,
ntrees2=ntrees2,
data_dir=config["data_dir"],
output_dir=config["output_dir"]
output:
join(output_dir, "output/results/r_random_forest_prediction.txt"),
join(output_dir, "trained.Rdata")
shell:
"""
python `which 2020plus.py` --log-level=INFO train -d .7 -o 1.0 -n {{params.ntrees2}} -r {outdir}/trained.Rdata --features={{input.features}} --random-seed 71
python `which 2020plus.py` --log-level=INFO classify --trained-classifier {outdir}/trained.Rdata --null-distribution {outdir}/simulated_null_dist.txt --features {{input.sim_features}} --simulated
python `which 2020plus.py` --out-dir {outdir}/output --log-level=INFO classify -n {{params.ntrees}} -d .7 -o 1.0 --features {{input.features}} --null-distribution {outdir}/simulated_null_dist.txt --random-seed 71
""".format(outdir=output_dir)
#############################
# Rules for just training on
# pan-cancer data
#############################
rule train_pancan:
input:
features=join(output_dir, "features.txt")
params:
ntrees=ntrees,
data_dir=config["data_dir"],
output_dir=config["output_dir"]
output:
join(output_dir, "2020plus.Rdata")
shell:
"""
python `which 2020plus.py` --log-level=INFO train -d .7 -o 1.0 -n {{params.ntrees}} --features={{input.features}} {cv} --random-seed 71 -r {outdir}/2020plus.Rdata
""".format(outdir=output_dir, cv=cv)
#############################
# Rules for predicting using
# a trained classifier on a separate
# mutation data set
#############################
rule predict_test:
input:
trained_classifier=trained_classifier,
features=join(output_dir, "features.txt"),
sim_features=join(output_dir, "simulated_summary/simulated_features.txt"),
params:
ntrees=ntrees,
output:
join(output_dir, "pretrained_output/results/r_random_forest_prediction.txt")
shell:
"""
python `which 2020plus.py` --log-level=INFO classify --trained-classifier {{input.trained_classifier}} --null-distribution {outdir}/simulated_null_dist.txt --features {{input.sim_features}} --simulated {cv}
python `which 2020plus.py` --out-dir {outdir}/pretrained_output --log-level=INFO classify -n {{params.ntrees}} --trained-classifier {{input.trained_classifier}} -d .7 -o 1.0 --features {{input.features}} --null-distribution {outdir}/simulated_null_dist.txt --random-seed 71 {cv}
""".format(outdir=output_dir, cv=cv)