-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSnakefile
151 lines (131 loc) · 4.67 KB
/
Snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
nextcladev3 = "~/Projects/nextstrain/nextclade_dev/target/release/nextclade"
genemapv3 = "~/Projects/nextstrain/nextclade_dev/target/release/genemap"
genes = ["E", "M", "N", "ORF1a", "ORF1b", "ORF3a", "ORF6", "ORF7a", "ORF7b", "ORF8", "ORF9b", "S"]
rule all:
input:
"auspice/tree.json"
rule get_nextclade_dataset:
output:
directory("sars-cov-2_dataset")
shell:
"""
{nextcladev3} dataset get --name sars-cov-2 --output-dir {output}
"""
rule nextclade_annotation:
input:
dataset = directory("sars-cov-2_dataset")
output:
"data/annotation.json"
shell:
"""
{genemapv3} {input.dataset}/genemap.gff --json > {output}
"""
rule fetch_data:
output:
sequences = "data/sequences.fasta.xz",
metadata = "data/metadata.tsv",
shell:
"""
curl https://data.nextstrain.org/files/ncov/open/global/sequences.fasta.xz -o {output.sequences}
curl https://data.nextstrain.org/files/ncov/open/global/metadata.tsv.xz | xz -d > {output.metadata}
"""
rule fetch_config:
output:
auspice_config = "data/auspice_config.json",
lat_longs = "data/lat_longs.tsv"
shell:
"""
curl https://raw.githubusercontent.com/nextstrain/ncov/master/defaults/auspice_config.json -o {output.auspice_config}
curl https://raw.githubusercontent.com/nextstrain/ncov/master/defaults/lat_longs.tsv -o {output.lat_longs}
"""
rule nextclade:
input:
sequences = "data/sequences.fasta.xz",
dataset = directory("sars-cov-2_dataset")
output:
"results/nextclade.aligned.fasta",
"results/nextclade.tsv",
"results/nextclade.nwk"
params:
out_dir = "results"
threads: 4
shell:
"""
{nextcladev3} run -D {input.dataset} -j {threads} --output-all {params.out_dir} {input.sequences}
"""
rule prune:
input:
tree = "results/nextclade.nwk",
nextclade_tsv = "results/nextclade.tsv"
output:
tree = "results/tree_raw.nwk"
run:
import pandas as pd
from Bio import Phylo
tree = Phylo.read(input.tree, "newick")
metadata = pd.read_csv(input.nextclade_tsv, sep="\t", index_col='seqName')
for leaf in tree.get_terminals():
if leaf.name not in metadata.index or metadata.loc[leaf.name, 'qc.overallStatus'] == 'bad':
tree.prune(leaf)
for clade in tree.find_clades():
clade.branch_length /= 29903
Phylo.write(tree, output.tree, "newick", format_branch_length="%1.8f")
rule refine:
input:
tree = "results/tree_raw.nwk",
metadata = "data/metadata.tsv",
alignment = "results/nextclade.aligned.fasta"
output:
tree = "results/tree.nwk",
node_data = "results/branch_length.json"
shell:
"""
augur refine --tree {input.tree} --metadata {input.metadata} --alignment {input.alignment} \
--timetree --coalescent opt --date-confidence \
--clock-rate 0.0006 --clock-std-dev 0.0002 \
--stochastic-resolve --keep-root \
--output-node-data {output.node_data} --output-tree {output.tree}
"""
rule ancestral:
input:
tree = "results/tree.nwk",
alignment = "results/nextclade.aligned.fasta",
annotation = "data/annotation.json"
output:
node_data = "results/mutations.json"
params:
genes = genes,
translations = "results/nextclade_gene_%GENE.translation.fasta"
shell:
"""
augur ancestral --tree {input.tree} --alignment {input.alignment} \
--root-sequence sars-cov-2_dataset/reference.fasta --annotation {input.annotation} \
--output-node-data {output.node_data} \
--genes {params.genes} --translations {params.translations}
"""
rule export:
input:
tree = "results/tree.nwk",
node_data = ["results/branch_length.json", "results/mutations.json"],
metadata = "data/metadata.tsv",
auspice_config = "data/auspice_config.json",
lat_longs = "data/lat_longs.tsv"
output:
"auspice/tree.json"
shell:
"""
augur export v2 --tree {input.tree} --node-data {input.node_data} --metadata {input.metadata} \
--lat-longs data/lat_longs.tsv --color-by-metadata Nextstrain_clade pango_lineage \
--auspice-config data/auspice_config.json --output {output}
"""
rule clean:
shell:
"""
rm -rf results auspice
"""
rule clobber:
shell:
"""
rm -rf sars-cov-2_dataset
rm -rf results auspice
"""