-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.sh
207 lines (166 loc) · 7.17 KB
/
main.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
#!/bin/bash
#######################################
# Flatten a simple YAML file into shell variable assignments.
# Nesting depth is derived from the leading whitespace, two characters per
# level; the keys of the enclosing levels are joined with "_" and put in
# front of the key itself.
# Arguments:
#   $1 - path to the YAML file
#   $2 - optional prefix prepended to every generated variable name
# Outputs:
#   One  name="value"  line per non-empty scalar entry on stdout
#   (the caller feeds this to eval).
#######################################
parse_yaml() {
  local yaml_file=$1
  local prefix=${2:-}
  local s='[[:space:]]*' w='[a-zA-Z0-9_]*'
  # Declared separately so the assignment does not hide a failing pipeline.
  local fs
  # Octal 034 (ASCII "file separator") serves as the field delimiter
  # between indentation, key and value in the sed -> awk hand-over.
  fs=$(echo @ | tr @ '\034')
  # sed: 1st expression drops a colon that directly follows the indentation,
  #      2nd emits indent/key/value for quoted values (quotes stripped),
  #      3rd emits indent/key/value for unquoted values.
  # The file name is quoted so paths containing spaces are read correctly.
  sed -ne "s|^\($s\):|\1|" \
    -e "s|^\($s\)\($w\)$s:$s[\"']\(.*\)[\"']$s\$|\1$fs\2$fs\3|p" \
    -e "s|^\($s\)\($w\)$s:$s\(.*\)$s\$|\1$fs\2$fs\3|p" -- "$yaml_file" |
    awk -F"$fs" -v prefix="$prefix" '{
      indent = length($1)/2;
      vname[indent] = $2;
      # Forget keys remembered for deeper levels than the current one.
      for (i in vname) {if (i > indent) {delete vname[i]}}
      if (length($3) > 0) {
        vn=""; for (i=0; i<indent; i++) {vn=(vn)(vname[i])("_")}
        printf("%s%s%s=\"%s\"\n", prefix, vn, $2, $3);
      }
    }'
}
# Load the settings from Config.yaml into shell variables and set up paths.
# Config.yaml and ./src are resolved against the current working directory,
# so the script has to be started from the directory that contains them.
# The command substitution is quoted so that eval receives the generated
# assignments verbatim, one per line; unquoted, word splitting would
# collapse whitespace inside values before eval sees them.
eval "$(parse_yaml Config.yaml)"
export PATH="${PWD}/src:${PATH}"
export PYTHONPATH="${PWD}"
# Absolute location of this script; later steps cd back to SCRIPTPATH.
SCRIPT=$(realpath "$0")
SCRIPTPATH=$(dirname "$SCRIPT")
################################################
# PART 1. Install the 7 conda environments
################################################
### Note: if you use a GPU for the NN model, you need to install PyTorch in the multi_torch_env environment.
### To install a PyTorch build compatible with your CUDA version, please follow the instructions at https://pytorch.org/get-started/locally/.
### Our code was tested with PyTorch v1.7.1 and CUDA versions 10.1 and 11.0.
#
bash ./install/install.sh
# Reminder for the user to verify the environments manually.
printf '%s\n' "Please check if Env created."
#-------------------------------------------
# 2. PATRIC data
#-------------------------------------------
### The tutorials on creating datasets from scratch can be found here: https://github.com/hzi-bifo/AMR_benchmarking/edit/main/data/README.md
### You don't need to create the benchmarking datasets; we have incorporated them in this GitHub repo.
## 2.1 PATRIC data download
# dataset_location is expected to come from Config.yaml. It is quoted so that
# a path containing spaces or glob characters reaches the script as a single
# argument.
bash ./scripts/data_preprocess/retrieve_PATRIC_data.sh "${dataset_location}"
##############################
# 3. Software 1. ResFinder
##############################
# Optional: the BLASTN-based version can be installed from https://bitbucket.org/genomicepidemiology/resfinder/src/master/
# The native KMA-based version can only process read data (FASTQ files).
# Here we provide a modified version of the KMA-based ResFinder that can use genomic data. This is done because the NN multi-species model (Aytan-Aktug et al.)
# can only be generated by the KMA-based ResFinder.
# Reference database version 2021-05-06. You can also update the reference database (which will hopefully give better performance).
## Install KMA and ResFinder.
# If issues arise in this step, you can alternatively install it manually;
# please refer to https://bitbucket.org/genomicepidemiology/resfinder/src/master/
#
# Every cd below is checked: carrying on after a failed cd would run
# unzip / make / INSTALL.py in whatever directory the script happens to be in.
cd ./AMR_software/resfinder/cge || exit 1
unzip kma.zip
## git clone https://bitbucket.org/genomicepidemiology/kma.git
cd kma || exit 1
make
cd "${SCRIPTPATH}" || exit 1
#### Reference database version 2021-05-06. You can also download the latest version from the ResFinder website.
cd ./AMR_software/resfinder || exit 1
unzip db_pointfinder.zip
unzip db_resfinder.zip
cd "${SCRIPTPATH}" || exit 1
### Index the Point-/ResFinder databases with the KMA binary built above.
cd ./AMR_software/resfinder/db_resfinder || exit 1
python3 INSTALL.py "${SCRIPTPATH}/AMR_software/resfinder/cge/kma/kma" non_interactive
cd "${SCRIPTPATH}" || exit 1
cd ./AMR_software/resfinder/db_pointfinder || exit 1
python3 INSTALL.py "${SCRIPTPATH}/AMR_software/resfinder/cge/kma/kma" non_interactive
cd "${SCRIPTPATH}" || exit 1
bash ./scripts/model/resfinder.sh
##############################
### 4. Software 2. Aytan-Aktug (adapted version).
### Note: the SSSA model supports both parallel CPU runs and sequential GPU runs (via gpu_on in Config.yaml).
### The other 4 multi- models are designed for GPU runs only due to the heavy computing load (although they can still run on CPU machines without needing to do anything).
### You can further split each of them into smaller jobs by assigning the i and j variables a specific value within the range specified in each corresponding script.
### An example of splitting into smaller tasks is explained in the 4.2 ./scripts/model/AytanAktug_SSMA.sh script.
### Note: 4.4 and 4.5 are based on some intermediate feature files from 4.3, so please run the feature generation part of the 3 multi-species models sequentially.
## Reference: Early stopping for PyTorch https://github.com/Bjarten/early-stopping-pytorch
##############################
# The model scripts, in the order in which they are run.
aytan_aktug_scripts=(
  AytanAktug_SSSA          # 4.1 single-species single-antibiotics
  AytanAktug_SSMA          # 4.2 single-species multi-antibiotics
  AytanAktug_MSMA_discrete # 4.3 discrete databases multi-species model
  AytanAktug_MSMA_concat   # 4.4 concatenated databases mixed(-species) multi-species model
                           # 4.5 concatenated databases leave-one(-species)-out multi-species model
)
for aytan_aktug_script in "${aytan_aktug_scripts[@]}"; do
  bash "./scripts/model/${aytan_aktug_script}.sh"
done
###############################
## 5. Software 3. Seq2Geno2Pheno
###############################
## Set up the snakemake pipeline.
# Every cd is checked so that the installer scripts and the recursive copy
# below can never run from an unintended directory. Variable expansions are
# quoted so that paths or environment names with spaces stay single words.
cd "${SCRIPTPATH}" || exit 1
cd ./AMR_software/seq2geno/install/ || exit 1
./SETENV.sh "${se2ge_env_name}"
echo "Seq2Geno main env set up. Now proceed to set up denovo envs.."
# Put <conda root>/bin (two levels above the conda executable) first in PATH.
export PATH="$(dirname "$(dirname "$(/usr/bin/which conda)")")/bin:${PATH}"
export PYTHONPATH="${PWD}"
source activate "${se2ge_env_name}"
wait
cd "${SCRIPTPATH}" || exit 1
cd ./AMR_software/seq2geno/install/ || exit 1
./TESTING.sh # dry run and set Roary dependencies
conda deactivate
## Then update seq2geno to the adapted version that can deal with genomic data.
cd "${SCRIPTPATH}" || exit 1
cd ./AMR_software/ || exit 1
cp -r seq2geno_assemble/* seq2geno/
wait
source activate "${phylo_name}" # install R packages
# NOTE(review): this relative path is resolved from AMR_software/, not from
# the repository root — confirm that is where phylo_env.r lives.
Rscript --vanilla ./install/phylo_env.r
conda deactivate
cd "${SCRIPTPATH}" || exit 1
bash ./scripts/model/seq2geno.sh # Run.
###############################
## 6. Software 4. PhenotypeSeeker
###############################
# Runs the PhenotypeSeeker model script; the relative path is resolved
# against the current working directory.
bash ./scripts/model/phenotypeseeker.sh
#
###############################
## 7. Software 5. Kover
###############################
### Please install Kover 2.0 according to https://aldro61.github.io/kover/doc_installation.html or https://github.com/aldro61/kover
### We used the command line version in Linux.
### Install
source activate "${kover_env_name}"
# Stop if the Kover directory is missing; otherwise ./install.sh would be
# looked up in the wrong directory.
cd ./AMR_software/Kover/ || exit 1
bash ./install.sh
conda deactivate
cd "${SCRIPTPATH}" || exit 1
bash ./scripts/model/kover.sh
###############################
## 7. Software 6. ML baseline (majority)
###############################
# Runs the majority baseline script; the relative path is resolved against
# the current working directory.
bash ./scripts/model/majority.sh
#
#
########################################
### 8. Kover, PhenotypeSeeker cross-species models.
########################################
# PhenotypeSeeker is run first, then Kover; both scripts share the _MS suffix.
for cross_species_tool in phenotypeseeker kover; do
  bash "./scripts/model/${cross_species_tool}_MS.sh"
done
#-------------------------------------------
# 9. Main Analysis and Visualization
#-------------------------------------------
# The multi-model analysis runs before the comparison step.
for analysis_step in multiModel_analysis compare; do
  bash "./scripts/analysis_visualization/${analysis_step}.sh"
done
#-------------------------------------------
# 10. Other Visualization (supplements)
#-------------------------------------------
bash ./scripts/analysis_visualization/compare_supplement.sh
# Tell the user where the results are. The quoted delimiter keeps the text
# literal, and the empty line before EOF reproduces the blank line that ends
# the message.
cat <<'EOF'
Please find the results(tables, figures, and statistic numbers mentioned at the AMR benchmarking article) at the location set by Config.yaml
.
└── Results
├── final_figures_tables
├── other_figures_tables
├── supplement_figures_tables
└── software
│── AytanAktug
├── kover
├── majority
├── phenotypeseeker
├── resfinder_b
├── resfinder_folds
├── resfinder_k
└── seq2geno

EOF