-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathtrainworker.sh
153 lines (130 loc) · 4.96 KB
/
trainworker.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#!/bin/bash
#
# Script identity and default settings. The solver/training defaults below
# can be overridden from the command line (see usage).

# Where this script lives and what it was invoked as.
script_dir=$(dirname "$0")
script_name=$(basename "$0")

# Version string shown in the help text; read from the VERSION file that
# sits next to this script, or "???" when that file does not exist.
if [[ -f "$script_dir/VERSION" ]] ; then
  version=$(< "$script_dir/VERSION")
else
  version="???"
fi

# Which models to train and on which GPU(s).
model_list="1fm,3fm,5fm"
gpu="all"
# NOTE(review): one_fmonly is not referenced anywhere else in the visible
# script; kept so behavior is unchanged.
one_fmonly=false

# Training length and snapshot cadence.
numiterations="30000"
snapshot_interval="2000"

# Solver parameters.
lr_policy="poly"
base_lr="1e-02"
power="0.8"
momentum="0.9"
weight_decay="0.0005"
average_loss="16"
iter_size="8"
#######################################
# Print the help text to stderr and terminate the script.
# Globals (read): script_name, version, gpu, base_lr, power, momentum,
#                 weight_decay, average_loss, lr_policy, iter_size,
#                 snapshot_interval, numiterations
# Arguments: none
# Outputs:   help text on stderr; nothing on stdout
# Returns:   does not return; exits with status 1
#######################################
usage() {
  # The synopsis names --base_learn (not --base_lr) because that is the
  # option the getopt call in this script actually accepts.
  # The blank line before EOF keeps the trailing empty line of the help text.
  cat 1>&2 <<EOF
usage: $script_name [-h] [--models MODELS]
[--numiterations NUMITERATIONS]
[--gpu GPU] [--base_learn BASE_LR] [--power POWER]
[--momentum MOMENTUM]
[--weight_decay WEIGHT_DECAY]
[--average_loss AVERAGE_LOSS]
[--lr_policy POLICY] [--iter_size ITER_SIZE]
[--snapshot_interval SNAPSHOT_INTERVAL]
trainoutdir
Version: $version
Runs caffe training on CDeep3M model in <trainoutdir>
directory.
For further information about parameters below please see:
https://github.com/BVLC/caffe/wiki/Solver-Prototxt
optional arguments:
-h, --help show this help message and exit
--models Only train on models specified in comma
delimited list. (default 1fm,3fm,5fm)
--gpu Which GPU to use, can be a number ie 0 or 1 or
all to use all GPUs (default $gpu)
--base_learn Base learning rate (default $base_lr)
--power Used in poly and sigmoid lr_policies. (default $power)
--momentum Indicates how much of the previous weight will be
retained in the new calculation. (default $momentum)
--weight_decay Factor of (regularization) penalization of large
weights (default $weight_decay)
--average_loss Number of iterations to use to average loss
(default $average_loss)
--lr_policy Learning rate policy (default $lr_policy)
--iter_size Accumulate gradients across batches through the
iter_size solver field. (default $iter_size)
--snapshot_interval How often caffe should output a model and solverstate.
(default $snapshot_interval)
--numiterations Number of training iterations to run (default $numiterations)

EOF
  exit 1
}
# Parse the command line with getopt(1).
#
# getopt returns non-zero when it meets an unknown option or an option that
# is missing its argument. Treat that as a usage error rather than carrying
# on with a partially parsed command line. "help" is in the long option list
# so the documented --help flag works as well as -h.
# The explicit "exit 1" after each usage call is a safety net: usage itself
# exits with status 1.
if ! TEMP=$(getopt -o h --long "help,models:,gpu:,numiterations:,base_learn:,power:,momentum:,weight_decay:,average_loss:,lr_policy:,iter_size:,snapshot_interval:" -n "$0" -- "$@") ; then
  usage
  exit 1
fi

# getopt emits a shell-quoted, normalized argument list; eval re-splits it
# into the positional parameters.
eval set -- "$TEMP"

while true ; do
  case "$1" in
    -h | --help ) usage ;;
    --models ) model_list=$2 ; shift 2 ;;
    --numiterations ) numiterations=$2 ; shift 2 ;;
    --gpu ) gpu=$2 ; shift 2 ;;
    --base_learn ) base_lr=$2 ; shift 2 ;;
    --power ) power=$2 ; shift 2 ;;
    --momentum ) momentum=$2 ; shift 2 ;;
    --weight_decay ) weight_decay=$2 ; shift 2 ;;
    --average_loss ) average_loss=$2 ; shift 2 ;;
    --lr_policy ) lr_policy=$2 ; shift 2 ;;
    --iter_size ) iter_size=$2 ; shift 2 ;;
    --snapshot_interval ) snapshot_interval=$2 ; shift 2 ;;
    -- ) shift ; break ;;
    # Anything else would never be shifted away and the loop would spin
    # forever, so bail out explicitly.
    * ) echo "ERROR unexpected argument: $1" 1>&2 ; usage ; exit 1 ;;
  esac
done

# Exactly one positional argument is required: the training output directory.
if [ $# -ne 1 ] ; then
  usage
  exit 1
fi
trainoutdir=$1
echo ""

# Work out how many GPUs are present. "nvidia-smi -L" prints one line of the
# form "GPU <index>: ..." per device, so count only those lines; a plain line
# count would also count error text or indented sub-device lines.
gpucount=$(nvidia-smi -L | grep -c '^GPU ')
if [ "${gpucount:-0}" -eq 0 ] ; then
  echo "ERROR unable to get count of GPU(s). Is nvidia-smi working?"
  exit 4
fi
maxgpuindex=$(( gpucount - 1 ))

# --gpu must be "all" or a decimal GPU index in the range 0..maxgpuindex.
# Validate it before it reaches arithmetic evaluation, where a non-numeric
# value would silently evaluate to 0 (or be evaluated as an expression).
if [ "$gpu" != "all" ] ; then
  if ! [[ "$gpu" =~ ^[0-9]+$ ]] ; then
    echo "ERROR --gpu must be 'all' or a non-negative integer, got: $gpu"
    exit 5
  fi
  # 10# forces base 10 so a value such as 08 is not rejected as bad octal.
  if (( 10#$gpu > maxgpuindex )) ; then
    echo "ERROR GPU $gpu requested but the highest GPU index is $maxgpuindex"
    exit 5
  fi
fi

if [ "$maxgpuindex" -gt 0 ] ; then
  echo -n "Detected $gpucount GPU(s)."
  if [ "$gpu" == "all" ] ; then
    echo " Will run in parallel."
  else
    echo " Using only GPU $gpu"
  fi
else
  echo "Single GPU detected."
fi

# cntr is the GPU index handed to the first job. When a single GPU was
# requested, gpucount is forced to 1; it is later used as the number of
# concurrent jobs, so jobs then run one at a time on that GPU.
if [ "$gpu" == "all" ] ; then
  cntr=0
else
  cntr=$(( 10#$gpu ))
  gpucount=1
fi
# Build the job file consumed by GNU parallel: 12 lines per model, in the
# order of the {1}..{12} placeholders used when the jobs are launched.
parallel_job_file="$trainoutdir/parallel.jobs"

# Split the comma delimited model list (whitespace also separates names)
# without invoking sed or pathname expansion.
IFS=', ' read -r -a requested_models <<< "$model_list"

# Validate every model directory before writing anything, so a bad model
# name cannot leave a half-written job file behind.
model_names=()
for model_name in "${requested_models[@]}" ; do
  # Skip empty entries produced by stray commas such as "1fm,,3fm".
  [ -n "$model_name" ] || continue
  if [ ! -d "$trainoutdir/$model_name" ] ; then
    echo "ERROR, no $trainoutdir/$model_name directory found."
    exit 2
  fi
  model_names+=("$model_name")
done

# Start from an empty file: entries left over from a previous run would
# otherwise be executed again alongside the new ones.
if ! : > "$parallel_job_file" ; then
  echo "ERROR, unable to write $parallel_job_file"
  exit 3
fi

for model_name in "${model_names[@]}" ; do
  # printf '%s\n' writes each value verbatim on its own line; unlike
  # echo -e it does not interpret backslashes inside the values.
  printf '%s\n' "$numiterations" "$cntr" "$base_lr" "$power" "$momentum" \
    "$weight_decay" "$average_loss" "$lr_policy" "$iter_size" \
    "$snapshot_interval" "$model_name" "$trainoutdir" >> "$parallel_job_file"
  # With --gpu all, hand out GPU indices round-robin: 0..maxgpuindex, then
  # wrap back to 0.
  if [ "$gpu" == "all" ] ; then
    cntr=$(( cntr + 1 ))
    if [ "$cntr" -gt "$maxgpuindex" ] ; then
      cntr=0
    fi
  fi
done
# Feed the job file to GNU parallel. Each job consumes 12 input lines
# (-N 12), which are substituted for {1}..{12} in the caffetrain.sh command
# line; at most $gpucount jobs run at the same time.
#
# --delay 2 waits 2 seconds between job starts; without it jobs would fail
# on the GPU with an out of memory error.
cat "$parallel_job_file" \
  | parallel --no-notice --delay 2 -N 12 -j $gpucount \
      "${script_dir}/caffetrain.sh" \
      --numiterations {1} \
      --gpu {2} \
      --base_learn {3} \
      --power {4} \
      --momentum {5} \
      --weight_decay {6} \
      --average_loss {7} \
      --lr_policy {8} \
      --iter_size {9} \
      --snapshot_interval {10} \
      {11} {12} || {
  echo "Non zero exit code from caffe for train of model. Exiting."
  exit 1
}

# Blank line, completion message, blank line.
printf '\n%s\n\n' "Training has completed. Have a nice day!"