Commit: bug fix
gudiandian committed Nov 1, 2022
1 parent 7a2229c commit 9194b14
Showing 12 changed files with 4,048 additions and 15,095 deletions.
2 changes: 1 addition & 1 deletion ElasticFlow/README.md
@@ -164,7 +164,7 @@ python convert_ef_trace_to_chronus.py -t ../../traces_for_ElasticFlow/195job_end
python get_name_list.py -t ../../traces_for_chronus/195job_endtoend_trace.csv -o ../../traces_for_chronus/195job_endtoend_trace.lst
# run scheduler
cd ..
python main.py --schedule=time-aware-with-lease --trace=../traces_for_chronus/195job_endtoend_trace.csv --save_log_dir=../../plot_figure/logs/figureba/chronus --ident=chronus --aggressive=True --mip_objective=adaptive --placement=local_search --profile=True --check_time_interval=240 --disable_turn_off=True --num_node_p_switch=16 --lease_term_interval=240 --name_list=../traces_for_chronus/195job_endtoend_trace.lst --num_gpu_p_node=8 --gpu_type=A100
python main.py --schedule=time-aware-with-lease --trace=../traces_for_chronus/195job_endtoend_trace.csv --save_log_dir=../../plot_figure/logs/figure6b/chronus --ident=chronus --aggressive=True --mip_objective=adaptive --placement=local_search --profile=True --check_time_interval=240 --disable_turn_off=True --num_node_p_switch=16 --lease_term_interval=240 --name_list=../traces_for_chronus/195job_endtoend_trace.lst --num_gpu_p_node=8 --gpu_type=A100
```

It takes a few hours for each setting.
4 changes: 2 additions & 2 deletions ElasticFlow/chronus-scheduler/main.py
@@ -116,15 +116,15 @@ def summary_all_jobs():
fd = open(opt.save_log_dir + '/final_result.csv', 'a+')
log_writer = csv.writer(fd)
log_writer.writerow([time.strftime("%Y%m%d-%H-%M-%S", time.localtime()),
str(float(accepted_jobs)/(accepted_jobs+declined_jobs)),
str(float(satisfied)/ddl_jobs),
"n"+str(opt.num_node_p_switch)+"g"+str(opt.num_gpu_p_node),
opt.trace, "chronus", "elastic"])
else:
fd = open(opt.save_log_dir + '/final_result.csv', 'w+')
log_writer = csv.writer(fd)
log_writer.writerow(['time', 'ddl_satis_ratio', 'cluster_spec', 'trace_file', 'scheduler', 'scheme'])
log_writer.writerow([time.strftime("%Y%m%d-%H-%M-%S", time.localtime()),
str(float(accepted_jobs)/(accepted_jobs+declined_jobs)),
str(float(satisfied)/ddl_jobs),
"n"+str(opt.num_node_p_switch)+"g"+str(opt.num_gpu_p_node),
opt.trace, "chronus", "elastic"])
fd.close()
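For readers following the hunk above: `summary_all_jobs` appends a summary row to `final_result.csv`, writing the header only when the file is first created. A minimal sketch of that pattern, assuming the branch is selected by a simple existence check (the `write_final_result` helper, the `os.path.exists` test, and the sample values are illustrative, not the repository's actual code):

```python
import csv
import os
import time

def write_final_result(save_log_dir, row):
    """Append one result row; emit the header only when the file is created."""
    os.makedirs(save_log_dir, exist_ok=True)
    path = os.path.join(save_log_dir, 'final_result.csv')
    first_write = not os.path.exists(path)
    with open(path, 'a+', newline='') as fd:
        log_writer = csv.writer(fd)
        if first_write:
            log_writer.writerow(['time', 'ddl_satis_ratio', 'cluster_spec',
                                 'trace_file', 'scheduler', 'scheme'])
        log_writer.writerow(row)

# Hypothetical usage with a row shaped like the one written above (values made up).
write_final_result('/tmp/chronus_logs', [
    time.strftime("%Y%m%d-%H-%M-%S", time.localtime()),
    str(0.85),                                          # ddl_satis_ratio
    "n16g8",                                            # cluster_spec
    "../traces_for_chronus/195job_endtoend_trace.csv",  # trace_file
    "chronus",
    "elastic",
])
```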
3 changes: 2 additions & 1 deletion ElasticFlow/scheduler/log.py
@@ -84,7 +84,7 @@ def init_log(self):
fd.close()
fd = open(self.log_gpu, 'w+')
log_writer = csv.writer(fd)
log_writer.writerow(['time'] + ['gpu'+str(i) for i in range(CLUSTER.num_node)] + ['ce'])
log_writer.writerow(['time'] + ['gpu'+str(i) for i in range(CLUSTER.num_node)])
fd.close()
fd = open(self.log_network, 'w+')
log_writer = csv.writer(fd)
@@ -120,6 +120,7 @@ def init_log(self):
for i in range(CLUSTER.num_node):
for j in range(CLUSTER.num_gpu_p_node):
row += ['gpu' + str(i) + '-' + str(j)]
row += ['ce']
log_writer.writerow(row)
fd.close()
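For clarity on this file's fix: the 'ce' column is removed from the per-node GPU log header (first hunk) and appended to the per-GPU header built here (second hunk). A minimal sketch of the resulting header, assuming a toy cluster of 2 nodes with 4 GPUs each (`_ToyCluster` is a stand-in for the real CLUSTER object, not part of the repository):

```python
# Toy stand-in for CLUSTER, just to show the header shape after the fix.
class _ToyCluster:
    num_node = 2
    num_gpu_p_node = 4

CLUSTER = _ToyCluster()

row = ['time']
for i in range(CLUSTER.num_node):
    for j in range(CLUSTER.num_gpu_p_node):
        row += ['gpu' + str(i) + '-' + str(j)]
row += ['ce']  # 'ce' now belongs to this per-GPU log, not the per-node one

print(row)
# ['time', 'gpu0-0', 'gpu0-1', 'gpu0-2', 'gpu0-3',
#  'gpu1-0', 'gpu1-1', 'gpu1-2', 'gpu1-3', 'ce']
```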

@@ -5,7 +5,7 @@ R 0 bert 64 0 8 1 0 127.0.0.1 22225 1 333 1 0
F
T
W 20
K 1 0
K 0 0
W 5
G 1 0 0 1
R 0 bert 64 0 8 1 0 127.0.0.1 22225 3 333 1 0 1
@@ -19,7 +19,7 @@ R 0 bert 64 0 8 1 0 127.0.0.1 22225 1 333 1 0
F
T
W 20
K 1 0
K 0 0
W 5
G 1 0 0 1
R 0 bert 64 0 8 1 0 127.0.0.1 22225 3 333 1 0 1
@@ -5,7 +5,7 @@ R 0 deepspeech2 64 0 8 1 0 127.0.0.1 22225 1 333 1 0
F
T
W 20
K 1 0
K 0 0
W 5
G 1 0 0 1
R 0 deepspeech2 64 0 8 1 0 127.0.0.1 22225 3 333 1 0 1
@@ -19,7 +19,7 @@ R 0 deepspeech2 64 0 8 1 0 127.0.0.1 22225 1 333 1 0
F
T
W 20
K 1 0
K 0 0
W 5
G 1 0 0 1
R 0 deepspeech2 64 0 8 1 0 127.0.0.1 22225 3 333 1 0 1
@@ -5,7 +5,7 @@ R 0 gpt2 128 0 8 1 0 127.0.0.1 22225 1 333 1 0
F
T
W 20
K 1 0
K 0 0
W 5
G 1 0 0 1
R 0 gpt2 128 0 8 1 0 127.0.0.1 22225 3 333 1 0 1
@@ -19,7 +19,7 @@ R 0 gpt2 128 0 8 1 0 127.0.0.1 22225 1 333 1 0
F
T
W 20
K 1 0
K 0 0
W 5
G 1 0 0 1
R 0 gpt2 128 0 8 1 0 127.0.0.1 22225 3 333 1 0 1
@@ -5,7 +5,7 @@ R 0 inception3 128 0 8 1 0 127.0.0.1 22225 1 333 1 0
F
T
W 20
K 1 0
K 0 0
W 5
G 1 0 0 1
R 0 inception3 128 0 8 1 0 127.0.0.1 22225 3 333 1 0 1
@@ -19,7 +19,7 @@ R 0 inception3 128 0 8 1 0 127.0.0.1 22225 1 333 1 0
F
T
W 20
K 1 0
K 0 0
W 5
G 1 0 0 1
R 0 inception3 128 0 8 1 0 127.0.0.1 22225 3 333 1 0 1
@@ -5,7 +5,7 @@ R 0 resnet50 128 0 8 1 0 127.0.0.1 22225 1 333 1 0
F
T
W 20
K 1 0
K 0 0
W 5
G 1 0 0 1
R 0 resnet50 128 0 8 1 0 127.0.0.1 22225 3 333 1 0 1
@@ -19,7 +19,7 @@ R 0 resnet50 128 0 8 1 0 127.0.0.1 22225 1 333 1 0
F
T
W 20
K 1 0
K 0 0
W 5
G 1 0 0 1
R 0 resnet50 128 0 8 1 0 127.0.0.1 22225 3 333 1 0 1
@@ -5,7 +5,7 @@ R 0 vgg16 128 0 8 1 0 127.0.0.1 22225 1 333 1 0
F
T
W 20
K 1 0
K 0 0
W 5
G 1 0 0 1
R 0 vgg16 128 0 8 1 0 127.0.0.1 22225 3 333 1 0 1
@@ -19,7 +19,7 @@ R 0 vgg16 128 0 8 1 0 127.0.0.1 22225 1 333 1 0
F
T
W 20
K 1 0
K 0 0
W 5
G 1 0 0 1
R 0 vgg16 128 0 8 1 0 127.0.0.1 22225 3 333 1 0 1
4 changes: 2 additions & 2 deletions ElasticFlow/scheduler/run_fig8b.sh
@@ -27,7 +27,7 @@ for job in ${jobs[@]};do
fi
log_name="${log_folder}/${s}_${job}"
mkdir $log_name
python3 scheduler.py --cluster_spec=${cluster_spec} --print --scheme=${placement} --trace_file=${job_file} --schedule=${s} --log_path=${log_name} --simulation=True --scheduling_slot=240 --gpu_type=A100&
python3 scheduler.py --cluster_spec=${cluster_spec} --print --scheme=${placement} --trace_file=${job_file} --schedule=${s} --log_path=${log_name} --simulation=True --scheduling_slot=60 --gpu_type=A100&
done
done

@@ -50,7 +50,7 @@ for job in ${jobs[@]};do
chronus_job_file="../traces_for_chronus/${job}.csv"
chronus_namelist_file="../traces_for_chronus/${job}.lst"
save_log_dir="../../plot_figure/logs/figure8b/chronus_${job}"
python3 main.py --schedule=time-aware-with-lease --trace=${chronus_job_file} --save_log_dir=${save_log_dir} --ident=chronus --aggressive=True --mip_objective=adaptive --placement=local_search --profile=True --check_time_interval=240 --disable_turn_off=True --num_node_p_switch=${num_node} --lease_term_interval=240 --name_list=${chronus_namelist_file} --simulation=True --gpu_type=A100 --num_gpu_p_node=8&
python3 main.py --schedule=time-aware-with-lease --trace=${chronus_job_file} --save_log_dir=${save_log_dir} --ident=chronus --aggressive=True --mip_objective=adaptive --placement=local_search --profile=True --check_time_interval=60 --disable_turn_off=True --num_node_p_switch=${num_node} --lease_term_interval=240 --name_list=${chronus_namelist_file} --simulation=True --gpu_type=A100 --num_gpu_p_node=8&
cd utils
done
cd ../../scheduler