From 22d8d7b6ad1ba926822e5751b41e1ad6249db5b1 Mon Sep 17 00:00:00 2001
From: ouyangyu
Date: Mon, 28 Sep 2020 16:37:32 +0800
Subject: [PATCH 1/4] add scripts

---
 Classification/cnns/train_fp16.sh | 58 +++++++++++++++++++++++++++++++
 Classification/cnns/train_fp32.sh | 52 +++++++++++++++++++++++++++
 2 files changed, 110 insertions(+)
 create mode 100755 Classification/cnns/train_fp16.sh
 create mode 100755 Classification/cnns/train_fp32.sh

diff --git a/Classification/cnns/train_fp16.sh b/Classification/cnns/train_fp16.sh
new file mode 100755
--- /dev/null
+++ b/Classification/cnns/train_fp16.sh
@@ -0,0 +1,58 @@
+#!/usr/bin/env bash
+# Train resnet50 with --use_fp16. Needs bash. Usage: ./train_fp16.sh [NUM_EPOCH] [DATA_ROOT]
+
+rm -rf core.*
+rm -rf ./output/snapshots/*
+
+if [ -n "$1" ]; then
+    NUM_EPOCH=$1
+else
+    NUM_EPOCH=50
+fi
+echo "NUM_EPOCH=$NUM_EPOCH"
+
+# training with imagenet
+if [ -n "$2" ]; then
+    DATA_ROOT=$2
+else
+    DATA_ROOT=/data/imagenet/ofrecord
+fi
+echo "DATA_ROOT=$DATA_ROOT"
+
+LOG_FOLDER=../logs
+mkdir -p "$LOG_FOLDER"
+LOGFILE=$LOG_FOLDER/resnet_training.log
+
+export PYTHONUNBUFFERED=1
+echo "PYTHONUNBUFFERED=$PYTHONUNBUFFERED"
+export NCCL_LAUNCH_MODE=PARALLEL
+echo "NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE"
+
+python3 of_cnn_train_val.py \
+    --train_data_dir="$DATA_ROOT/train" \
+    --train_data_part_num=256 \
+    --val_data_dir="$DATA_ROOT/validation" \
+    --val_data_part_num=256 \
+    --num_nodes=1 \
+    --gpu_num_per_node=8 \
+    --optimizer="sgd" \
+    --momentum=0.875 \
+    --label_smoothing=0.1 \
+    --learning_rate=1.024 \
+    --loss_print_every_n_iter=100 \
+    --batch_size_per_device=128 \
+    --val_batch_size_per_device=50 \
+    --use_fp16 \
+    --channel_last=True \
+    --pad_output \
+    --fuse_bn_relu=True \
+    --fuse_bn_add_relu=True \
+    --nccl_fusion_threshold_mb=16 \
+    --nccl_fusion_max_ops=24 \
+    --num_epoch="$NUM_EPOCH" \
+    --model="resnet50" 2>&1 | tee "${LOGFILE}"
+# tee's status would mask a training failure; exit with python3's status.
+status=${PIPESTATUS[0]}
+
+echo "Writing log to ${LOGFILE}"
+exit "$status"
diff --git a/Classification/cnns/train_fp32.sh b/Classification/cnns/train_fp32.sh
new file mode 100755
--- /dev/null
+++ b/Classification/cnns/train_fp32.sh
@@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+# Train resnet50 without --use_fp16. Needs bash. Usage: ./train_fp32.sh [NUM_EPOCH] [DATA_ROOT]
+
+rm -rf core.*
+rm -rf ./output/snapshots/*
+
+if [ -n "$1" ]; then
+    NUM_EPOCH=$1
+else
+    NUM_EPOCH=50
+fi
+echo "NUM_EPOCH=$NUM_EPOCH"
+
+# training with imagenet
+if [ -n "$2" ]; then
+    DATA_ROOT=$2
+else
+    DATA_ROOT=/data/imagenet/ofrecord
+fi
+echo "DATA_ROOT=$DATA_ROOT"
+
+LOG_FOLDER=../logs
+mkdir -p "$LOG_FOLDER"
+LOGFILE=$LOG_FOLDER/resnet_training.log
+
+python3 of_cnn_train_val.py \
+    --train_data_dir="$DATA_ROOT/train" \
+    --train_data_part_num=256 \
+    --val_data_dir="$DATA_ROOT/validation" \
+    --val_data_part_num=256 \
+    --num_nodes=1 \
+    --gpu_num_per_node=8 \
+    --optimizer="sgd" \
+    --momentum=0.875 \
+    --label_smoothing=0.1 \
+    --learning_rate=1.024 \
+    --loss_print_every_n_iter=100 \
+    --batch_size_per_device=128 \
+    --val_batch_size_per_device=50 \
+    --channel_last=True \
+    --pad_output \
+    --fuse_bn_relu=True \
+    --fuse_bn_add_relu=True \
+    --nccl_fusion_threshold_mb=16 \
+    --nccl_fusion_max_ops=24 \
+    --num_epoch="$NUM_EPOCH" \
+    --model="resnet50" 2>&1 | tee "${LOGFILE}"
+# tee's status would mask a training failure; exit with python3's status.
+status=${PIPESTATUS[0]}
+
+echo "Writing log to ${LOGFILE}"
+exit "$status"

From 7142e2f2d57f7a6defd9ed69d68d04e7b209162e Mon Sep 17 00:00:00 2001
From: ouyangyu
Date: Mon, 28 Sep 2020 17:51:07 +0800
Subject: [PATCH 2/4] add env

---
 Classification/cnns/train_fp32.sh | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/Classification/cnns/train_fp32.sh b/Classification/cnns/train_fp32.sh
--- a/Classification/cnns/train_fp32.sh
+++ b/Classification/cnns/train_fp32.sh
@@ -23,6 +23,11 @@ LOG_FOLDER=../logs
 mkdir -p "$LOG_FOLDER"
 LOGFILE=$LOG_FOLDER/resnet_training.log
 
+export PYTHONUNBUFFERED=1
+echo "PYTHONUNBUFFERED=$PYTHONUNBUFFERED"
+export NCCL_LAUNCH_MODE=PARALLEL
+echo "NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE"
+
 python3 of_cnn_train_val.py \
     --train_data_dir="$DATA_ROOT/train" \
     --train_data_part_num=256 \

From d45ad7f7799fc57b4a2a16d8e5dc4acd5dcbd9e6 Mon Sep 17 00:00:00 2001
From: ouyangyu
Date: Tue, 29 Sep 2020 11:39:22 +0800
Subject: [PATCH 3/4] fp32 channel_last false

---
 Classification/cnns/train_fp32.sh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/Classification/cnns/train_fp32.sh b/Classification/cnns/train_fp32.sh
--- a/Classification/cnns/train_fp32.sh
+++ b/Classification/cnns/train_fp32.sh
@@ -42,8 +42,7 @@ python3 of_cnn_train_val.py \
     --loss_print_every_n_iter=100 \
     --batch_size_per_device=128 \
     --val_batch_size_per_device=50 \
-    --channel_last=True \
-    --pad_output \
+    --channel_last=False \
     --fuse_bn_relu=True \
     --fuse_bn_add_relu=True \
     --nccl_fusion_threshold_mb=16 \

From 5e0054ee7a10e75d2ad3100311084158c22fc6cf Mon Sep 17 00:00:00 2001
From: ouyangyu
Date: Tue, 29 Sep 2020 15:18:33 +0800
Subject: [PATCH 4/4] fp32 batch size

---
 Classification/cnns/train_fp32.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Classification/cnns/train_fp32.sh b/Classification/cnns/train_fp32.sh
--- a/Classification/cnns/train_fp32.sh
+++ b/Classification/cnns/train_fp32.sh
@@ -38,9 +38,9 @@ python3 of_cnn_train_val.py \
     --optimizer="sgd" \
     --momentum=0.875 \
     --label_smoothing=0.1 \
-    --learning_rate=1.024 \
+    --learning_rate=0.512 \
     --loss_print_every_n_iter=100 \
-    --batch_size_per_device=128 \
+    --batch_size_per_device=64 \
     --val_batch_size_per_device=50 \
     --channel_last=False \
     --fuse_bn_relu=True \