From 990409c8eaeebf25b3962cc0ed0d8a6aaf4eb906 Mon Sep 17 00:00:00 2001 From: wjm202 <897383984@qq.com> Date: Tue, 12 Sep 2023 17:46:43 +0800 Subject: [PATCH 01/16] add blip2 torch benchmark --- .gitmodules | 3 + .../pytorch/dynamic/PaddleMIX/models/LAVIS | 1 + .../blip2-opt-2.7b-stage2_bs128_fp16_DP.sh | 27 +++++ .../blip2-opt-2.7b-stage2_bs32_fp16_DP.sh | 27 +++++ .../blip2-opt-2.7b-stage2_bs64_fp16_DP.sh | 27 +++++ .../blip2-opt-2.7b-stage2_bs128_fp16_DP.sh | 27 +++++ .../blip2-opt-2.7b-stage2_bs32_fp16_DP.sh | 27 +++++ .../blip2-opt-2.7b-stage2_bs64_fp16_DP.sh | 27 +++++ .../PaddleMIX/scripts/blip2/analysis_log.py | 74 +++++++++++++ .../PaddleMIX/scripts/blip2/prepare.sh | 10 ++ .../PaddleMIX/scripts/blip2/requirements.txt | 29 +++++ .../PaddleMIX/scripts/blip2/run_benchmark.sh | 102 +++++++++++++++++ .../dynamic/PaddleMIX/scripts/blip2/train.py | 104 ++++++++++++++++++ 13 files changed, 485 insertions(+) create mode 160000 frame_benchmark/pytorch/dynamic/PaddleMIX/models/LAVIS create mode 100644 frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2.7b-stage2_bs128_fp16_DP.sh create mode 100644 frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2.7b-stage2_bs32_fp16_DP.sh create mode 100644 frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2.7b-stage2_bs64_fp16_DP.sh create mode 100644 frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2.7b-stage2_bs128_fp16_DP.sh create mode 100644 frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2.7b-stage2_bs32_fp16_DP.sh create mode 100644 frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2.7b-stage2_bs64_fp16_DP.sh create mode 100644 frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/analysis_log.py create mode 100644 frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/prepare.sh create mode 100644 frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/requirements.txt create mode 100644 frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/run_benchmark.sh create mode 100644 frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/train.py diff --git a/.gitmodules b/.gitmodules index d0433183ae..71ad114f26 100644 --- a/.gitmodules +++ b/.gitmodules @@ -148,3 +148,6 @@ [submodule "frame_benchmark/pytorch/dynamic/PaddleMIX/models/diffusers"] path = frame_benchmark/pytorch/dynamic/PaddleMIX/models/diffusers url = https://github.com/huggingface/diffusers +[submodule "frame_benchmark/pytorch/dynamic/PaddleMIX/models/LAVIS"] + path = frame_benchmark/pytorch/dynamic/PaddleMIX/models/LAVIS + url = https://github.com/wjm202/LAVIS.git diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/models/LAVIS b/frame_benchmark/pytorch/dynamic/PaddleMIX/models/LAVIS new file mode 160000 index 0000000000..d139554961 --- /dev/null +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/models/LAVIS @@ -0,0 +1 @@ +Subproject commit d1395549618ea855d3a5ad23e07b742353c8b644 diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2.7b-stage2_bs128_fp16_DP.sh b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2.7b-stage2_bs128_fp16_DP.sh new file mode 100644 index 0000000000..fe334f132b --- /dev/null +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2.7b-stage2_bs128_fp16_DP.sh @@ -0,0 +1,27 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +model_item=blip2-opt-2.7b-stage2 +bs_item=128 +fp_item=fp16 +run_process_type=SingleP +run_mode=DP +device_num=N1C1 +max_epochs=10 +num_workers=1 + +#get data +bash prepare.sh +#run +bash ../../LAVIS/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; \ No newline at end of file diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2.7b-stage2_bs32_fp16_DP.sh b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2.7b-stage2_bs32_fp16_DP.sh new file mode 100644 index 0000000000..f787fc8147 --- /dev/null +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2.7b-stage2_bs32_fp16_DP.sh @@ -0,0 +1,27 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +model_item=blip2-opt-2.7b-stage2 +bs_item=32 +fp_item=fp16 +run_process_type=SingleP +run_mode=DP +device_num=N1C1 +max_epochs=1 +num_workers=1 + +#get data +bash scripts/blip2/prepare.sh +#run +bash scripts/blip2/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; \ No newline at end of file diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2.7b-stage2_bs64_fp16_DP.sh b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2.7b-stage2_bs64_fp16_DP.sh new file mode 100644 index 0000000000..de22a1d819 --- /dev/null +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2.7b-stage2_bs64_fp16_DP.sh @@ -0,0 +1,27 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +model_item=blip2-opt-2.7b-stage2 +bs_item=64 +fp_item=fp16 +run_process_type=SingleP +run_mode=DP +device_num=N1C1 +max_epochs=10 +num_workers=1 + +#get data +bash prepare.sh +#run +bash ../../LAVIS/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; \ No newline at end of file diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2.7b-stage2_bs128_fp16_DP.sh b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2.7b-stage2_bs128_fp16_DP.sh new file mode 100644 index 0000000000..df9252b77d --- /dev/null +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2.7b-stage2_bs128_fp16_DP.sh @@ -0,0 +1,27 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +model_item=blip2-opt-2.7b-stage2 +bs_item=32 +fp_item=fp32 +run_process_type=MultiP +run_mode=DP +device_num=N1C1 +max_epochs=10 +num_workers=1 + +#get data +bash prepare.sh +#run +bash ../../LAVIS/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; \ No newline at end of file diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2.7b-stage2_bs32_fp16_DP.sh b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2.7b-stage2_bs32_fp16_DP.sh new file mode 100644 index 0000000000..c84c5e17d1 --- /dev/null +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2.7b-stage2_bs32_fp16_DP.sh @@ -0,0 +1,27 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +model_item=blip2-opt-2.7b-stage2 +bs_item=128 +fp_item=fp16 +run_process_type=MultiP +run_mode=DP +device_num=N1C1 +max_epochs=10 +num_workers=1 + +#get data +bash prepare.sh +#run +bash ../../LAVIS/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; \ No newline at end of file diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2.7b-stage2_bs64_fp16_DP.sh b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2.7b-stage2_bs64_fp16_DP.sh new file mode 100644 index 0000000000..2bcd703603 --- /dev/null +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2.7b-stage2_bs64_fp16_DP.sh @@ -0,0 +1,27 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +model_item=blip2-opt-2.7b-stage2 +bs_item=32 +fp_item=fp32 +run_process_type=MultiP +run_mode=DP +device_num=N1C1 +max_iter=10 +num_workers=1 + +#get data +bash prepare.sh +#run +bash ../../LAVIS/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; \ No newline at end of file diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/analysis_log.py b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/analysis_log.py new file mode 100644 index 0000000000..1b83755cec --- /dev/null +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/analysis_log.py @@ -0,0 +1,74 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + + +import json +import os +import re +import sys + +import numpy as np + + +def analyze(model_item, log_file, res_log_file, device_num, bs, fp_item, run_process_type): + with open(str(log_file), "r", encoding="utf8") as f: + data = f.readlines() + ips_lines = [] + for eachline in data: + if "ips:" in eachline: + ips = float(eachline.split("ips: ")[1].split()[0]) + ips_lines.append(ips) + ips = np.mean(ips_lines[4:]) + ngpus = int(re.findall("\d+", device_num)[-1]) + ips *= ngpus + run_mode = "DP" + + model_name = model_item + "_" + "bs" + str(bs) + "_" + fp_item + "_" + run_mode + info = { + "model_branch": os.getenv("model_branch"), + "model_commit": os.getenv("model_commit"), + "model_name": model_name, + "batch_size": bs, + "fp_item": fp_item, + "run_mode": run_mode, + "convergence_value": 0, + "convergence_key": "", + "ips": ips, + "speed_unit": "sample/sec", + "device_num": device_num, + "model_run_time": os.getenv("model_run_time"), + "frame_commit": "", + "frame_version": os.getenv("frame_version"), + } + json_info = json.dumps(info) + print(json_info) + with open(res_log_file, "w") as of: + of.write(json_info) + + +if __name__ == "__main__": + if len(sys.argv) != 8: + print("Usage:" + sys.argv[0] + " model_item path/to/log/file path/to/res/log/file") + sys.exit() + + model_item = sys.argv[1] + log_file = sys.argv[2] + res_log_file = sys.argv[3] + device_num = sys.argv[4] + bs = int(sys.argv[5]) + fp_item = sys.argv[6] + run_process_type = sys.argv[7] + + analyze(model_item, log_file, res_log_file, device_num, bs, fp_item, run_process_type) \ No newline at end of file diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/prepare.sh b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/prepare.sh new file mode 100644 index 0000000000..7ef87e0503 --- /dev/null +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/prepare.sh @@ -0,0 +1,10 @@ + +rm -rf /export/home/.cache/lavis/coco/ +# dataset +wget https://paddlenlp.bj.bcebos.com/models/community/paddlemix/benchmark/blip2/coco.tar.gz +tar -zxvf coco.tar.gz +mv coco /export/home/.cache/lavis/ +rm -rf coco +rm -rf coco.tar.gz +# env +# pip install -r scripts/blip2/requirements.txt diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/requirements.txt b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/requirements.txt new file mode 100644 index 0000000000..9db1dc9308 --- /dev/null +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/requirements.txt @@ -0,0 +1,29 @@ +contexttimer +decord +diffusers<=0.16.0 +einops>=0.4.1 +fairscale==0.4.4 +ftfy +iopath +ipython +omegaconf +opencv-python-headless==4.5.5.64 +opendatasets +packaging +pandas +plotly +pre-commit +pycocoevalcap +pycocotools +python-magic +scikit-image +sentencepiece +spacy +streamlit +timm==0.4.12 +torch>=1.10.0 +torchvision +tqdm +transformers>=4.28.0 +webdataset +wheel diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/run_benchmark.sh b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/run_benchmark.sh new file mode 100644 index 0000000000..54bf19be94 --- /dev/null +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/run_benchmark.sh @@ -0,0 +1,102 @@ +#!/usr/bin/env bash +# Test training benchmark for a model. 
+# Usage: CUDA_VISIBLE_DEVICES=xxx bash run_benchmark.sh ${model_name} ${run_mode} ${fp_item} ${bs_item} ${max_iter} ${num_workers} + +function _set_params(){ + model_item=${1:-"blip2-opt2.7b-stage2"} # (必选) 模型 item |fastscnn|segformer_b0| ocrnet_hrnetw48 + base_batch_size=${2:-"2"} # (必选) 每张卡上的batch_size + fp_item=${3:-"fp32"} # (必选) fp32|fp16|bf16 + run_process_type=${4:-"SingleP"} # (必选) 单进程 SingleP|多进程 MultiP + run_mode=${5:-"DP"} # (必选) MP模型并行|DP数据并行|PP流水线并行|混合并行DP1-MP1-PP1|DP1-MP4-PP1 + device_num=${6:-"N1C1"} # (必选) 使用的卡数量,N1C1|N1C8|N4C8 (4机32卡) + profiling=${PROFILING:-"false"} # (必选) Profiling 开关,默认关闭,通过全局变量传递 + model_repo="blip2" # (必选) 模型套件的名字 + speed_unit="sample/sec" # (必选)速度指标单位 + skip_steps=4 # (必选)解析日志,跳过模型前几个性能不稳定的step + keyword="ips:" # (必选)解析日志,筛选出性能数据所在行的关键字 + convergence_key="" # (可选)解析日志,筛选出收敛数据所在行的关键字 如:convergence_key="loss:" + max_epochs=${7:-"1"} # (可选)需保证模型执行时间在5分钟内,需要修改代码提前中断的直接提PR 合入套件 或是max_epoch + num_workers=${8:-"3"} # (可选) + + # Added for distributed training + node_num=${9:-"2"} #(可选) 节点数量 + node_rank=${10:-"0"} # (可选) 节点rank + master_addr=${11:-"127.0.0.1"} # (可选) 主节点ip地址 + master_port=${12:-"1928"} # (可选) 主节点端口号 + # Added for distributed training + + # 以下为通用拼接log路径,无特殊可不用修改 + model_name=${model_item}_bs${base_batch_size}_${fp_item}_${run_mode} # (必填) 切格式不要改动,与平台页面展示对齐 + device=${CUDA_VISIBLE_DEVICES//,/ } + arr=(${device}) + num_gpu_devices=${#arr[*]} + run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # (必填) TRAIN_LOG_DIR benchmark框架设置该参数为全局变量 + profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)} # (必填) PROFILING_LOG_DIR benchmark框架设置该参数为全局变量 + speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)} + + train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log + profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling + speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed + if [ ${profiling} = "true" ];then + add_options="profiler_options=/"batch_range=[50, 60]; profile_path=model.profile/"" + log_file=${profiling_log_file} + else + add_options="" + log_file=${train_log_file} + fi + +} + +function _analysis_log(){ + python scripts/blip2/analysis_log.py ${model_item} ${log_file} ${speed_log_file} ${device_num} ${base_batch_size} ${fp_item} ${run_process_type} +} + +function _train(){ + batch_size=${base_batch_size} # 如果模型跑多卡但进程时,请在_train函数中计算出多卡需要的bs + echo "current ${model_name} CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=${device_num}, batch_size=${batch_size}" + rm -rf lavis/outputs + + train_cmd="scripts/blip2/train.py \ + --cfg-path \ + lavis/projects/blip2/train/pretrain_stage2.yaml \ + --options \ + run.batch_size_train=${batch_size} + run.max_epoch=${max_epochs} + " + case ${run_process_type} in + SingleP) train_cmd="torchrun --nnodes 1 --nproc_per_node 1 ${train_cmd}" ;; + MultiP) + if [ ${device_num:3} = '32' ];then + train_cmd="torchrun --nnodes ${node_num} --nproc_per_node 8 --node_rank ${node_rank} --master_addr ${master_addr} --master_port ${master_port} ${train_cmd}" + else + train_cmd="torchrun --nnodes 1 --nproc_per_node 8 ${train_cmd}" + fi;; + *) echo "choose run_mode(SingleP or MultiP)"; exit 1; + esac + + timeout 30m ${train_cmd} > ${log_file} 2>&1 + rm -rf lavis/outputs + # 这个判断,无论是否成功都是0 + if [ $? 
-ne 0 ];then + echo -e "${model_name}, FAIL" + export job_fail_flag=1 + else + echo -e "${model_name}, SUCCESS" + export job_fail_flag=0 + fi + } + # 注释掉,会异常退出 + #kill -9 `ps -ef|grep 'python'|awk '{print $2}'` + #cd - + + +_set_params $@ +export frame_version=`python -c "import torch;print(torch.__version__)"` +echo "---------frame_version is torch ${frame_version}" +echo "---------model_branch is ${model_branch}" +echo "---------model_commit is ${model_commit}" +job_bt=`date '+%Y%m%d%H%M%S'` +_train +job_et=`date '+%Y%m%d%H%M%S'` +export model_run_time=$((${job_et}-${job_bt})) +_analysis_log \ No newline at end of file diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/train.py b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/train.py new file mode 100644 index 0000000000..482231130e --- /dev/null +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/train.py @@ -0,0 +1,104 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +import argparse +import os +import random +import sys +# sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "models/LAVIS")) +import numpy as np +import torch +import torch.backends.cudnn as cudnn + +import lavis.tasks as tasks +from lavis.common.config import Config +from lavis.common.dist_utils import get_rank, init_distributed_mode +from lavis.common.logger import setup_logger +from lavis.common.optims import ( + LinearWarmupCosineLRScheduler, + LinearWarmupStepLRScheduler, +) +from lavis.common.registry import registry +from lavis.common.utils import now +from lavis.datasets.builders import * +from lavis.models import * +from lavis.processors import * +from lavis.runners import * +from lavis.tasks import * +import os +os.chdir('models/LAVIS') + +def parse_args(): + parser = argparse.ArgumentParser(description="Training") + + parser.add_argument("--cfg-path",default="lavis/projects/blip2/train/pretrain_stage2.yaml",help="path to configuration file.", + ) + parser.add_argument( + "--options", + nargs="+", + help="override some settings in the used config, the key-value pair " + "in xxx=yyy format will be merged into config file (deprecate), " + "change to --cfg-options instead.", + ) + + args = parser.parse_args() + # if 'LOCAL_RANK' not in os.environ: + # os.environ['LOCAL_RANK'] = str(args.local_rank) + + return args + + +def setup_seeds(config): + seed = config.run_cfg.seed + get_rank() + + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + + cudnn.benchmark = False + cudnn.deterministic = True + + +def get_runner_class(cfg): + """ + Get runner class from config. Default to epoch-based runner. + """ + runner_cls = registry.get_runner_class(cfg.run_cfg.get("runner", "runner_base")) + + return runner_cls + + +def main(): + # allow auto-dl completes on main process without timeout when using NCCL backend. + # os.environ["NCCL_BLOCKING_WAIT"] = "1" + + # set before init_distributed_mode() to ensure the same job_id shared across all ranks. + job_id = now() + + cfg = Config(parse_args()) + + init_distributed_mode(cfg.run_cfg) + + setup_seeds(cfg) + + # set after init_distributed_mode() to only log on master. 
+ setup_logger() + + cfg.pretty_print() + + task = tasks.setup_task(cfg) + datasets = task.build_datasets(cfg) + model = task.build_model(cfg) + + runner = get_runner_class(cfg)( + cfg=cfg, job_id=job_id, task=task, model=model, datasets=datasets + ) + runner.train() + + +if __name__ == "__main__": + main() From 871a0921664bd9f5c5750b6aa25970893561f975 Mon Sep 17 00:00:00 2001 From: wjm202 <897383984@qq.com> Date: Mon, 25 Sep 2023 14:14:01 +0800 Subject: [PATCH 02/16] add blip2 benchmark --- frame_benchmark/docker_images.yaml | 1 + frame_benchmark/models_path.yaml | 1 + ...sh => blip2-opt_pretrain_stage2_bs128_fp16_DP.sh} | 6 +++--- ....sh => blip2-opt_pretrain_stage2_bs32_fp16_DP.sh} | 12 ++++++------ ....sh => blip2-opt_pretrain_stage2_bs64_fp16_DP.sh} | 8 ++++---- ...sh => blip2-opt_pretrain_stage2_bs128_fp16_DP.sh} | 8 ++++---- ....sh => blip2-opt_pretrain_stage2_bs32_fp16_DP.sh} | 10 +++++----- ....sh => blip2-opt_pretrain_stage2_bs64_fp16_DP.sh} | 12 ++++++------ .../dynamic/PaddleMIX/scripts/blip2/prepare.sh | 2 +- 9 files changed, 31 insertions(+), 29 deletions(-) rename frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/{blip2-opt-2.7b-stage2_bs32_fp16_DP.sh => blip2-opt_pretrain_stage2_bs128_fp16_DP.sh} (93%) rename frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/{blip2-opt-2.7b-stage2_bs128_fp16_DP.sh => blip2-opt_pretrain_stage2_bs32_fp16_DP.sh} (73%) rename frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/{blip2-opt-2.7b-stage2_bs64_fp16_DP.sh => blip2-opt_pretrain_stage2_bs64_fp16_DP.sh} (76%) rename frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/{blip2-opt-2.7b-stage2_bs32_fp16_DP.sh => blip2-opt_pretrain_stage2_bs128_fp16_DP.sh} (76%) rename frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/{blip2-opt-2.7b-stage2_bs64_fp16_DP.sh => blip2-opt_pretrain_stage2_bs32_fp16_DP.sh} (74%) rename frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/{blip2-opt-2.7b-stage2_bs128_fp16_DP.sh => blip2-opt_pretrain_stage2_bs64_fp16_DP.sh} (73%) diff --git a/frame_benchmark/docker_images.yaml b/frame_benchmark/docker_images.yaml index 95556ca3c2..4a95366442 100644 --- a/frame_benchmark/docker_images.yaml +++ b/frame_benchmark/docker_images.yaml @@ -109,3 +109,4 @@ pytorch: Cylinder_2d: iregistry.baidu-int.com/paddlecloud/base-images:paddlecloud-ubuntu18.04-gcc8.2-cuda11.7-cudnn8-nccl2.12.12-python3.10 stable_diffusion: iregistry.baidu-int.com/paddlecloud/base-images:paddlecloud-ubuntu20.04-gcc12.0-cuda12.0-cudnn8.9-nccl2.17.1-openmpi4.1.5 llama_pretrain: iregistry.baidu-int.com/paddle-benchmark/llama-megatron:cuda11.7-cudnn8.4.1-nccl2.12.12-torch1.13.1 + blip2: iregistry.baidu-int.com/paddlecloud/base-images:paddlecloud-ubuntu18.04-gcc8.2-cuda11.8-cudnn8.6-nccl2.15.5 diff --git a/frame_benchmark/models_path.yaml b/frame_benchmark/models_path.yaml index 133308dc2e..dfaec8ef7d 100644 --- a/frame_benchmark/models_path.yaml +++ b/frame_benchmark/models_path.yaml @@ -109,3 +109,4 @@ pytorch: Cylinder_2d: benchmark/frame_benchmark/pytorch/dynamic/PaddleScience/models/cylinder_2d stable_diffusion: benchmark/frame_benchmark/pytorch/dynamic/PaddleMIX/models/diffusers llama_pretrain: benchmark/frame_benchmark/pytorch/dynamic/PaddleNLP/models/PaddleNLP_LLAMA_torch + blip2: benchmark/frame_benchmark/pytorch/dynamic/PaddleMIX/models/blip2 \ No newline at end of file diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2.7b-stage2_bs32_fp16_DP.sh 
b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt_pretrain_stage2_bs128_fp16_DP.sh similarity index 93% rename from frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2.7b-stage2_bs32_fp16_DP.sh rename to frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt_pretrain_stage2_bs128_fp16_DP.sh index f787fc8147..a86e96008e 100644 --- a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2.7b-stage2_bs32_fp16_DP.sh +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt_pretrain_stage2_bs128_fp16_DP.sh @@ -12,13 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -model_item=blip2-opt-2.7b-stage2 -bs_item=32 +model_item=blip2-opt-stage2 +bs_item=128 fp_item=fp16 run_process_type=SingleP run_mode=DP device_num=N1C1 -max_epochs=1 +max_epochs=20 num_workers=1 #get data diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2.7b-stage2_bs128_fp16_DP.sh b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt_pretrain_stage2_bs32_fp16_DP.sh similarity index 73% rename from frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2.7b-stage2_bs128_fp16_DP.sh rename to frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt_pretrain_stage2_bs32_fp16_DP.sh index fe334f132b..26a9c76fe6 100644 --- a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2.7b-stage2_bs128_fp16_DP.sh +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt_pretrain_stage2_bs32_fp16_DP.sh @@ -12,16 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -model_item=blip2-opt-2.7b-stage2 -bs_item=128 +model_item=blip2-opt-stage2 +bs_item=32 fp_item=fp16 run_process_type=SingleP run_mode=DP device_num=N1C1 -max_epochs=10 +max_epochs=20 num_workers=1 -#get data -bash prepare.sh +# #get data +# bash scripts/blip2/prepare.sh #run -bash ../../LAVIS/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; \ No newline at end of file +bash scripts/blip2/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; \ No newline at end of file diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2.7b-stage2_bs64_fp16_DP.sh b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt_pretrain_stage2_bs64_fp16_DP.sh similarity index 76% rename from frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2.7b-stage2_bs64_fp16_DP.sh rename to frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt_pretrain_stage2_bs64_fp16_DP.sh index de22a1d819..ad84ac2df7 100644 --- a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2.7b-stage2_bs64_fp16_DP.sh +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt_pretrain_stage2_bs64_fp16_DP.sh @@ -12,16 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-model_item=blip2-opt-2.7b-stage2 +model_item=blip2-opt-stage2 bs_item=64 fp_item=fp16 run_process_type=SingleP run_mode=DP device_num=N1C1 -max_epochs=10 +max_epochs=20 num_workers=1 #get data -bash prepare.sh +bash scripts/blip2/prepare.sh #run -bash ../../LAVIS/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; \ No newline at end of file +bash scripts/blip2/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; \ No newline at end of file diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2.7b-stage2_bs32_fp16_DP.sh b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt_pretrain_stage2_bs128_fp16_DP.sh similarity index 76% rename from frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2.7b-stage2_bs32_fp16_DP.sh rename to frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt_pretrain_stage2_bs128_fp16_DP.sh index c84c5e17d1..b4ef725e6f 100644 --- a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2.7b-stage2_bs32_fp16_DP.sh +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt_pretrain_stage2_bs128_fp16_DP.sh @@ -12,16 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -model_item=blip2-opt-2.7b-stage2 +model_item=blip2-opt-stage2 bs_item=128 fp_item=fp16 run_process_type=MultiP run_mode=DP device_num=N1C1 -max_epochs=10 +max_epochs=20 num_workers=1 #get data -bash prepare.sh +bash scripts/blip2/prepare.sh #run -bash ../../LAVIS/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; \ No newline at end of file +bash scripts/blip2/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; \ No newline at end of file diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2.7b-stage2_bs64_fp16_DP.sh b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt_pretrain_stage2_bs32_fp16_DP.sh similarity index 74% rename from frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2.7b-stage2_bs64_fp16_DP.sh rename to frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt_pretrain_stage2_bs32_fp16_DP.sh index 2bcd703603..e52ffa5d9f 100644 --- a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2.7b-stage2_bs64_fp16_DP.sh +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt_pretrain_stage2_bs32_fp16_DP.sh @@ -12,16 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-model_item=blip2-opt-2.7b-stage2 +model_item=blip2-opt-stage2 bs_item=32 -fp_item=fp32 +fp_item=fp16 run_process_type=MultiP run_mode=DP device_num=N1C1 -max_iter=10 +max_epochs=20 num_workers=1 #get data -bash prepare.sh +bash scripts/blip2/prepare.sh #run -bash ../../LAVIS/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; \ No newline at end of file +bash scripts/blip2/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; \ No newline at end of file diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2.7b-stage2_bs128_fp16_DP.sh b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt_pretrain_stage2_bs64_fp16_DP.sh similarity index 73% rename from frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2.7b-stage2_bs128_fp16_DP.sh rename to frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt_pretrain_stage2_bs64_fp16_DP.sh index df9252b77d..05d074f7a9 100644 --- a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2.7b-stage2_bs128_fp16_DP.sh +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt_pretrain_stage2_bs64_fp16_DP.sh @@ -12,16 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -model_item=blip2-opt-2.7b-stage2 -bs_item=32 -fp_item=fp32 +model_item=blip2-opt-stage2 +bs_item=64 +fp_item=fp16 run_process_type=MultiP run_mode=DP device_num=N1C1 -max_epochs=10 +max_iter=20 num_workers=1 #get data -bash prepare.sh +bash scripts/blip2/prepare.sh #run -bash ../../LAVIS/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; \ No newline at end of file +bash scripts/blip2/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; \ No newline at end of file diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/prepare.sh b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/prepare.sh index 7ef87e0503..ed76fdb193 100644 --- a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/prepare.sh +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/prepare.sh @@ -7,4 +7,4 @@ mv coco /export/home/.cache/lavis/ rm -rf coco rm -rf coco.tar.gz # env -# pip install -r scripts/blip2/requirements.txt +pip install -r scripts/blip2/requirements.txt From e8ea4a63ad9926e1841acfd7275fdf6c9726221a Mon Sep 17 00:00:00 2001 From: wjm202 <897383984@qq.com> Date: Mon, 25 Sep 2023 14:59:03 +0800 Subject: [PATCH 03/16] change benchmark code --- .../blip2/N1C1/blip2-opt_pretrain_stage2_bs32_fp16_DP.sh | 2 +- .../pytorch/dynamic/PaddleMIX/scripts/blip2/prepare.sh | 2 +- .../pytorch/dynamic/PaddleMIX/scripts/blip2/run_benchmark.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt_pretrain_stage2_bs32_fp16_DP.sh b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt_pretrain_stage2_bs32_fp16_DP.sh index 26a9c76fe6..abf3ee8ca6 100644 --- a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt_pretrain_stage2_bs32_fp16_DP.sh +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt_pretrain_stage2_bs32_fp16_DP.sh @@ -22,6 +22,6 @@ max_epochs=20 num_workers=1 # #get data -# bash 
scripts/blip2/prepare.sh +bash scripts/blip2/prepare.sh #run bash scripts/blip2/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; \ No newline at end of file diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/prepare.sh b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/prepare.sh index ed76fdb193..7ef87e0503 100644 --- a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/prepare.sh +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/prepare.sh @@ -7,4 +7,4 @@ mv coco /export/home/.cache/lavis/ rm -rf coco rm -rf coco.tar.gz # env -pip install -r scripts/blip2/requirements.txt +# pip install -r scripts/blip2/requirements.txt diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/run_benchmark.sh b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/run_benchmark.sh index 54bf19be94..5fd94d6bb0 100644 --- a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/run_benchmark.sh +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/run_benchmark.sh @@ -60,7 +60,7 @@ function _train(){ --cfg-path \ lavis/projects/blip2/train/pretrain_stage2.yaml \ --options \ - run.batch_size_train=${batch_size} + run.batch_size_train=${batch_size} \ run.max_epoch=${max_epochs} " case ${run_process_type} in From 2ae09bde39e2750fa90bc894f0928c17332b8b26 Mon Sep 17 00:00:00 2001 From: wjm202 <897383984@qq.com> Date: Mon, 25 Sep 2023 17:03:53 +0800 Subject: [PATCH 04/16] cahnge name --- ...16_DP.sh => blip2-opt-2d7b_pretrain_stage2_bs128_fp16_DP.sh} | 2 +- ...p16_DP.sh => blip2-opt-2d7b_pretrain_stage2_bs32_fp16_DP.sh} | 2 +- ...p16_DP.sh => blip2-opt-2d7b_pretrain_stage2_bs64_fp16_DP.sh} | 2 +- ...16_DP.sh => blip2-opt-2d7b_pretrain_stage2_bs128_fp16_DP.sh} | 2 +- ...p16_DP.sh => blip2-opt-2d7b_pretrain_stage2_bs32_fp16_DP.sh} | 2 +- ...p16_DP.sh => blip2-opt-2d7b_pretrain_stage2_bs64_fp16_DP.sh} | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) rename frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/{blip2-opt_pretrain_stage2_bs128_fp16_DP.sh => blip2-opt-2d7b_pretrain_stage2_bs128_fp16_DP.sh} (96%) rename frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/{blip2-opt_pretrain_stage2_bs32_fp16_DP.sh => blip2-opt-2d7b_pretrain_stage2_bs32_fp16_DP.sh} (96%) rename frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/{blip2-opt_pretrain_stage2_bs64_fp16_DP.sh => blip2-opt-2d7b_pretrain_stage2_bs64_fp16_DP.sh} (96%) rename frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/{blip2-opt_pretrain_stage2_bs128_fp16_DP.sh => blip2-opt-2d7b_pretrain_stage2_bs128_fp16_DP.sh} (96%) rename frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/{blip2-opt_pretrain_stage2_bs32_fp16_DP.sh => blip2-opt-2d7b_pretrain_stage2_bs32_fp16_DP.sh} (96%) rename frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/{blip2-opt_pretrain_stage2_bs64_fp16_DP.sh => blip2-opt-2d7b_pretrain_stage2_bs64_fp16_DP.sh} (96%) diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt_pretrain_stage2_bs128_fp16_DP.sh b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2d7b_pretrain_stage2_bs128_fp16_DP.sh similarity index 96% rename from frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt_pretrain_stage2_bs128_fp16_DP.sh rename to frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2d7b_pretrain_stage2_bs128_fp16_DP.sh index a86e96008e..d7ca92786f 100644 --- 
a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt_pretrain_stage2_bs128_fp16_DP.sh +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2d7b_pretrain_stage2_bs128_fp16_DP.sh @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -model_item=blip2-opt-stage2 +model_item=blip2-opt-2d7b_pretrain bs_item=128 fp_item=fp16 run_process_type=SingleP diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt_pretrain_stage2_bs32_fp16_DP.sh b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2d7b_pretrain_stage2_bs32_fp16_DP.sh similarity index 96% rename from frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt_pretrain_stage2_bs32_fp16_DP.sh rename to frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2d7b_pretrain_stage2_bs32_fp16_DP.sh index abf3ee8ca6..c8024e0d10 100644 --- a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt_pretrain_stage2_bs32_fp16_DP.sh +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2d7b_pretrain_stage2_bs32_fp16_DP.sh @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -model_item=blip2-opt-stage2 +model_item=blip2-opt-2d7b_pretrain bs_item=32 fp_item=fp16 run_process_type=SingleP diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt_pretrain_stage2_bs64_fp16_DP.sh b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2d7b_pretrain_stage2_bs64_fp16_DP.sh similarity index 96% rename from frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt_pretrain_stage2_bs64_fp16_DP.sh rename to frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2d7b_pretrain_stage2_bs64_fp16_DP.sh index ad84ac2df7..197e5256c2 100644 --- a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt_pretrain_stage2_bs64_fp16_DP.sh +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2d7b_pretrain_stage2_bs64_fp16_DP.sh @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -model_item=blip2-opt-stage2 +model_item=blip2-opt-2d7b_pretrain bs_item=64 fp_item=fp16 run_process_type=SingleP diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt_pretrain_stage2_bs128_fp16_DP.sh b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_stage2_bs128_fp16_DP.sh similarity index 96% rename from frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt_pretrain_stage2_bs128_fp16_DP.sh rename to frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_stage2_bs128_fp16_DP.sh index b4ef725e6f..c3a8cfc2ef 100644 --- a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt_pretrain_stage2_bs128_fp16_DP.sh +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_stage2_bs128_fp16_DP.sh @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-model_item=blip2-opt-stage2 +model_item=blip2-opt-2d7b_pretrain bs_item=128 fp_item=fp16 run_process_type=MultiP diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt_pretrain_stage2_bs32_fp16_DP.sh b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_stage2_bs32_fp16_DP.sh similarity index 96% rename from frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt_pretrain_stage2_bs32_fp16_DP.sh rename to frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_stage2_bs32_fp16_DP.sh index e52ffa5d9f..d21a23e01b 100644 --- a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt_pretrain_stage2_bs32_fp16_DP.sh +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_stage2_bs32_fp16_DP.sh @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -model_item=blip2-opt-stage2 +model_item=blip2-opt-2d7b_pretrain bs_item=32 fp_item=fp16 run_process_type=MultiP diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt_pretrain_stage2_bs64_fp16_DP.sh b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_stage2_bs64_fp16_DP.sh similarity index 96% rename from frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt_pretrain_stage2_bs64_fp16_DP.sh rename to frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_stage2_bs64_fp16_DP.sh index 05d074f7a9..7ee2db891b 100644 --- a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt_pretrain_stage2_bs64_fp16_DP.sh +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_stage2_bs64_fp16_DP.sh @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-model_item=blip2-opt-stage2 +model_item=blip2-opt-2d7b_pretrain bs_item=64 fp_item=fp16 run_process_type=MultiP From 6e054c6b02d8cd51cf436ed1e0055e057b40abf6 Mon Sep 17 00:00:00 2001 From: wjm202 <897383984@qq.com> Date: Mon, 25 Sep 2023 17:11:55 +0800 Subject: [PATCH 05/16] change docker --- frame_benchmark/docker_images.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frame_benchmark/docker_images.yaml b/frame_benchmark/docker_images.yaml index 05a44b8de1..31960624bd 100644 --- a/frame_benchmark/docker_images.yaml +++ b/frame_benchmark/docker_images.yaml @@ -109,5 +109,5 @@ pytorch: Cylinder_2d: iregistry.baidu-int.com/paddlecloud/base-images:paddlecloud-ubuntu18.04-gcc8.2-cuda11.7-cudnn8-nccl2.12.12-python3.10 stable_diffusion: iregistry.baidu-int.com/paddlecloud/base-images:paddlecloud-ubuntu20.04-gcc12.0-cuda12.0-cudnn8.9-nccl2.17.1-openmpi4.1.5 llama_pretrain: iregistry.baidu-int.com/paddle-benchmark/llama-megatron:cuda11.7-cudnn8.4.1-nccl2.12.12-torch1.13.1 - blip2: iregistry.baidu-int.com/paddlecloud/base-images:paddlecloud-ubuntu18.04-gcc8.2-cuda11.8-cudnn8.6-nccl2.15.5 + blip2: iregistry.baidu-int.com/paddlecloud/base-images:paddlecloud-ubuntu20.04-gcc12.0-cuda12.0-cudnn8.9-nccl2.17.1-openmpi4.1.5 llm: iregistry.baidu-int.com/paddlecloud/base-images:paddlecloud-ubuntu20.04-gcc12.0-cuda12.0-cudnn8.9-nccl2.17.1-openmpi4.1.5 From 4cb5a4887df68b52a729ee2c41fcfb97ebe29d4e Mon Sep 17 00:00:00 2001 From: wjm202 <897383984@qq.com> Date: Mon, 25 Sep 2023 20:59:58 +0800 Subject: [PATCH 06/16] mv folder --- .../N1C1/blip2-opt-2d7b_pretrain_stage2_bs128_fp16_DP.sh | 4 ++-- .../blip2/N1C1/blip2-opt-2d7b_pretrain_stage2_bs32_fp16_DP.sh | 4 ++-- .../blip2/N1C1/blip2-opt-2d7b_pretrain_stage2_bs64_fp16_DP.sh | 4 ++-- .../N1C8/blip2-opt-2d7b_pretrain_stage2_bs128_fp16_DP.sh | 4 ++-- .../blip2/N1C8/blip2-opt-2d7b_pretrain_stage2_bs32_fp16_DP.sh | 4 ++-- .../blip2/N1C8/blip2-opt-2d7b_pretrain_stage2_bs64_fp16_DP.sh | 4 ++-- .../scripts/blip2/{ => benchmark_common}/analysis_log.py | 0 .../PaddleMIX/scripts/blip2/{ => benchmark_common}/prepare.sh | 0 .../scripts/blip2/{ => benchmark_common}/requirements.txt | 0 .../scripts/blip2/{ => benchmark_common}/run_benchmark.sh | 4 ++-- .../PaddleMIX/scripts/blip2/{ => benchmark_common}/train.py | 0 11 files changed, 14 insertions(+), 14 deletions(-) rename frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/{ => benchmark_common}/analysis_log.py (100%) rename frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/{ => benchmark_common}/prepare.sh (100%) rename frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/{ => benchmark_common}/requirements.txt (100%) rename frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/{ => benchmark_common}/run_benchmark.sh (95%) rename frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/{ => benchmark_common}/train.py (100%) diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2d7b_pretrain_stage2_bs128_fp16_DP.sh b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2d7b_pretrain_stage2_bs128_fp16_DP.sh index d7ca92786f..d43b14cd89 100644 --- a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2d7b_pretrain_stage2_bs128_fp16_DP.sh +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2d7b_pretrain_stage2_bs128_fp16_DP.sh @@ -22,6 +22,6 @@ max_epochs=20 num_workers=1 #get data -bash scripts/blip2/prepare.sh +bash scripts/blip2/benchmark_common/prepare.sh #run -bash scripts/blip2/run_benchmark.sh 
${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; \ No newline at end of file +bash scripts/blip2/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; \ No newline at end of file diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2d7b_pretrain_stage2_bs32_fp16_DP.sh b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2d7b_pretrain_stage2_bs32_fp16_DP.sh index c8024e0d10..a6c142c767 100644 --- a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2d7b_pretrain_stage2_bs32_fp16_DP.sh +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2d7b_pretrain_stage2_bs32_fp16_DP.sh @@ -22,6 +22,6 @@ max_epochs=20 num_workers=1 # #get data -bash scripts/blip2/prepare.sh +bash scripts/blip2/benchmark_common/prepare.sh #run -bash scripts/blip2/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; \ No newline at end of file +bash scripts/blip2/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; \ No newline at end of file diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2d7b_pretrain_stage2_bs64_fp16_DP.sh b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2d7b_pretrain_stage2_bs64_fp16_DP.sh index 197e5256c2..1b8f7516a7 100644 --- a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2d7b_pretrain_stage2_bs64_fp16_DP.sh +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2d7b_pretrain_stage2_bs64_fp16_DP.sh @@ -22,6 +22,6 @@ max_epochs=20 num_workers=1 #get data -bash scripts/blip2/prepare.sh +bash scripts/blip2/benchmark_common/prepare.sh #run -bash scripts/blip2/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; \ No newline at end of file +bash scripts/blip2/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; \ No newline at end of file diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_stage2_bs128_fp16_DP.sh b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_stage2_bs128_fp16_DP.sh index c3a8cfc2ef..f85de0aec7 100644 --- a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_stage2_bs128_fp16_DP.sh +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_stage2_bs128_fp16_DP.sh @@ -22,6 +22,6 @@ max_epochs=20 num_workers=1 #get data -bash scripts/blip2/prepare.sh +bash scripts/blip2/benchmark_common/prepare.sh #run -bash scripts/blip2/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; \ No newline at end of file +bash scripts/blip2/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; \ No newline at end of file diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_stage2_bs32_fp16_DP.sh 
b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_stage2_bs32_fp16_DP.sh index d21a23e01b..d44b15a714 100644 --- a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_stage2_bs32_fp16_DP.sh +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_stage2_bs32_fp16_DP.sh @@ -22,6 +22,6 @@ max_epochs=20 num_workers=1 #get data -bash scripts/blip2/prepare.sh +bash scripts/blip2/benchmark_common/prepare.sh #run -bash scripts/blip2/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; \ No newline at end of file +bash scripts/blip2/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; \ No newline at end of file diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_stage2_bs64_fp16_DP.sh b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_stage2_bs64_fp16_DP.sh index 7ee2db891b..a43df72c68 100644 --- a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_stage2_bs64_fp16_DP.sh +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_stage2_bs64_fp16_DP.sh @@ -22,6 +22,6 @@ max_iter=20 num_workers=1 #get data -bash scripts/blip2/prepare.sh +bash scripts/blip2/benchmark_common/prepare.sh #run -bash scripts/blip2/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; \ No newline at end of file +bash scripts/blip2/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; \ No newline at end of file diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/analysis_log.py b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/benchmark_common/analysis_log.py similarity index 100% rename from frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/analysis_log.py rename to frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/benchmark_common/analysis_log.py diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/prepare.sh b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/benchmark_common/prepare.sh similarity index 100% rename from frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/prepare.sh rename to frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/benchmark_common/prepare.sh diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/requirements.txt b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/benchmark_common/requirements.txt similarity index 100% rename from frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/requirements.txt rename to frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/benchmark_common/requirements.txt diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/run_benchmark.sh b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/benchmark_common/run_benchmark.sh similarity index 95% rename from frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/run_benchmark.sh rename to frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/benchmark_common/run_benchmark.sh index 5fd94d6bb0..bd5d6a602b 100644 --- a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/run_benchmark.sh +++ 
b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/benchmark_common/run_benchmark.sh @@ -48,7 +48,7 @@ function _set_params(){ } function _analysis_log(){ - python scripts/blip2/analysis_log.py ${model_item} ${log_file} ${speed_log_file} ${device_num} ${base_batch_size} ${fp_item} ${run_process_type} + python scripts/blip2/benchmark_common/analysis_log.py ${model_item} ${log_file} ${speed_log_file} ${device_num} ${base_batch_size} ${fp_item} ${run_process_type} } function _train(){ @@ -56,7 +56,7 @@ function _train(){ echo "current ${model_name} CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=${device_num}, batch_size=${batch_size}" rm -rf lavis/outputs - train_cmd="scripts/blip2/train.py \ + train_cmd="scripts/blip2/benchmark_common/train.py \ --cfg-path \ lavis/projects/blip2/train/pretrain_stage2.yaml \ --options \ diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/train.py b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/benchmark_common/train.py similarity index 100% rename from frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/train.py rename to frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/benchmark_common/train.py From 02a8347e499e283fbd6e39996692c849608da279 Mon Sep 17 00:00:00 2001 From: wjm202 <897383984@qq.com> Date: Tue, 26 Sep 2023 17:41:34 +0800 Subject: [PATCH 07/16] change name --- frame_benchmark/models_path.yaml | 2 +- ..._fp16_DP.sh => blip2-opt-2d7b_pretrain_bs128_fp16_DP.sh} | 4 ++-- ...2_fp16_DP.sh => blip2-opt-2d7b_pretrain_bs32_fp16_DP.sh} | 6 +++--- ...4_fp16_DP.sh => blip2-opt-2d7b_pretrain_bs64_fp16_DP.sh} | 4 ++-- ..._fp16_DP.sh => blip2-opt-2d7b_pretrain_bs128_fp16_DP.sh} | 4 ++-- ...2_fp16_DP.sh => blip2-opt-2d7b_pretrain_bs32_fp16_DP.sh} | 4 ++-- ...4_fp16_DP.sh => blip2-opt-2d7b_pretrain_bs64_fp16_DP.sh} | 4 ++-- .../PaddleMIX/scripts/blip2/benchmark_common/prepare.sh | 2 +- .../scripts/blip2/benchmark_common/run_benchmark.sh | 4 ++-- .../PaddleMIX/scripts/blip2/benchmark_common/train.py | 4 ++-- 10 files changed, 19 insertions(+), 19 deletions(-) rename frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/{blip2-opt-2d7b_pretrain_stage2_bs128_fp16_DP.sh => blip2-opt-2d7b_pretrain_bs128_fp16_DP.sh} (78%) rename frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/{blip2-opt-2d7b_pretrain_stage2_bs32_fp16_DP.sh => blip2-opt-2d7b_pretrain_bs32_fp16_DP.sh} (76%) rename frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/{blip2-opt-2d7b_pretrain_stage2_bs64_fp16_DP.sh => blip2-opt-2d7b_pretrain_bs64_fp16_DP.sh} (78%) rename frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/{blip2-opt-2d7b_pretrain_stage2_bs128_fp16_DP.sh => blip2-opt-2d7b_pretrain_bs128_fp16_DP.sh} (78%) rename frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/{blip2-opt-2d7b_pretrain_stage2_bs32_fp16_DP.sh => blip2-opt-2d7b_pretrain_bs32_fp16_DP.sh} (78%) rename frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/{blip2-opt-2d7b_pretrain_stage2_bs64_fp16_DP.sh => blip2-opt-2d7b_pretrain_bs64_fp16_DP.sh} (78%) diff --git a/frame_benchmark/models_path.yaml b/frame_benchmark/models_path.yaml index 1a098ad8d7..b11d1f13c1 100644 --- a/frame_benchmark/models_path.yaml +++ b/frame_benchmark/models_path.yaml @@ -109,5 +109,5 @@ pytorch: Cylinder_2d: benchmark/frame_benchmark/pytorch/dynamic/PaddleScience/models/cylinder_2d stable_diffusion: benchmark/frame_benchmark/pytorch/dynamic/PaddleMIX/models/diffusers llama_pretrain: 
benchmark/frame_benchmark/pytorch/dynamic/PaddleNLP/models/PaddleNLP_LLAMA_torch - blip2: benchmark/frame_benchmark/pytorch/dynamic/PaddleMIX/models/blip2 + blip2: benchmark/frame_benchmark/pytorch/dynamic/PaddleMIX/models/LAVIS llm: benchmark/frame_benchmark/pytorch/dynamic/PaddleNLP/models/transformers diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2d7b_pretrain_stage2_bs128_fp16_DP.sh b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2d7b_pretrain_bs128_fp16_DP.sh similarity index 78% rename from frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2d7b_pretrain_stage2_bs128_fp16_DP.sh rename to frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2d7b_pretrain_bs128_fp16_DP.sh index d43b14cd89..de1cfbc655 100644 --- a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2d7b_pretrain_stage2_bs128_fp16_DP.sh +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2d7b_pretrain_bs128_fp16_DP.sh @@ -22,6 +22,6 @@ max_epochs=20 num_workers=1 #get data -bash scripts/blip2/benchmark_common/prepare.sh +bash prepare.sh #run -bash scripts/blip2/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; \ No newline at end of file +bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; \ No newline at end of file diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2d7b_pretrain_stage2_bs32_fp16_DP.sh b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2d7b_pretrain_bs32_fp16_DP.sh similarity index 76% rename from frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2d7b_pretrain_stage2_bs32_fp16_DP.sh rename to frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2d7b_pretrain_bs32_fp16_DP.sh index a6c142c767..d94c14c0ff 100644 --- a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2d7b_pretrain_stage2_bs32_fp16_DP.sh +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2d7b_pretrain_bs32_fp16_DP.sh @@ -21,7 +21,7 @@ device_num=N1C1 max_epochs=20 num_workers=1 -# #get data -bash scripts/blip2/benchmark_common/prepare.sh +#get data +bash prepare.sh #run -bash scripts/blip2/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; \ No newline at end of file +bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; \ No newline at end of file diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2d7b_pretrain_stage2_bs64_fp16_DP.sh b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2d7b_pretrain_bs64_fp16_DP.sh similarity index 78% rename from frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2d7b_pretrain_stage2_bs64_fp16_DP.sh rename to frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2d7b_pretrain_bs64_fp16_DP.sh index 1b8f7516a7..1ed8ad4420 100644 --- a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2d7b_pretrain_stage2_bs64_fp16_DP.sh +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2d7b_pretrain_bs64_fp16_DP.sh @@ -22,6 +22,6 @@ 
max_epochs=20 num_workers=1 #get data -bash scripts/blip2/benchmark_common/prepare.sh +bash prepare.sh #run -bash scripts/blip2/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; \ No newline at end of file +bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; \ No newline at end of file diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_stage2_bs128_fp16_DP.sh b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_bs128_fp16_DP.sh similarity index 78% rename from frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_stage2_bs128_fp16_DP.sh rename to frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_bs128_fp16_DP.sh index f85de0aec7..d6280d297c 100644 --- a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_stage2_bs128_fp16_DP.sh +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_bs128_fp16_DP.sh @@ -22,6 +22,6 @@ max_epochs=20 num_workers=1 #get data -bash scripts/blip2/benchmark_common/prepare.sh +bash prepare.sh #run -bash scripts/blip2/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; \ No newline at end of file +bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; \ No newline at end of file diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_stage2_bs32_fp16_DP.sh b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_bs32_fp16_DP.sh similarity index 78% rename from frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_stage2_bs32_fp16_DP.sh rename to frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_bs32_fp16_DP.sh index d44b15a714..573de1684e 100644 --- a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_stage2_bs32_fp16_DP.sh +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_bs32_fp16_DP.sh @@ -22,6 +22,6 @@ max_epochs=20 num_workers=1 #get data -bash scripts/blip2/benchmark_common/prepare.sh +bash prepare.sh #run -bash scripts/blip2/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; \ No newline at end of file +bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; \ No newline at end of file diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_stage2_bs64_fp16_DP.sh b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_bs64_fp16_DP.sh similarity index 78% rename from frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_stage2_bs64_fp16_DP.sh rename to frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_bs64_fp16_DP.sh index a43df72c68..815125ce7d 100644 --- 
a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_stage2_bs64_fp16_DP.sh +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_bs64_fp16_DP.sh @@ -22,6 +22,6 @@ max_iter=20 num_workers=1 #get data -bash scripts/blip2/benchmark_common/prepare.sh +bash prepare.sh #run -bash scripts/blip2/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; \ No newline at end of file +bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; \ No newline at end of file diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/benchmark_common/prepare.sh b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/benchmark_common/prepare.sh index 7ef87e0503..17a455f063 100644 --- a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/benchmark_common/prepare.sh +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/benchmark_common/prepare.sh @@ -7,4 +7,4 @@ mv coco /export/home/.cache/lavis/ rm -rf coco rm -rf coco.tar.gz # env -# pip install -r scripts/blip2/requirements.txt +pip install -r requirements.txt diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/benchmark_common/run_benchmark.sh b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/benchmark_common/run_benchmark.sh index bd5d6a602b..b9e2bd1d37 100644 --- a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/benchmark_common/run_benchmark.sh +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/benchmark_common/run_benchmark.sh @@ -48,7 +48,7 @@ function _set_params(){ } function _analysis_log(){ - python scripts/blip2/benchmark_common/analysis_log.py ${model_item} ${log_file} ${speed_log_file} ${device_num} ${base_batch_size} ${fp_item} ${run_process_type} + python analysis_log.py ${model_item} ${log_file} ${speed_log_file} ${device_num} ${base_batch_size} ${fp_item} ${run_process_type} } function _train(){ @@ -56,7 +56,7 @@ function _train(){ echo "current ${model_name} CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=${device_num}, batch_size=${batch_size}" rm -rf lavis/outputs - train_cmd="scripts/blip2/benchmark_common/train.py \ + train_cmd="train.py \ --cfg-path \ lavis/projects/blip2/train/pretrain_stage2.yaml \ --options \ diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/benchmark_common/train.py b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/benchmark_common/train.py index 482231130e..3ea3150f92 100644 --- a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/benchmark_common/train.py +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/benchmark_common/train.py @@ -29,8 +29,8 @@ from lavis.processors import * from lavis.runners import * from lavis.tasks import * -import os -os.chdir('models/LAVIS') +# import os +# os.chdir('models/LAVIS') def parse_args(): parser = argparse.ArgumentParser(description="Training") From e0b5b1470056260af6edca3639344ea5f752de76 Mon Sep 17 00:00:00 2001 From: wjm202 <897383984@qq.com> Date: Thu, 28 Sep 2023 10:33:00 +0800 Subject: [PATCH 08/16] add proxy --- .../scripts/blip2/benchmark_common/run_benchmark.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/benchmark_common/run_benchmark.sh b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/benchmark_common/run_benchmark.sh index 
b9e2bd1d37..6e7e331015 100644 --- a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/benchmark_common/run_benchmark.sh +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/benchmark_common/run_benchmark.sh @@ -10,7 +10,7 @@ function _set_params(){ run_mode=${5:-"DP"} # (必选) MP模型并行|DP数据并行|PP流水线并行|混合并行DP1-MP1-PP1|DP1-MP4-PP1 device_num=${6:-"N1C1"} # (必选) 使用的卡数量,N1C1|N1C8|N4C8 (4机32卡) profiling=${PROFILING:-"false"} # (必选) Profiling 开关,默认关闭,通过全局变量传递 - model_repo="blip2" # (必选) 模型套件的名字 + model_repo="LAVIS" # (必选) 模型套件的名字 speed_unit="sample/sec" # (必选)速度指标单位 skip_steps=4 # (必选)解析日志,跳过模型前几个性能不稳定的step keyword="ips:" # (必选)解析日志,筛选出性能数据所在行的关键字 @@ -91,6 +91,11 @@ function _train(){ _set_params $@ +echo "https_proxy $HTTPS_PRO" +echo "http_proxy $HTTP_PRO" +export https_proxy=$HTTPS_PRO +export http_proxy=$HTTP_PRO +export no_proxy=localhost,bj.bcebos.com,su.bcebos.com export frame_version=`python -c "import torch;print(torch.__version__)"` echo "---------frame_version is torch ${frame_version}" echo "---------model_branch is ${model_branch}" From 124559cc350016e25a9bab40884448a5764aeb1a Mon Sep 17 00:00:00 2001 From: wjm <897383984@qq.com> Date: Mon, 9 Oct 2023 19:23:07 +0800 Subject: [PATCH 09/16] Update prepare.sh --- .../dynamic/PaddleMIX/scripts/blip2/benchmark_common/prepare.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/benchmark_common/prepare.sh b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/benchmark_common/prepare.sh index 17a455f063..edcbbce8ab 100644 --- a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/benchmark_common/prepare.sh +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/benchmark_common/prepare.sh @@ -1,4 +1,4 @@ - +mkdir -p /export/home/.cache/lavis/coco/ rm -rf /export/home/.cache/lavis/coco/ # dataset wget https://paddlenlp.bj.bcebos.com/models/community/paddlemix/benchmark/blip2/coco.tar.gz From 6a310d01265e0eff088d8368a8328e1dd71fdcd0 Mon Sep 17 00:00:00 2001 From: wjm <897383984@qq.com> Date: Mon, 9 Oct 2023 19:27:14 +0800 Subject: [PATCH 10/16] Update requirements.txt --- .../PaddleMIX/scripts/blip2/benchmark_common/requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/benchmark_common/requirements.txt b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/benchmark_common/requirements.txt index 9db1dc9308..5eb2a8f1ca 100644 --- a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/benchmark_common/requirements.txt +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/benchmark_common/requirements.txt @@ -21,8 +21,8 @@ sentencepiece spacy streamlit timm==0.4.12 -torch>=1.10.0 -torchvision +torch==1.13.1 +torchvision==0.14.1 tqdm transformers>=4.28.0 webdataset From 2f27d567a729b666705377f6e5fa1b4f7c20537a Mon Sep 17 00:00:00 2001 From: wjm <897383984@qq.com> Date: Mon, 9 Oct 2023 19:37:05 +0800 Subject: [PATCH 11/16] Update blip2-opt-2d7b_pretrain_bs128_fp16_DP.sh --- .../blip2/N1C8/blip2-opt-2d7b_pretrain_bs128_fp16_DP.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_bs128_fp16_DP.sh b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_bs128_fp16_DP.sh index d6280d297c..46ba7a1bde 100644 --- a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_bs128_fp16_DP.sh +++ 
b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_bs128_fp16_DP.sh @@ -17,11 +17,11 @@ bs_item=128 fp_item=fp16 run_process_type=MultiP run_mode=DP -device_num=N1C1 +device_num=N1C8 max_epochs=20 num_workers=1 #get data bash prepare.sh #run -bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; \ No newline at end of file +bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; From 0a190ba1940c968994b984e8bff68cde920171c3 Mon Sep 17 00:00:00 2001 From: wjm <897383984@qq.com> Date: Mon, 9 Oct 2023 19:37:25 +0800 Subject: [PATCH 12/16] Update blip2-opt-2d7b_pretrain_bs32_fp16_DP.sh --- .../blip2/N1C8/blip2-opt-2d7b_pretrain_bs32_fp16_DP.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_bs32_fp16_DP.sh b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_bs32_fp16_DP.sh index 573de1684e..3c915a6af1 100644 --- a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_bs32_fp16_DP.sh +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_bs32_fp16_DP.sh @@ -17,11 +17,11 @@ bs_item=32 fp_item=fp16 run_process_type=MultiP run_mode=DP -device_num=N1C1 +device_num=N1C8 max_epochs=20 num_workers=1 #get data bash prepare.sh #run -bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; \ No newline at end of file +bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; From 200069d5813cdbbc4272b623c5805e01627a8c25 Mon Sep 17 00:00:00 2001 From: wjm <897383984@qq.com> Date: Mon, 9 Oct 2023 19:37:36 +0800 Subject: [PATCH 13/16] Update blip2-opt-2d7b_pretrain_bs64_fp16_DP.sh --- .../blip2/N1C8/blip2-opt-2d7b_pretrain_bs64_fp16_DP.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_bs64_fp16_DP.sh b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_bs64_fp16_DP.sh index 815125ce7d..ee51413436 100644 --- a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_bs64_fp16_DP.sh +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_bs64_fp16_DP.sh @@ -17,11 +17,11 @@ bs_item=64 fp_item=fp16 run_process_type=MultiP run_mode=DP -device_num=N1C1 +device_num=N1C8 max_iter=20 num_workers=1 #get data bash prepare.sh #run -bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; \ No newline at end of file +bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; From cd11365bbd39a7a178c3e6259900e6d06cfc6338 Mon Sep 17 00:00:00 2001 From: wjm <897383984@qq.com> Date: Wed, 11 Oct 2023 17:47:22 +0800 Subject: [PATCH 14/16] Update run_benchmark.sh --- .../PaddleMIX/scripts/blip2/benchmark_common/run_benchmark.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/benchmark_common/run_benchmark.sh 
b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/benchmark_common/run_benchmark.sh index 6e7e331015..f8b9fc4e35 100644 --- a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/benchmark_common/run_benchmark.sh +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/benchmark_common/run_benchmark.sh @@ -15,7 +15,7 @@ function _set_params(){ skip_steps=4 # (必选)解析日志,跳过模型前几个性能不稳定的step keyword="ips:" # (必选)解析日志,筛选出性能数据所在行的关键字 convergence_key="" # (可选)解析日志,筛选出收敛数据所在行的关键字 如:convergence_key="loss:" - max_epochs=${7:-"1"} # (可选)需保证模型执行时间在5分钟内,需要修改代码提前中断的直接提PR 合入套件 或是max_epoch + max_epochs=${7:-"10"} # (可选)需保证模型执行时间在5分钟内,需要修改代码提前中断的直接提PR 合入套件 或是max_epoch num_workers=${8:-"3"} # (可选) # Added for distributed training @@ -104,4 +104,4 @@ job_bt=`date '+%Y%m%d%H%M%S'` _train job_et=`date '+%Y%m%d%H%M%S'` export model_run_time=$((${job_et}-${job_bt})) -_analysis_log \ No newline at end of file +_analysis_log From c1b63214f10b1facf2b747ea13ced4d5c41f642c Mon Sep 17 00:00:00 2001 From: wjm <897383984@qq.com> Date: Wed, 11 Oct 2023 17:48:15 +0800 Subject: [PATCH 15/16] Update blip2-opt-2d7b_pretrain_bs64_fp16_DP.sh --- .../scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_bs64_fp16_DP.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_bs64_fp16_DP.sh b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_bs64_fp16_DP.sh index ee51413436..98dcfe5152 100644 --- a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_bs64_fp16_DP.sh +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C8/blip2-opt-2d7b_pretrain_bs64_fp16_DP.sh @@ -18,7 +18,7 @@ fp_item=fp16 run_process_type=MultiP run_mode=DP device_num=N1C8 -max_iter=20 +max_epochs=20 num_workers=1 #get data From 70f18de44ff08af29de86dfb83a6f4d15d17172d Mon Sep 17 00:00:00 2001 From: wjm <897383984@qq.com> Date: Thu, 12 Oct 2023 14:05:36 +0800 Subject: [PATCH 16/16] Update and rename blip2-opt-2d7b_pretrain_bs128_fp16_DP.sh to blip2-opt-2d7b_pretrain_bs16_fp16_DP.sh --- ...128_fp16_DP.sh => blip2-opt-2d7b_pretrain_bs16_fp16_DP.sh} | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/{blip2-opt-2d7b_pretrain_bs128_fp16_DP.sh => blip2-opt-2d7b_pretrain_bs16_fp16_DP.sh} (97%) diff --git a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2d7b_pretrain_bs128_fp16_DP.sh b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2d7b_pretrain_bs16_fp16_DP.sh similarity index 97% rename from frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2d7b_pretrain_bs128_fp16_DP.sh rename to frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2d7b_pretrain_bs16_fp16_DP.sh index de1cfbc655..f80f3ceb09 100644 --- a/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2d7b_pretrain_bs128_fp16_DP.sh +++ b/frame_benchmark/pytorch/dynamic/PaddleMIX/scripts/blip2/N1C1/blip2-opt-2d7b_pretrain_bs16_fp16_DP.sh @@ -13,7 +13,7 @@ # limitations under the License. 
model_item=blip2-opt-2d7b_pretrain -bs_item=128 +bs_item=16 fp_item=fp16 run_process_type=SingleP run_mode=DP @@ -24,4 +24,4 @@ num_workers=1 #get data bash prepare.sh #run -bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; \ No newline at end of file +bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1;
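For reference, after patch 16/16 the single-card launcher N1C1/blip2-opt-2d7b_pretrain_bs16_fp16_DP.sh reduces to roughly the following (pieced together from the hunks above; the Apache-2.0 header is omitted, and device_num/max_epochs are carried over from earlier revisions of the file, which the later hunks do not touch):

    model_item=blip2-opt-2d7b_pretrain
    bs_item=16
    fp_item=fp16
    run_process_type=SingleP
    run_mode=DP
    device_num=N1C1
    max_epochs=20
    num_workers=1

    #get data
    bash prepare.sh
    #run
    bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1;

Because prepare.sh and run_benchmark.sh are now referenced by bare relative paths instead of scripts/blip2/benchmark_common/..., the launcher only works when the harness runs it from (or stages it next to) the benchmark_common files.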
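The run_benchmark.sh changes above also expect some state from the calling environment: HTTPS_PRO/HTTP_PRO are re-exported as https_proxy/http_proxy, PROFILING feeds _set_params, and CUDA_VISIBLE_DEVICES is echoed by _train. A minimal manual invocation might look like the sketch below; the proxy address is a placeholder, the argument order simply mirrors what the launchers pass (the defaults for the last four positions are visible in _set_params), and the working directory is assumed to be wherever the harness stages prepare.sh, run_benchmark.sh and train.py inside the LAVIS checkout, since run_benchmark.sh also resolves lavis/projects/... and train.py by bare relative paths:

    # Sketch only; argument values mirror N1C1/blip2-opt-2d7b_pretrain_bs16_fp16_DP.sh
    export HTTPS_PRO=http://proxy.example.com:8080   # placeholder proxy, re-exported as https_proxy
    export HTTP_PRO=http://proxy.example.com:8080    # placeholder proxy, re-exported as http_proxy
    export PROFILING=false                           # read by _set_params
    export CUDA_VISIBLE_DEVICES=0                    # single-card (N1C1) run
    bash prepare.sh                                  # fetches the COCO tarball and pip-installs requirements.txt
    bash run_benchmark.sh blip2-opt-2d7b_pretrain 16 fp16 SingleP DP N1C1 20 1 2>&1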