Improve run_llama_train.sh args and add local-ranks-filter
wconstab committed Feb 10, 2024
1 parent e1b61c3 commit 9b9ee1f
Showing 1 changed file with 17 additions and 5 deletions.

run_llama_train.sh
@@ -4,14 +4,26 @@ set -ex

TRAINER_DIR=${1:-/home/$USER/local/torchtrain}

MODEL="debugmodel"
NGPU=8
MP=4
# use envs as local overrides for convenience
# e.g.
# LOG_RANK=0,1 NGPU=4 SP=2 ./run_llama_train.sh

MODEL=${MODEL:-"debugmodel"}
NGPU=${NGPU:-"8"}
PP=${PP:-"1"}
SP=${SP:-"1"}
DP=${DP:-"-1"}

# by default log just rank 0 output,
LOG_RANK=${LOG_RANK:-0}

 # Set this string to a valid folder path to enable checkpointing
-CHECKPOINT_FOLDER=""
+CHECKPOINT_FOLDER=${CHECKPOINT_FOLDER:-""}
 # Please adjust this to a longer interval in practice; the unit is training steps.
-CHECKPOINT_INTERVAL=5
+CHECKPOINT_INTERVAL=${CHECKPOINT_INTERVAL:-5}

 torchrun --nproc_per_node=${NGPU} \
+--local-ranks-filter ${LOG_RANK} --role rank --tee 3 \
 train.py --steps 10 --compile \
+--pp_degree ${PP} --sp_degree ${SP} --dp_degree ${DP} \
 --checkpoint-folder=${CHECKPOINT_FOLDER} --checkpoint-interval=${CHECKPOINT_INTERVAL}
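
On the launcher side, --tee 3 duplicates each worker's stdout and stderr into per-rank log files while still streaming them to the console, --local-ranks-filter restricts that console output to the listed local ranks, and --role rank makes the per-line prefix read like [rank0]: instead of using the default role name (this description assumes torchrun's standard <role><rank> log-prefix format).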
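
Because every knob above uses the ${VAR:-default} shell pattern, each one can be overridden per run through environment variables instead of by editing the script. A couple of illustrative invocations (the specific values are examples, not recommendations):

# default settings: 8 GPUs, console output from rank 0 only
./run_llama_train.sh

# 4 GPUs with 2-way sequence parallelism, showing logs from local ranks 0 and 1
LOG_RANK=0,1 NGPU=4 SP=2 ./run_llama_train.sh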
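
Checkpointing stays off while CHECKPOINT_FOLDER is empty, and the checkpoint variables can be overridden the same way. A minimal sketch, assuming a scratch directory (the path and interval here are hypothetical):

# write a checkpoint every 100 steps into a scratch folder
CHECKPOINT_FOLDER=/tmp/llama_ckpt CHECKPOINT_INTERVAL=100 ./run_llama_train.sh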
