Improve run_llama_train.sh args and add local-ranks-filter
wconstab committed Feb 10, 2024
1 parent e1b61c3 commit 9b9ee1f
Showing 1 changed file with 17 additions and 5 deletions.

run_llama_train.sh
@@ -4,14 +4,26 @@ set -ex

TRAINER_DIR=${1:-/home/$USER/local/torchtrain}

MODEL="debugmodel"
NGPU=8
MP=4
# use envs as local overrides for convenience
# e.g.
# LOG_RANK=0,1 NGPU=4 SP=2 ./run_llama_train.sh

MODEL=${MODEL:-"debugmodel"}
NGPU=${NGPU:-"8"}
PP=${PP:-"1"}
SP=${SP:-"1"}
DP=${DP:-"-1"}

# by default log just rank 0 output,
LOG_RANK=${LOG_RANK:-0}

 # Set this string to a valid folder path to enable checkpointing
-CHECKPOINT_FOLDER=""
+CHECKPOINT_FOLDER=${CHECKPOINT_FOLDER:-""}
 # Please adjust this to a longer interval in practice; the unit is training steps.
-CHECKPOINT_INTERVAL=5
+CHECKPOINT_INTERVAL=${CHECKPOINT_INTERVAL:-5}

 torchrun --nproc_per_node=${NGPU} \
+--local-ranks-filter ${LOG_RANK} --role rank --tee 3 \
 train.py --steps 10 --compile \
+--pp_degree ${PP} --sp_degree ${SP} --dp_degree ${DP} \
 --checkpoint-folder=${CHECKPOINT_FOLDER} --checkpoint-interval=${CHECKPOINT_INTERVAL}
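
On the launcher side, --tee 3 duplicates each worker's stdout and stderr into per-rank log files while still streaming them to the console, --local-ranks-filter restricts that console output to the listed local ranks, and --role rank makes the per-line prefix read like [rank0]: instead of using the default role name (this description assumes torchrun's standard <role><rank> log-prefix format).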
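
Because every knob above uses the ${VAR:-default} shell pattern, each one can be overridden per run through environment variables instead of by editing the script. A couple of illustrative invocations (the specific values are examples, not recommendations):

# default settings: 8 GPUs, console output from rank 0 only
./run_llama_train.sh

# 4 GPUs with 2-way sequence parallelism, showing logs from local ranks 0 and 1
LOG_RANK=0,1 NGPU=4 SP=2 ./run_llama_train.sh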
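
Checkpointing stays off while CHECKPOINT_FOLDER is empty, and the checkpoint variables can be overridden the same way. A minimal sketch, assuming a scratch directory (the path and interval here are hypothetical):

# write a checkpoint every 100 steps into a scratch folder
CHECKPOINT_FOLDER=/tmp/llama_ckpt CHECKPOINT_INTERVAL=100 ./run_llama_train.sh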
