From 271479fbb2502d0e7fdb63698aea4c6d659d9105 Mon Sep 17 00:00:00 2001
From: whenwen
Date: Thu, 12 Dec 2024 07:17:33 +0000
Subject: [PATCH] Training succeeds and the model seems to be saved properly,
 but loading the model fails with an error. Example wandb link:
 https://wandb.ai/understanding-sam/levanter/runs/pdi0vc3w?nw=nwuserwhen

---
Note: the script body was revised after this patch was exported, so the blob
id on the "index" line below still refers to the original 10-line version.

 error_loading_model.sh | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)
 create mode 100644 error_loading_model.sh

diff --git a/error_loading_model.sh b/error_loading_model.sh
new file mode 100644
index 000000000..8a555be70
--- /dev/null
+++ b/error_loading_model.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+#
+# Repro for a checkpoint-loading failure: training ran and the checkpoint
+# appeared to save, but resuming via --trainer.load_checkpoint_path fails
+# with an error.
+#
+# Usage (paths below are relative; presumably run from the repository root):
+#   WANDB_API_KEY=YOUR_KEY bash error_loading_model.sh
+set -euo pipefail
+
+# Take the W&B key from the caller's environment so no secret is committed.
+: "${WANDB_API_KEY:?export WANDB_API_KEY before running this script}"
+
+# Start an ssh-agent and load the variables it prints into this shell.
+eval "$(ssh-agent -s)"
+
+# Everything after `--` is handed to infra/babysit-tpu-vm.sh (presumably the
+# command it runs remotely); the last flag names the step-4000 checkpoint path.
+bash infra/babysit-tpu-vm.sh muon-debug -z us-central2-b -t v4-128 --preemptible -- \
+  WANDB_API_KEY="${WANDB_API_KEY}" \
+  bash levanter/infra/run.sh python \
+  levanter/src/levanter/main/train_lm.py \
+  --config_path levanter/config/llama2_100M_muon.yaml \
+  --trainer.checkpointer.base_path gs://marin-us-central2/scratch/kaiyue/checkpoints/muon/llama2_100M_constant \
+  --optimizer.type muon \
+  --trainer.num_train_steps 10000 \
+  --trainer.load_checkpoint_path gs://marin-us-central2/scratch/kaiyue/checkpoints/muon/llama2_100M_constant/tjo9vxfb/step-4000