Skip to content

Commit

Permalink
Fix some merge errors
Browse files Browse the repository at this point in the history
Summary:

Test Plan:

Reviewers:

Subscribers:

Tasks:

Tags:
  • Loading branch information
fegin committed Feb 2, 2024
1 parent 017680f commit 0541be2
Show file tree
Hide file tree
Showing 3 changed files with 5 additions and 14 deletions.
7 changes: 3 additions & 4 deletions run_llama_train.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,10 @@ MODEL="debugmodel"
NGPU=8
MP=4
# Change this string to a meaningful one to enable checkpointing.
CHECKPOINT_FOLDER="/tmp/chienchin"
CHECKPOINT_FOLDER=""
# Please adjust this to a longer interval. The interval is measured in steps.
CHECKPOINT_INTERVAL=2
CHECKPOINT_INTERVAL_TYPE="seconds"
CHECKPOINT_INTERVAL=5

torchrun --nproc_per_node=${NGPU} \
train.py --steps 10 \
train.py --steps 10 --compile \
--checkpoint-folder=${CHECKPOINT_FOLDER} --checkpoint-interval=${CHECKPOINT_INTERVAL}
7 changes: 1 addition & 6 deletions torchtrain/checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,13 +110,8 @@ def save(self, curr_step: int, force: bool = False) -> None:
rank0_log(f"Saving a checkpoint in step {curr_step}.")
begin = time.monotonic()
dcp.save(self.states, checkpoint_id=self.create_checkpoint_id(curr_step))
<<<<<<< HEAD
self.reset(curr_step)
rank0_log(
=======
self.reset()
logging.warning(
>>>>>>> a0257bc (Simplify the code and use steps instead of seconds as the default unit of measurement.)
rank0_log(
f"Finish saving the checkpoint in step {curr_step}. "
f"{time.monotonic() - begin} seconds"
)
Expand Down
5 changes: 1 addition & 4 deletions train.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,10 +178,7 @@ def main(args):
)
scheduler.step()

checkpoint.save(train_state.step)

if train_state.step == args.steps:
checkpoint.save(train_state.step, force=True)
checkpoint.save(train_state.step, force=(train_state.step == args.steps))


if __name__ == "__main__":
Expand Down

0 comments on commit 0541be2

Please sign in to comment.