From 9e372d3433b61fba00caa507cf07afa43a4b7566 Mon Sep 17 00:00:00 2001
From: Anna Shors <71393111+ashors1@users.noreply.github.com>
Date: Sat, 7 Sep 2024 09:38:37 -0700
Subject: [PATCH] [NeMo-UX] Checkpointing fixes (#10376)

* remove save_best_model from default logger

Signed-off-by: ashors1

* fix broken checkpoint restore

Signed-off-by: ashors1

* fix fsdp

Signed-off-by: ashors1

* rename weights path to avoid confusion

Signed-off-by: ashors1

* Revert "rename weights path to avoid confusion". We'll add this in a separate PR

This reverts commit 72bae8bdf4dd7444d549cdcc1ed48ac5fb33c0de.

---------

Signed-off-by: ashors1
---
 nemo/collections/llm/recipes/log/default.py            | 1 -
 nemo/lightning/pytorch/strategies/fsdp_strategy.py     | 2 +-
 nemo/lightning/pytorch/strategies/megatron_strategy.py | 2 +-
 3 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/nemo/collections/llm/recipes/log/default.py b/nemo/collections/llm/recipes/log/default.py
index 4d5e9223b535..94e595bdb811 100644
--- a/nemo/collections/llm/recipes/log/default.py
+++ b/nemo/collections/llm/recipes/log/default.py
@@ -32,7 +32,6 @@ def default_log(
 ) -> Config[nl.NeMoLogger]:
     ckpt = Config(
         nl.ModelCheckpoint,
-        save_best_model=False,
         save_last=True,
         save_top_k=10,
         every_n_train_steps=200,
diff --git a/nemo/lightning/pytorch/strategies/fsdp_strategy.py b/nemo/lightning/pytorch/strategies/fsdp_strategy.py
index 2a210c9bd7f0..d34d1716e6b4 100644
--- a/nemo/lightning/pytorch/strategies/fsdp_strategy.py
+++ b/nemo/lightning/pytorch/strategies/fsdp_strategy.py
@@ -216,7 +216,7 @@ def save_checkpoint(
             and self.trainer.state.fn == TrainerFn.FITTING
             and self.ckpt_save_optimizer
         ):
-            del checkpoint["optimizer_states"]
+            checkpoint["optimizer_states"] = {}
             checkpoint['optimizer'] = get_optimizer_state_dict(self.model, self.optimizers)
             pyt_to_mcore_state_dict(checkpoint['optimizer']['state'], prefix="optimizer.state.")
 
diff --git a/nemo/lightning/pytorch/strategies/megatron_strategy.py b/nemo/lightning/pytorch/strategies/megatron_strategy.py
index 4bf8c42ece02..3a0a0368bcef 100644
--- a/nemo/lightning/pytorch/strategies/megatron_strategy.py
+++ b/nemo/lightning/pytorch/strategies/megatron_strategy.py
@@ -634,7 +634,7 @@ def save_checkpoint(
             and self.trainer.state.fn == TrainerFn.FITTING
             and self.ckpt_save_optimizer
         ):
-            del checkpoint["optimizer_states"]
+            checkpoint["optimizer_states"] = {}
             checkpoint["optimizer"] = [self.optimizer_sharded_state_dict()]
 
         self.checkpoint_io.save_checkpoint(checkpoint, filepath, storage_options=storage_options)
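
Note on the default.py hunk: with save_best_model dropped from the default logger
recipe, the default ModelCheckpoint config reduces to the keyword arguments kept
by the diff. A minimal sketch of building that config, assuming the nemo_run
Config wrapper and the `nl` alias for nemo.lightning used in NeMo recipes (the
import paths are assumptions, not part of this patch):

    from nemo import lightning as nl
    from nemo_run import Config

    # Default checkpoint settings kept by the diff; save_best_model is gone.
    ckpt = Config(
        nl.ModelCheckpoint,
        save_last=True,           # always refresh a "last" checkpoint for resuming
        save_top_k=10,            # retain the 10 best checkpoints
        every_n_train_steps=200,  # save cadence, in training steps
    )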
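
Note on the two strategy hunks: the commit bullets say the earlier
`del checkpoint["optimizer_states"]` broke checkpoint restore. Both strategies
store the real optimizer state under a separate "optimizer" key in Megatron's
sharded format, and the likely failure mode is a restore path that still indexes
"optimizer_states"; keeping the key as an empty dict avoids that. A minimal
sketch of the pattern in plain Python; sharded_optimizer_state() and
restore_optimizers() are hypothetical stand-ins, not NeMo APIs:

    def sharded_optimizer_state() -> dict:
        # Hypothetical stand-in for MegatronStrategy.optimizer_sharded_state_dict().
        return {"state": {}, "param_groups": []}

    def save_checkpoint(checkpoint: dict) -> dict:
        # Mirror of the patched branch: keep the key but empty it; the real
        # optimizer state goes under "optimizer" in sharded form.
        checkpoint["optimizer_states"] = {}
        checkpoint["optimizer"] = [sharded_optimizer_state()]
        return checkpoint

    def restore_optimizers(checkpoint: dict) -> None:
        # A restore path that indexes the key directly. With `del`, this would
        # raise KeyError on resume; with an empty dict it is a harmless no-op.
        for state in checkpoint["optimizer_states"]:
            print("restoring optimizer state:", state)

    ckpt = save_checkpoint({"state_dict": {}, "optimizer_states": [{"momenta": {}}]})
    restore_optimizers(ckpt)  # runs cleanly: key exists, loop body never executes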