Add vlm nemo run scripts (#11394)
* update recipe

Signed-off-by: yaoyu-33 <[email protected]>

* fix mllama mock ds

Signed-off-by: yaoyu-33 <[email protected]>

* update to use attention bias

Signed-off-by: yaoyu-33 <[email protected]>

* remove example

Signed-off-by: yaoyu-33 <[email protected]>

* Apply isort and black reformatting

Signed-off-by: yaoyu-33 <[email protected]>

* fix docstring mock.py

Signed-off-by: yaoyu-33 <[email protected]>

* fix docstring language.py

Signed-off-by: yaoyu-33 <[email protected]>

* Apply isort and black reformatting

Signed-off-by: yaoyu-33 <[email protected]>

* fix docstring language.py

Signed-off-by: yaoyu-33 <[email protected]>

* Apply isort and black reformatting

Signed-off-by: yaoyu-33 <[email protected]>

* fix docstring mllama/base.py

Signed-off-by: yaoyu-33 <[email protected]>

* Apply isort and black reformatting

Signed-off-by: yaoyu-33 <[email protected]>

* Apply isort and black reformatting

Signed-off-by: yaoyu-33 <[email protected]>

* fix docstring mllama/language.py

Signed-off-by: yaoyu-33 <[email protected]>

* bump mcore

Signed-off-by: Oliver Koenig <[email protected]>

* Add scripts for mllama

Signed-off-by: yaoyu-33 <[email protected]>

* fix

Signed-off-by: yaoyu-33 <[email protected]>

* Apply isort and black reformatting

Signed-off-by: yaoyu-33 <[email protected]>

* update script

Signed-off-by: yaoyu-33 <[email protected]>

* fix pylint

Signed-off-by: yaoyu-33 <[email protected]>

* revert Dockerfile.ci

Signed-off-by: Yu Yao <[email protected]>

* add scripts

Signed-off-by: yaoyu-33 <[email protected]>

* add vlm training test in ci

Signed-off-by: yaoyu-33 <[email protected]>

* Apply isort and black reformatting

Signed-off-by: yaoyu-33 <[email protected]>

* fix docstring issues

Signed-off-by: yaoyu-33 <[email protected]>

* update script match recipe

Signed-off-by: yaoyu-33 <[email protected]>

* update recipes

Signed-off-by: yaoyu-33 <[email protected]>

* Update mllama_train.py

Signed-off-by: Yu Yao <[email protected]>

* update mllama 90b recipe

Signed-off-by: yaoyu-33 <[email protected]>

* update to use tmp in ci tests

Signed-off-by: yaoyu-33 <[email protected]>

* update default llava config

Signed-off-by: yaoyu-33 <[email protected]>

* add nemo run scripts

Signed-off-by: yaoyu-33 <[email protected]>

* fix vpp issue

Signed-off-by: yaoyu-33 <[email protected]>

* Apply isort and black reformatting

Signed-off-by: yaoyu-33 <[email protected]>

* fix cicd

Signed-off-by: yaoyu-33 <[email protected]>

* fix cicd

Signed-off-by: yaoyu-33 <[email protected]>

* Apply isort and black reformatting

Signed-off-by: yaoyu-33 <[email protected]>

* remove duplicated script

Signed-off-by: yaoyu-33 <[email protected]>

* ci: Add HF cache

Signed-off-by: oliver könig <[email protected]>

* update to use SP in recipe

Signed-off-by: yaoyu-33 <[email protected]>

* Apply isort and black reformatting

Signed-off-by: yaoyu-33 <[email protected]>

* fix

Signed-off-by: yaoyu-33 <[email protected]>

* upgrade

Signed-off-by: yaoyu-33 <[email protected]>

* Revert "upgrade"

This reverts commit f6ad2cd.

* update neva api

Signed-off-by: yaoyu-33 <[email protected]>

* update neva api

Signed-off-by: yaoyu-33 <[email protected]>

* fix neva processing

Signed-off-by: yaoyu-33 <[email protected]>

* fix lint

Signed-off-by: yaoyu-33 <[email protected]>

* Apply isort and black reformatting

Signed-off-by: yaoyu-33 <[email protected]>

* fix data fields

Signed-off-by: yaoyu-33 <[email protected]>

* few fixes

Signed-off-by: yaoyu-33 <[email protected]>

---------

Signed-off-by: yaoyu-33 <[email protected]>
Signed-off-by: yaoyu-33 <[email protected]>
Signed-off-by: Oliver Koenig <[email protected]>
Signed-off-by: Yu Yao <[email protected]>
Signed-off-by: oliver könig <[email protected]>
Co-authored-by: yaoyu-33 <[email protected]>
Co-authored-by: Oliver Koenig <[email protected]>
3 people authored Dec 11, 2024
1 parent d31653f commit 1dd53c3
Showing 11 changed files with 157 additions and 9 deletions.
2 changes: 2 additions & 0 deletions nemo/collections/multimodal/data/energon/base.py
@@ -92,6 +92,8 @@ def __init__(
self.decoder_seq_length = decoder_seq_length
self.micro_batch_size = micro_batch_size
self.global_batch_size = global_batch_size
self.micro_batch_size = micro_batch_size
self.global_batch_size = global_batch_size
self.num_workers = num_workers
self.pin_memory = pin_memory
self.multimodal_sample_config = multimodal_sample_config
2 changes: 2 additions & 0 deletions nemo/collections/vlm/mllama/data/lazy.py
@@ -205,6 +205,8 @@ def __init__(
self.data_config = data_config
self.seq_length = seq_length
self.decoder_seq_length = decoder_seq_length
self.micro_batch_size = micro_batch_size
self.global_batch_size = global_batch_size
self.tokenizer = tokenizer
self.image_processor = image_processor
self.num_train_samples = num_train_samples
2 changes: 2 additions & 0 deletions nemo/collections/vlm/mllama/data/mock.py
@@ -66,6 +66,8 @@ def __init__(
super().__init__()
self.seq_length = seq_length
self.decoder_seq_length = decoder_seq_length
self.micro_batch_size = micro_batch_size
self.global_batch_size = global_batch_size
self.num_train_samples = num_train_samples
self.num_val_samples = num_val_samples
self.num_test_samples = num_test_samples
4 changes: 2 additions & 2 deletions nemo/collections/vlm/neva/data/lazy.py
@@ -442,8 +442,6 @@ def collate_fn(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
if media_type == 'image':
media = [instance.pop('image') for instance in instances]
media = torch.cat(media, dim=0)
if media.size(0) == 0:
media = None
elif media_type == 'video':
media = [instance.pop('video', None) for instance in instances]
else:
@@ -525,6 +523,8 @@ def __init__(
self.data_config = data_config
self.seq_length = seq_length
self.decoder_seq_length = decoder_seq_length
self.micro_batch_size = micro_batch_size
self.global_batch_size = global_batch_size
self.tokenizer = tokenizer
self.image_processor = image_processor
self.num_train_samples = num_train_samples
2 changes: 2 additions & 0 deletions nemo/collections/vlm/neva/data/mock.py
@@ -46,6 +46,8 @@ def __init__(
super().__init__()
self.seq_length = seq_length
self.decoder_seq_len = decoder_seq_length
self.micro_batch_size = micro_batch_size
self.global_batch_size = global_batch_size
self.num_train_samples = num_train_samples
self.num_val_samples = num_val_samples
self.num_test_samples = num_test_samples
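The energon, mllama, and neva data modules above all gain the same two attributes: micro_batch_size and global_batch_size are now stored on the module instance. A minimal sketch of the usual NeMo 2.0 pattern these attributes feed into; the sampler import path and signature below are assumptions for illustration, not taken from this diff:

import pytorch_lightning as pl
from nemo.lightning.pytorch.plugins import MegatronDataSampler  # assumed import path


class TinyDataModule(pl.LightningDataModule):
    """Illustrative stand-in for the data modules touched in this commit."""

    def __init__(self, seq_length: int = 2048, micro_batch_size: int = 4, global_batch_size: int = 8):
        super().__init__()
        self.seq_length = seq_length
        # Keeping both sizes on the module lets recipes, logging, and resume logic read them back.
        self.micro_batch_size = micro_batch_size
        self.global_batch_size = global_batch_size
        self.data_sampler = MegatronDataSampler(  # assumed signature
            seq_len=seq_length,
            micro_batch_size=micro_batch_size,
            global_batch_size=global_batch_size,
        )
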
11 changes: 6 additions & 5 deletions nemo/collections/vlm/neva/model/base.py
@@ -320,6 +320,7 @@ def configure_model(self, tokenizer) -> "MCoreNevaModel":
self.language_transformer_config.pipeline_model_parallel_size = self.pipeline_model_parallel_size
self.language_transformer_config.context_parallel_size = self.context_parallel_size

assert "NEVA `encoder_pipeline_model_parallel_size` has bug for now. Fix will come soon."
if self.encoder_pipeline_model_parallel_size > 0:
assert self.encoder_pipeline_model_parallel_size == 1, "ViT can only live on 1 pipeline stage."
self.vision_transformer_config.pipeline_model_parallel_size = self.encoder_pipeline_model_parallel_size
@@ -334,8 +335,7 @@ def configure_model(self, tokenizer) -> "MCoreNevaModel":
model = MCoreNevaModel(
config=self,
tokenizer=tokenizer,
pre_process=ps.is_pipeline_first_stage()
or ps.get_pipeline_model_parallel_rank() == self.encoder_pipeline_model_parallel_size,
pre_process=ps.is_pipeline_first_stage(),
post_process=ps.is_pipeline_last_stage(),
add_encoder=ps.is_pipeline_first_stage(),
add_decoder=ps.is_pipeline_last_stage()
@@ -488,17 +488,19 @@ def forward(
use_inference_kv_cache = (
inference_params is not None and "image_tokens_count" in inference_params.key_value_memory_dict
)
has_images = media.shape[0] > 0
has_images = media is not None and media.shape[0] > 0

# If running inference, we can skip media token computation if they were computed already earlier for this sample.
if use_inference_kv_cache:
media_embeddings = None
elif self.add_encoder and not has_images:
vision_param = next(self.vision_model.parameters())
# If no images provided, use an empty image embeddings tensor.
media_embeddings = torch.tensor([], dtype=media.dtype, device=media.device).reshape(0, 0, 0)
media_embeddings = torch.tensor([], dtype=vision_param.dtype, device=vision_param.device).reshape(0, 0, 0)
elif self.add_encoder and has_images:
# media is in shape of (num_images_in_mbs, c, h, w)
# note num_images_in_mbs is not mbs but total images in this mbs.
media = media.to(next(self.vision_model.parameters()).dtype)
if self.vision_model_from_hf:
self.vision_model = self.vision_model.eval()
media_embeddings = self.vision_model(media, output_hidden_states=True)
@@ -507,7 +509,6 @@
] # [num_images, img_seq_len, h_vision]
else:
# TODO(yuya): MCore Clip path not yet support taking a specific layer hidden states
media = media.to(next(self.vision_model.parameters()).dtype)
media_embeddings = self.vision_model(media, num_unused_layers=-self.config.vision_feature_layer - 1)
if self._drop_vision_class_token:
class_token_len = getattr(self.vision_model, "class_token_len", 1)
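The forward() changes above harden the no-image path: media can now be None, so has_images checks for that, the empty embeddings tensor takes its dtype and device from the vision model's own parameters instead of from media, and the input cast happens before the encoder is called. A self-contained sketch of that guard with a stand-in module (not the NeVA code path):

import torch
from torch import nn

vision_model = nn.Linear(16, 16).to(torch.bfloat16)  # stand-in for the ViT encoder


def encode_media(media, vision_model):
    has_images = media is not None and media.shape[0] > 0
    if not has_images:
        vision_param = next(vision_model.parameters())
        # An empty (0, 0, 0) tensor keeps downstream shapes consistent without touching `media`.
        return torch.tensor([], dtype=vision_param.dtype, device=vision_param.device).reshape(0, 0, 0)
    media = media.to(next(vision_model.parameters()).dtype)  # cast inputs before the encoder
    return vision_model(media)


print(encode_media(None, vision_model).shape)                         # torch.Size([0, 0, 0])
print(encode_media(torch.zeros(0, 3, 336, 336), vision_model).shape)  # empty batch hits the same path
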
16 changes: 16 additions & 0 deletions nemo/collections/vlm/recipes/llava15_13b.py
@@ -18,6 +18,7 @@
import nemo_run as run
import pytorch_lightning as pl
import torch
from megatron.core.distributed import DistributedDataParallelConfig

from nemo import lightning as nl
from nemo.collections import llm, vlm
@@ -26,6 +27,8 @@
from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed
from nemo.collections.vlm.neva.data.mock import MockDataModule
from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback
from nemo.utils.exp_manager import TimingCallback

NAME = "llava15_13b"

@@ -92,7 +95,16 @@ def finetune_recipe(
tensor_model_parallel_size=1,
pipeline_model_parallel_size=1,
encoder_pipeline_model_parallel_size=0,
sequence_parallel=True,
pipeline_dtype=torch.bfloat16,
ddp=run.Config(
DistributedDataParallelConfig,
check_for_nan_in_grad=True,
grad_reduce_in_fp32=True,
overlap_grad_reduce=True,
overlap_param_gather=True,
average_in_collective=True,
),
)

trainer = run.Config(
@@ -107,6 +119,10 @@
plugins=bf16_mixed(),
strategy=strategy,
val_check_interval=1000,
callbacks=[
run.Config(TimingCallback),
run.Config(MegatronCommOverlapCallback, tp_comm_overlap=True),
],
)

recipe = run.Partial(
16 changes: 15 additions & 1 deletion nemo/collections/vlm/recipes/llava15_7b.py
@@ -18,6 +18,7 @@
import nemo_run as run
import pytorch_lightning as pl
import torch
from megatron.core.distributed import DistributedDataParallelConfig

from nemo import lightning as nl
from nemo.collections import llm, vlm
@@ -26,6 +27,7 @@
from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed
from nemo.collections.vlm.neva.data.mock import MockDataModule
from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback
from nemo.utils.exp_manager import TimingCallback

NAME = "llava15_7b"
@@ -93,7 +95,16 @@ def finetune_recipe(
tensor_model_parallel_size=1,
pipeline_model_parallel_size=1,
encoder_pipeline_model_parallel_size=0,
sequence_parallel=True,
pipeline_dtype=torch.bfloat16,
ddp=run.Config(
DistributedDataParallelConfig,
check_for_nan_in_grad=True,
grad_reduce_in_fp32=True,
overlap_grad_reduce=True,
overlap_param_gather=True,
average_in_collective=True,
),
)

trainer = run.Config(
@@ -108,7 +119,10 @@
plugins=bf16_mixed(),
strategy=strategy,
val_check_interval=1000,
callbacks=[run.Config(TimingCallback)],
callbacks=[
run.Config(TimingCallback),
run.Config(MegatronCommOverlapCallback, tp_comm_overlap=True),
],
)

recipe = run.Partial(
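Both the 7B and 13B LLaVA recipes now default to sequence parallelism, a bf16 pipeline dtype, Megatron DDP with gradient-reduce/param-gather overlap, and the TP communication-overlap callback. A hedged sketch of consuming the updated recipe and dialing those knobs back for a small debug run; the dotted override paths are assumptions based on the configs shown above:

from nemo.collections import vlm

recipe = vlm.llava15_7b.finetune_recipe(
    name="llava15_7b_debug", num_nodes=1, num_gpus_per_node=8, peft_scheme="none"
)
recipe.trainer.strategy.sequence_parallel = False          # assumed override path
recipe.trainer.strategy.ddp.overlap_param_gather = False   # DistributedDataParallelConfig field
recipe.trainer.callbacks[1].tp_comm_overlap = False        # MegatronCommOverlapCallback added above
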
54 changes: 54 additions & 0 deletions scripts/vlm/mllama_nemo_run.py
@@ -0,0 +1,54 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import nemo_run as run

from nemo.collections import vlm


def configure_recipe(nodes: int = 1, gpus_per_node: int = 1):
# pylint: disable=C0115,C0116
recipe = vlm.mllama_11b.finetune_recipe(
dir="/checkpoints/mllama", # Path to store checkpoints
name="mllama",
num_nodes=nodes,
num_gpus_per_node=gpus_per_node,
peft_scheme="lora",
)
recipe.trainer.max_steps = 100
recipe.trainer.val_check_interval = 100
return recipe


def local_executor_torchrun(nodes: int = 1, devices: int = 1) -> run.LocalExecutor:
# pylint: disable=C0115,C0116
# Env vars for jobs are configured here
env_vars = {}

executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars)

return executor


def run_training():
# pylint: disable=C0115,C0116
recipe = configure_recipe()
executor = local_executor_torchrun(nodes=recipe.trainer.num_nodes, devices=recipe.trainer.devices)

run.run(recipe, executor=executor)


# This condition is necessary for the script to be compatible with Python's multiprocessing module.
if __name__ == "__main__":
run_training()
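The script above pins configure_recipe() to a single GPU. Because both helpers already accept node and device counts, a small wrapper (a sketch that reuses only the functions defined in this file) scales the same run out without further edits:

def run_training_multi_gpu(nodes: int = 1, gpus_per_node: int = 8):
    # pylint: disable=C0116
    recipe = configure_recipe(nodes=nodes, gpus_per_node=gpus_per_node)
    executor = local_executor_torchrun(nodes=nodes, devices=gpus_per_node)
    run.run(recipe, executor=executor)
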
2 changes: 1 addition & 1 deletion scripts/vlm/neva_finetune.py
@@ -111,7 +111,7 @@ def main(args):
ddp=DistributedDataParallelConfig(
check_for_nan_in_grad=True,
grad_reduce_in_fp32=True,
overlap_grad_reduce=True,
overlap_grad_reduce=False,
overlap_param_gather=True,
average_in_collective=True,
),
55 changes: 55 additions & 0 deletions scripts/vlm/neva_nemo_run.py
@@ -0,0 +1,55 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import nemo_run as run

from nemo.collections import vlm


def configure_recipe(nodes: int = 1, gpus_per_node: int = 8):
# pylint: disable=C0115,C0116
recipe = vlm.llava15_7b.finetune_recipe(
dir="/checkpoints/llava", # Path to store checkpoints
name="llava_ft",
num_nodes=nodes,
num_gpus_per_node=gpus_per_node,
peft_scheme="none",
)
recipe.trainer.max_steps = 100
recipe.trainer.val_check_interval = 100
recipe.model.config.freeze_vision_model = True
return recipe


def local_executor_torchrun(nodes: int = 1, devices: int = 8) -> run.LocalExecutor:
# pylint: disable=C0115,C0116
# Env vars for jobs are configured here
env_vars = {}

executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars)

return executor


def run_training():
# pylint: disable=C0115,C0116
recipe = configure_recipe()
executor = local_executor_torchrun(nodes=recipe.trainer.num_nodes, devices=recipe.trainer.devices)

run.run(recipe, executor=executor)


# This condition is necessary for the script to be compatible with Python's multiprocessing module.
if __name__ == "__main__":
run_training()
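configure_recipe() above disables PEFT and freezes the vision tower. A hedged variation showing the LoRA path that the mllama script uses, with the vision encoder left trainable; the values are illustrative, not defaults from the recipe:

def configure_lora_recipe(nodes: int = 1, gpus_per_node: int = 8):
    # pylint: disable=C0116
    recipe = vlm.llava15_7b.finetune_recipe(
        dir="/checkpoints/llava",
        name="llava_lora_ft",
        num_nodes=nodes,
        num_gpus_per_node=gpus_per_node,
        peft_scheme="lora",  # "none" and "lora" are the schemes used in these scripts
    )
    recipe.trainer.max_steps = 100
    recipe.model.config.freeze_vision_model = False  # train the vision encoder as well
    return recipe
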
