Commit
* update recipe
* fix mllama mock ds
* update to use attention bias
* remove example
* Apply isort and black reformatting
* fix docstring mock.py
* fix docstring language.py
* Apply isort and black reformatting
* fix docstring language.py
* Apply isort and black reformatting
* fix docstring mllama/base.py
* Apply isort and black reformatting
* Apply isort and black reformatting
* fix docstring mllama/language.py
* bump mcore
* Add scripts for mllama
* fix
* Apply isort and black reformatting
* update script
* fix pylint
* revert Dockerfile.ci
* add scripts
* add vlm training test in ci
* Apply isort and black reformatting
* fix docstring issues
* update script match recipe
* update recipes
* Update mllama_train.py
* update mllama 90b recipe
* update to use tmp in ci tests
* update default llava config
* add nemo run scripts
* fix vpp issue
* Apply isort and black reformatting
* fix cicd
* fix cicd
* Apply isort and black reformatting
* remove duplicated script
* ci: Add HF cache
* update to use SP in recipe
* Apply isort and black reformatting
* fix
* upgrade
* Revert "upgrade" (reverts commit f6ad2cd)
* update neva api
* update neva api
* fix neva processing
* fix lint
* Apply isort and black reformatting
* fix data fields
* few fixes

---------

Signed-off-by: yaoyu-33 <[email protected]>
Signed-off-by: yaoyu-33 <[email protected]>
Signed-off-by: Oliver Koenig <[email protected]>
Signed-off-by: Yu Yao <[email protected]>
Signed-off-by: oliver könig <[email protected]>
Co-authored-by: yaoyu-33 <[email protected]>
Co-authored-by: Oliver Koenig <[email protected]>
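A couple of the bullets above touch recipe-level parallelism ("update to use SP in recipe", "fix vpp issue"). As a rough, hedged illustration only — the strategy attribute names below follow common NeMo 2.0 recipe conventions and are assumptions, not taken from this diff — enabling sequence parallelism on a finetune recipe typically means overriding fields on the trainer strategy before launching:

# Hypothetical sketch (attribute names assumed, not shown in this commit)
from nemo.collections import vlm

recipe = vlm.mllama_11b.finetune_recipe(name="mllama", num_nodes=1, num_gpus_per_node=8)
# Sequence parallelism ("SP") is normally paired with tensor parallelism > 1.
recipe.trainer.strategy.tensor_model_parallel_size = 2
recipe.trainer.strategy.sequence_parallel = True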
1 parent d31653f · commit 1dd53c3
Showing 11 changed files with 157 additions and 9 deletions.
@@ -0,0 +1,54 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import nemo_run as run

from nemo.collections import vlm


def configure_recipe(nodes: int = 1, gpus_per_node: int = 1):
    # pylint: disable=C0115,C0116
    recipe = vlm.mllama_11b.finetune_recipe(
        dir="/checkpoints/mllama",  # Path to store checkpoints
        name="mllama",
        num_nodes=nodes,
        num_gpus_per_node=gpus_per_node,
        peft_scheme="lora",
    )
    recipe.trainer.max_steps = 100
    recipe.trainer.val_check_interval = 100
    return recipe


def local_executor_torchrun(nodes: int = 1, devices: int = 1) -> run.LocalExecutor:
    # pylint: disable=C0115,C0116
    # Env vars for jobs are configured here
    env_vars = {}

    executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars)

    return executor


def run_training():
    # pylint: disable=C0115,C0116
    recipe = configure_recipe()
    executor = local_executor_torchrun(nodes=recipe.trainer.num_nodes, devices=recipe.trainer.devices)

    run.run(recipe, executor=executor)


# This condition is necessary for the script to be compatible with Python's multiprocessing module.
if __name__ == "__main__":
    run_training()
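The mllama script above is a LoRA finetuning launcher built on nemo_run: configure_recipe builds the NeMo 2.0 recipe, local_executor_torchrun wraps torchrun in a LocalExecutor, and run.run ties them together. The env_vars dict is left as an empty placeholder ("Env vars for jobs are configured here"); a hedged sketch of what one might put there follows — the specific variables and paths are illustrative assumptions, not part of this commit:

# Illustrative only: environment variables one might pass to the executor.
env_vars = {
    "CUDA_VISIBLE_DEVICES": "0",  # pin the job to a single GPU
    "HF_HOME": "/tmp/hf_cache",   # Hugging Face cache location (cf. "ci: Add HF cache" above)
}
executor = run.LocalExecutor(ntasks_per_node=1, launcher="torchrun", env_vars=env_vars)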
@@ -0,0 +1,55 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import nemo_run as run

from nemo.collections import vlm


def configure_recipe(nodes: int = 1, gpus_per_node: int = 8):
    # pylint: disable=C0115,C0116
    recipe = vlm.llava15_7b.finetune_recipe(
        dir="/checkpoints/llava",  # Path to store checkpoints
        name="llava_ft",
        num_nodes=nodes,
        num_gpus_per_node=gpus_per_node,
        peft_scheme="none",
    )
    recipe.trainer.max_steps = 100
    recipe.trainer.val_check_interval = 100
    recipe.model.config.freeze_vision_model = True
    return recipe


def local_executor_torchrun(nodes: int = 1, devices: int = 8) -> run.LocalExecutor:
    # pylint: disable=C0115,C0116
    # Env vars for jobs are configured here
    env_vars = {}

    executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars)

    return executor


def run_training():
    # pylint: disable=C0115,C0116
    recipe = configure_recipe()
    executor = local_executor_torchrun(nodes=recipe.trainer.num_nodes, devices=recipe.trainer.devices)

    run.run(recipe, executor=executor)


# This condition is necessary for the script to be compatible with Python's multiprocessing module.
if __name__ == "__main__":
    run_training()
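The llava script mirrors the mllama one but builds a different recipe: full finetuning (peft_scheme="none") with the vision encoder frozen via freeze_vision_model = True. For comparison, here is a hedged sketch of the same recipe configured for LoRA instead, assuming llava15_7b.finetune_recipe accepts the same peft_scheme values as the mllama recipe used earlier; the run name is hypothetical:

def configure_lora_recipe(nodes: int = 1, gpus_per_node: int = 8):
    # Variant of configure_recipe above: LoRA adapters instead of full finetuning.
    recipe = vlm.llava15_7b.finetune_recipe(
        dir="/checkpoints/llava",  # Path to store checkpoints
        name="llava_lora_ft",      # hypothetical run name
        num_nodes=nodes,
        num_gpus_per_node=gpus_per_node,
        peft_scheme="lora",
    )
    recipe.trainer.max_steps = 100
    recipe.trainer.val_check_interval = 100
    return recipe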