diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index 474a9a80915e..6b2470791a86 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -2937,7 +2937,7 @@ jobs:
     with:
       RUNNER: self-hosted-azure-gpus-2-h100
       SCRIPT: |
-        CUDA_DEVICE_MAX_CONNECTIONS=1 python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
+        CUDA_DEVICE_MAX_CONNECTIONS=1 NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=1 python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
         trainer.devices=2 \
         trainer.log_every_n_steps=1 \
         trainer.max_epochs=9999 \
@@ -2965,7 +2965,6 @@ jobs:
         +model.tp_comm_overlap_ag=False \
         +model.tp_comm_overlap_rs=False \
         +model.tp_comm_overlap_disable_qkv=True \
-        +model.attention_backend="unfused" \
         model.peft.peft_scheme="lora" \
         model.peft.lora_tuning.adapter_dim=16 \
         model.peft.lora_tuning.alpha=32 \
@@ -4354,7 +4353,7 @@ jobs:
     with:
      RUNNER: self-hosted-azure
      SCRIPT: |
-        python3 tests/collections/llm/megatron_mixtral_pretraining.py \
+        NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python3 tests/collections/llm/megatron_mixtral_pretraining.py \
        --experiment-dir=/tmp/mixtral_pretrain_results \
        --data-path=/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document
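
The NVTE_* variables threaded through this PR are Transformer Engine's runtime switches for selecting an attention implementation. A minimal sketch of the pattern, ours rather than a NeMo helper, which must run before any TE attention executes:

    import os

    def force_te_attention_backend(flash: bool, fused: bool) -> None:
        # Transformer Engine consults these flags when choosing an attention
        # backend; "0" rules the corresponding implementation out.
        os.environ["NVTE_FLASH_ATTN"] = "1" if flash else "0"
        os.environ["NVTE_FUSED_ATTN"] = "1" if fused else "0"

    # The Mixtral pretraining job above disables both, which leaves
    # Transformer Engine's unfused (framework) attention path:
    force_te_attention_backend(flash=False, fused=False)
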
diff --git a/.github/workflows/import-test.yml b/.github/workflows/import-test.yml
index 47d4657dfe4f..3af15294b2a2 100644
--- a/.github/workflows/import-test.yml
+++ b/.github/workflows/import-test.yml
@@ -1,52 +1,73 @@
 name: CI-Import-Check

 on:
+  push:
   pull_request:
     paths:
       - "**"

 # Check https://hub.docker.com/r/pytorch/pytorch/tags for latest tags
 jobs:
-  test-imports:
-    name: test-${{ matrix.collection }}-import-${{ matrix.os }}-py${{ matrix.python }}
-    runs-on: ${{ matrix.os }}
-    strategy:
-      fail-fast: false
-      matrix:
-        os: [ubuntu-latest, macos-latest]
-        collection:
-          - asr
-          # - nlp # Currently broken
-          - tts
-        python: ['3.10', '3.11', '3.12']
+
+  test-asr-imports:
+    runs-on: ubuntu-latest
+    container:
+      image: pytorch/pytorch:2.4.0-cuda11.8-cudnn9-runtime
     steps:
       - name: Checkout repo
         uses: actions/checkout@v2
-      - uses: actions/setup-python@v5
-        with:
-          python-version: '${{ matrix.python }}'
-      - name: Build wheel
+      - name: Update base dependencies
+        run: |
+          apt-get update && apt-get install -y build-essential
+          apt-get install -y libsndfile1 make
+      - name: Install nemo dependencies
         id: nemo-wheel
         run: |
+          pip install Cython
+          # install test requirements
+          pip install -r requirements/requirements_test.txt
           # Build nemo as a wheel
           pip install build
-          python -m build --wheel
-
+          python -m build --no-isolation --wheel
           # Preserve wheel location
           DIST_FILE=$(find ./dist -name "*.whl" | head -n 1)
-          echo "DIST_FILE=${DIST_FILE}" | tee -a "$GITHUB_OUTPUT"
-
-      - name: Install NeMo + test dependencies
+          echo "::set-output name=DIST_FILE::${DIST_FILE}"
+      - name: Test ASR Domain Imports
+        run: |
+          # Install NeMo Domain
+          pip install "${{ steps.nemo-wheel.outputs.DIST_FILE }}[asr]"
+          # Run import checks
+          python tests/core_ptl/check_imports.py --domain "asr"
+          # Uninstall NeMo
+          pip uninstall -y nemo_toolkit
+  test-tts-imports:
+    runs-on: ubuntu-latest
+    container:
+      image: pytorch/pytorch:2.4.0-cuda11.8-cudnn9-runtime
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v2
+      - name: Update base dependencies
         run: |
+          apt-get update && apt-get install -y build-essential
+          apt-get install -y libsndfile1 make
+      - name: Install nemo dependencies
+        id: nemo-wheel
+        run: |
+          pip install Cython
           # install test requirements
           pip install -r requirements/requirements_test.txt
-
-          # Install NeMo Domain
-          pip install "${{ steps.nemo-wheel.outputs.DIST_FILE }}[test,${{ matrix.collection }}]"
-
-      - name: Run ${{ matrix.collection }} checks
+          # Build nemo as a wheel
+          pip install build
+          python -m build --no-isolation --wheel
+          # Preserve wheel location
+          DIST_FILE=$(find ./dist -name "*.whl" | head -n 1)
+          echo "::set-output name=DIST_FILE::${DIST_FILE}"
+      - name: Test TTS Domain Imports
         run: |
+          # Install NeMo Domain
+          pip install "${{ steps.nemo-wheel.outputs.DIST_FILE }}[tts]"
           # Run import checks
-          python tests/core_ptl/check_imports.py --domain "${{ matrix.collection }}"
-
-
\ No newline at end of file
+          python tests/core_ptl/check_imports.py --domain "tts"
+          # Uninstall NeMo
+          pip uninstall -y nemo_toolkit
diff --git a/Dockerfile.ci b/Dockerfile.ci
index 88378b780656..e93d00d03195 100644
--- a/Dockerfile.ci
+++ b/Dockerfile.ci
@@ -34,12 +34,17 @@ EOF
 WORKDIR /workspace

 # Install Mamba Dependancy
-ARG CAUSAL_CONV_TAG=v1.2.2.post1
-ARG MAMBA_TAG=v2.2.0
+ARG CAUSAL_CONV_TAG=v1.2.2.post1
 RUN <<"EOF" bash -ex
 # Mamba dependancy installation
-MAMBA_FORCE_BUILD=TRUE CAUSAL_CONV1D_FORCE_BUILD=TRUE pip3 install --no-cache-dir -v git+https://github.com/Dao-AILab/causal-conv1d.git@${CAUSAL_CONV_TAG} git+https://github.com/state-spaces/mamba.git@${MAMBA_TAG}
+
+git clone --depth 1 --branch ${CAUSAL_CONV_TAG} https://github.com/Dao-AILab/causal-conv1d && \
+  cd causal-conv1d && \
+  python setup.py install && \
+  cd .. && \
+  rm -rf causal-conv1d
+
 EOF

 RUN pip install hatchling   # needed to install nemo-run
@@ -49,6 +54,8 @@ RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMO_RUN_T
 # Install NeMo requirements
 ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
 ARG MODELOPT_VERSION=0.21.0
+ARG MCORE_TAG=bd677bfb13ac2f19deaa927adc6da6f9201d66aa
+
 ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
 RUN \
   --mount=type=bind,source=requirements,target=requirements \
@@ -58,6 +65,7 @@ RUN \
   --mount=type=bind,source=nemo/__init__.py,target=nemo/__init__.py <<"EOF" bash -ex
 pip install --no-cache-dir --no-build-isolation --extra-index-url https://pypi.nvidia.com \
 "transformer-engine @ git+https://github.com/NVIDIA/TransformerEngine.git@${TE_TAG}" \
+"megatron_core @ git+https://github.com/NVIDIA/Megatron-LM.git@${MCORE_TAG}" \
 "nvidia-modelopt[torch]~=${MODELOPT_VERSION}" \
 "apex @ git+https://github.com/NVIDIA/apex.git@${APEX_TAG}" \
 "unstructured==0.14.9" \
@@ -65,15 +73,15 @@ pip install --no-cache-dir --no-build-isolation --extra-index-url https://pypi.n
 "onnxscript @ git+https://github.com/microsoft/onnxscript" \
 -r tools/ctc_segmentation/requirements.txt \
 ".[all]"
-EOF
-
-ARG MCORE_TAG=4dc8977167d71f86bdec47a60a98e85c4cfa0031
-RUN <<"EOF" bash -ex
-# Megatron-LM installation
-git clone https://github.com/NVIDIA/Megatron-LM.git
-pushd Megatron-LM
-git checkout ${MCORE_TAG}
-pip install -e .
+# Megatron Core installation
+git clone https://github.com/NVIDIA/Megatron-LM.git && \
+pushd Megatron-LM && \
+git checkout ${MCORE_TAG} && \
+  pushd megatron/core/datasets && \
+  make && \
+  popd && \
+popd

 export PYTHONPATH="${PYTHONPATH}:/workspace/Megatron-LM"

 # Install nvidia-resiliency-ext
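
With this Dockerfile change, megatron_core is pip-installed from the pinned MCORE_TAG while a source clone of the same tag stays on PYTHONPATH (only its dataset helpers are compiled with make). An optional sanity check, ours rather than part of the image build, to see which copy actually resolves inside the image:

    # Print the resolved location of megatron.core (pip site-packages vs. /workspace clone).
    import megatron.core
    print(megatron.core.__file__)
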
diff --git a/docs/source/nlp/information_retrieval.rst b/docs/source/nlp/information_retrieval.rst
index 69f1c3219093..26732283e8f4 100644
--- a/docs/source/nlp/information_retrieval.rst
+++ b/docs/source/nlp/information_retrieval.rst
@@ -70,7 +70,9 @@ Then you can fine-tune the sentence-BERT model using the following script:
     VALIDATION_DATASET_PATH= # Path to validation dataset
     SAVE_DIR= # where the checkpoint and logs are saved
     mkdir -p $SAVE_DIR
+    export NVTE_FLASH_ATTN=0
     export NVTE_ALLOW_NONDETERMINISTIC_ALGO=0
+    export NVTE_FUSED_ATTN=0

     python NeMo/examples/nlp/information_retrieval/megatron_bert_embedding_finetuning.py \
     --config-path=${CONFIG_PATH} \
@@ -85,7 +87,6 @@ Then you can fine-tune the sentence-BERT model using the following script:
     model.post_process=False \
     model.global_batch_size=8 \ # should be NUM_DEVICES * model.micro_batch_size
     model.micro_batch_size=8 \
-    model.attention_backend="unfused" \
     model.optim.lr=0.000005 \
     model.optim.sched.min_lr=0.00000001 \
     model.optim.sched.warmup_steps=100 \
diff --git a/nemo/collections/diffusion/scripts/train.sh b/nemo/collections/diffusion/scripts/train.sh
index ced479e32526..2150458e9376 100644
--- a/nemo/collections/diffusion/scripts/train.sh
+++ b/nemo/collections/diffusion/scripts/train.sh
@@ -20,6 +20,7 @@ export WANDB_PROJECT=xxx
 export WANDB_RUN_ID=xxx
 export WANDB_RESUME=allow

+export NVTE_FUSED_ATTN=0
 export CUDA_DEVICE_MAX_CONNECTIONS=1
 export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
diff --git a/nemo/collections/llm/gpt/model/gemma.py b/nemo/collections/llm/gpt/model/gemma.py
index 4d8d541deaa8..bf828bb66277 100644
--- a/nemo/collections/llm/gpt/model/gemma.py
+++ b/nemo/collections/llm/gpt/model/gemma.py
@@ -18,7 +18,6 @@
 import torch
 from megatron.core import parallel_state
-from megatron.core.transformer.enums import AttnBackend
 from torch import nn

 from nemo.collections.llm.fn.activation import openai_gelu
@@ -54,8 +53,6 @@ class GemmaConfig(GPTConfig):
     # Legacy NeMo does not set layernorm_zero_centered_gamma and instead adds 1 in the HF -> NeMo conversion script
     # The present implementation is more in line with the official implementation
     layernorm_zero_centered_gamma: bool = True
-    # Disable cuDNN attention since TE 1.8 does not support head dim > 128
-    attention_backend: AttnBackend = AttnBackend.flash


 @dataclass
diff --git a/nemo/collections/llm/recipes/gemma_2b.py b/nemo/collections/llm/recipes/gemma_2b.py
index 64af8192929c..3b43bbdb0e62 100644
--- a/nemo/collections/llm/recipes/gemma_2b.py
+++ b/nemo/collections/llm/recipes/gemma_2b.py
@@ -51,6 +51,8 @@ def model() -> run.Config[pl.LightningModule]:
         >>> model_config = model()
         >>> print(model_config)
     """
+    # Disable cuDNN attention since TE 1.8 does not support head dim > 128
+    os.environ['NVTE_FUSED_ATTN'] = "0"
     return run.Config(GemmaModel, config=run.Config(GemmaConfig2B))
diff --git a/nemo/collections/llm/recipes/gemma_7b.py b/nemo/collections/llm/recipes/gemma_7b.py
index 2ac3419d6587..40e43bda4d5e 100644
--- a/nemo/collections/llm/recipes/gemma_7b.py
+++ b/nemo/collections/llm/recipes/gemma_7b.py
@@ -51,6 +51,8 @@ def model() -> run.Config[pl.LightningModule]:
         >>> model_config = model()
         >>> print(model_config)
     """
+    # Disable cuDNN attention since TE 1.8 does not support head dim > 128
+    os.environ['NVTE_FUSED_ATTN'] = "0"
     return run.Config(GemmaModel, config=run.Config(GemmaConfig7B))


@@ -171,6 +173,8 @@ def pretrain_recipe(
     For more details on pre-training LLMs with NeMo, see the pre-training guide
     in the `examples/llm/pretrain/` directory.
     """
+    # Disable cuDNN attention since TE 1.8 does not support head dim > 128
+    os.environ['NVTE_FUSED_ATTN'] = "0"
     return run.Partial(
         fn,
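
For context on the Gemma recipe edits above: Gemma's attention uses a head dimension of 256, and the cuDNN fused-attention path in TE 1.8 only supports head dims up to 128, hence the fallback. A minimal sketch of the ordering constraint, assuming (as in the recipes) that os is imported at module top:

    import os

    # Must be set before any Transformer Engine attention module is built or
    # run; TE reads the flag when it selects a backend.
    os.environ["NVTE_FUSED_ATTN"] = "0"

    # ...only then construct the model config, e.g.
    # run.Config(GemmaModel, config=run.Config(GemmaConfig2B))
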
diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py
index 53daf42f1a07..330f6ffee05b 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py
@@ -50,7 +50,6 @@
 try:
     from megatron.core import ModelParallelConfig, parallel_state
     from megatron.core.distributed import DistributedDataParallel as McoreDDP
-    from megatron.core.transformer.enums import AttnBackend
     from megatron.core.transformer.module import Float16Module as MCoreFloat16Module
     from megatron.core.transformer.transformer_config import TransformerConfig
     from megatron.core.utils import init_method_normal, scaled_init_method_normal
@@ -538,9 +537,6 @@ def build_transformer_config(self) -> TransformerConfig:

         tp_only_amax_red = self.cfg.get('tp_only_amax_red', False)

-        attention_backend = self.cfg.get('attention_backend', "auto")
-        attention_backend = AttnBackend[attention_backend]
-
         # any configs that are not in the nemo model config will be added here
         config_mapping = {
             'apply_query_key_layer_scaling': apply_query_key_layer_scaling,
@@ -565,7 +561,6 @@ def build_transformer_config(self) -> TransformerConfig:
             'rotary_interleaved': rotary_interleaved,
             'deallocate_pipeline_outputs': True,
             'tp_only_amax_red': tp_only_amax_red,
-            'attention_backend': attention_backend,
         }

         # populate the transformer config dict
diff --git a/nemo/collections/nlp/models/language_modeling/megatron_retro_model.py b/nemo/collections/nlp/models/language_modeling/megatron_retro_model.py
index b3fd7b11c6eb..493d512fd30e 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_retro_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_retro_model.py
@@ -76,7 +76,6 @@
     from megatron.core.models.retro.utils import get_config_path as get_retro_config_path
     from megatron.core.models.retro.utils import get_gpt_data_dir as get_retro_data_dir
     from megatron.core.pipeline_parallel.schedules import get_forward_backward_func
-    from megatron.core.transformer.enums import AttnBackend
     from megatron.core.transformer.module import Float16Module as MCoreFloat16Module
     from megatron.core.transformer.transformer_config import TransformerConfig
     from megatron.core.utils import init_method_normal, scaled_init_method_normal
@@ -432,8 +431,6 @@ def build_retro_config(self) -> RetroConfig:

         te_version = packaging.version.Version(version("transformer-engine"))
         if te_version >= packaging.version.Version("1.3"):
-            if HAVE_MEGATRON_CORE:
-                retro_config.attention_backend = AttnBackend.unfused
             try:
                 os.environ["NVTE_FLASH_ATTN"] = "0"
                 os.environ["NVTE_FUSED_ATTN"] = "0"
diff --git a/nemo/collections/vlm/mllama/model/language.py b/nemo/collections/vlm/mllama/model/language.py
index 3edc6706defb..bec3ec526f6e 100644
--- a/nemo/collections/vlm/mllama/model/language.py
+++ b/nemo/collections/vlm/mllama/model/language.py
@@ -390,7 +390,7 @@ def sharded_state_dict(
         layer_prefix = f'{prefix}layers.'
         num_layers = self.config.num_layers
         for layer in self.layers:
-            offset = layer._get_layer_offset(layer.config)
+            offset = layer._get_layer_offset()
             global_layer_offset = layer.layer_number - 1  # self.layer_number starts at 1
             state_dict_prefix = f'{layer_prefix}{global_layer_offset - offset}.'  # module list index in TransformerBlock  # pylint: disable=line-too-long
             sharded_prefix = layer_prefix
@@ -403,7 +403,7 @@
         for xlayer in self.xattn_layers:
             if isinstance(xlayer, DummyCrossAttentionTransformerLayer):
                 continue
-            offset = xlayer._get_layer_offset(xlayer.config)
+            offset = xlayer._get_layer_offset()
             global_layer_offset = xlayer.layer_number - 1
             state_dict_prefix = f'{xlayer_prefix}{global_layer_offset - offset}.'  # module list index in TransformerBlock  # pylint: disable=line-too-long
             sharded_prefix = f'{xlayer_prefix}{global_layer_offset}.'
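
A note on the two language.py hunks above: at the pinned MCORE_TAG, TransformerLayer._get_layer_offset derives the offset from the layer's own config and no longer accepts one as an argument, so both call sites drop it.
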
diff --git a/nemo/lightning/pytorch/callbacks/peft.py b/nemo/lightning/pytorch/callbacks/peft.py
index c830a5de63f6..0c559d1b3990 100644
--- a/nemo/lightning/pytorch/callbacks/peft.py
+++ b/nemo/lightning/pytorch/callbacks/peft.py
@@ -448,7 +448,7 @@ def load_checkpoint(
         if getattr(path, "base_model_path", None):
             ## PEFT Resume, FIRST TIME
             self.adapter_ckpt_path = Path(str(path))
-            adapter_ckpt = self.checkpoint_io.load_checkpoint(path, sharded_state_dict={})  # Loads only metadata
+            adapter_ckpt = self.checkpoint_io.load_checkpoint(path)  # Loads only metadata
             # path is adapter path to restore the training metadata, but switch to loading base model here.
             path = self.model_ckpt_path = path.base_model_path
         elif adapter_meta_path.exists():
diff --git a/pyproject.toml b/pyproject.toml
index af5555f9d0dc..bdddfef27dc6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -60,6 +60,9 @@ classifiers = [
   "Topic :: Utilities",
 ]

+[tool.setuptools.dynamic]
+dependencies = { file = ["requirements/requirements.txt"] }
+
 [tool.setuptools]
 py-modules = ["nemo"]
diff --git a/requirements/requirements_nlp.txt b/requirements/requirements_nlp.txt
index cd30431e8ca9..6a86dacbfefb 100644
--- a/requirements/requirements_nlp.txt
+++ b/requirements/requirements_nlp.txt
@@ -8,6 +8,7 @@ gdown
 h5py
 ijson
 jieba
+mamba-ssm==2.2.2; sys_platform == 'linux'
 markdown2
 matplotlib>=3.3.2
 #megatron_core>0.6.0 # add back once mcore on pypi is compatible again
diff --git a/scripts/checkpoint_converters/convert_bert_hf_to_nemo.py b/scripts/checkpoint_converters/convert_bert_hf_to_nemo.py
index 8265da57f656..14baca53f165 100644
--- a/scripts/checkpoint_converters/convert_bert_hf_to_nemo.py
+++ b/scripts/checkpoint_converters/convert_bert_hf_to_nemo.py
@@ -84,8 +84,6 @@ def convert(args):
     nemo_config.model = adjust_nemo_config(nemo_config.model, hf_model.config.to_dict(), mcore_bert=args.mcore)

     nemo_config.trainer["precision"] = args.precision
-    # Bert doesn't support FLASH_ATTN
-    nemo_config.model["attention_backend"] = "fused"
     trainer = MegatronTrainerBuilder(nemo_config).create_trainer()
     model = MegatronBertModel(nemo_config.model, trainer)

@@ -290,5 +288,6 @@ def convert(args):

 if __name__ == '__main__':
+    os.environ['NVTE_FLASH_ATTN'] = '0'  # Bert doesn't support FLASH_ATTN
     args = get_args()
     convert(args)
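
One detail the pyproject.toml hunk above relies on: setuptools reads [tool.setuptools.dynamic] only for fields that the [project] table lists in its dynamic array, so this assumes dynamic = ["dependencies"] is declared elsewhere in the file.
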
diff --git a/tests/collections/llm/bitexact/mixtral/pretrain_mini_mixtral.py b/tests/collections/llm/bitexact/mixtral/pretrain_mini_mixtral.py
index 654a2a9e05a8..b4f95879bad5 100644
--- a/tests/collections/llm/bitexact/mixtral/pretrain_mini_mixtral.py
+++ b/tests/collections/llm/bitexact/mixtral/pretrain_mini_mixtral.py
@@ -17,7 +17,6 @@
 import torch
 from megatron.core.distributed import DistributedDataParallelConfig as McoreDDPConfig
-from megatron.core.transformer.enums import AttnBackend
 from megatron.core.utils import init_method_normal, scaled_init_method_normal

 from nemo.collections.llm import MixtralConfig8x7B, MixtralModel, PreTrainingDataModule
@@ -103,7 +102,6 @@ def main(args):
         bias_dropout_fusion=True,
         apply_rope_fusion=True,
         distribute_saved_activations=False,
-        attention_backend=AttnBackend.unfused,
     )

     data = PreTrainingDataModule(
diff --git a/tests/collections/llm/bitexact/mixtral/run.sh b/tests/collections/llm/bitexact/mixtral/run.sh
index 0f6612b3d21b..87bf7c382b99 100644
--- a/tests/collections/llm/bitexact/mixtral/run.sh
+++ b/tests/collections/llm/bitexact/mixtral/run.sh
@@ -8,7 +8,7 @@ MCORE_OUTPUT_PATH="/tmp/bex_mixtral_mcore_output/"
 NEMO_OUTPUT_PATH="/tmp/bex_mixtral_nemo_output/"

 # Run Mcore
-CUDA_DEVICE_MAX_CONNECTIONS=1 CUDA_LAUNCH_BLOCKING=1 TORCH_COMPILE_DISABLE=1 \
+CUDA_DEVICE_MAX_CONNECTIONS=1 CUDA_LAUNCH_BLOCKING=1 TORCH_COMPILE_DISABLE=1 NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=0 \
 torchrun --nproc-per-node 1 --nnodes 1 /workspace/Megatron-LM/pretrain_gpt.py \
   --apply-layernorm-1p --rotary-percent 1.0 --rotary-base 1000000 \
   --no-position-embedding --position-embedding-type rope \
@@ -30,7 +30,7 @@ torchrun --nproc-per-node 1 --nnodes 1 /workspace/Megatron-LM/pretrain_gpt.py \
   --split 99,1,0 --log-interval 10 --save-interval 20000 --eval-interval 1000 --eval-iters 32 \
   --save "$MCORE_OUTPUT_PATH" \
   --log-num-zeros-in-grad --distributed-timeout-minutes 6000 --moe-router-topk 1 --num-experts 2 \
-  --moe-router-pre-softmax --expert-model-parallel-size 1 --eval-iters=0 --attention-backend unfused
+  --moe-router-pre-softmax --expert-model-parallel-size 1 --eval-iters=0

 # Run NeMo
 CUDA_LAUNCH_BLOCKING=1 TORCH_COMPILE_DISABLE=1 NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=0 \
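
The run.sh change keeps the bit-exact comparison meaningful: both the Mcore run above and the NeMo run below now select the same unfused attention path through NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=0 rather than the removed --attention-backend flag, so the two sides keep running identical attention kernels.
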
diff --git a/tests/collections/llm/gpt/model/test_model_import.py b/tests/collections/llm/gpt/model/test_model_import.py
index b49885718837..9edc235e454f 100644
--- a/tests/collections/llm/gpt/model/test_model_import.py
+++ b/tests/collections/llm/gpt/model/test_model_import.py
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import os
-
 import torch

 torch.set_grad_enabled(False)
@@ -97,8 +95,5 @@ def import_from_hf(config_name, hf_path):

 if __name__ == '__main__':
     for config_name, hf_id in config_name_to_hf_id.items():
-        for env_var in ['NVTE_FLASH_ATTN', 'NVTE_FUSED_ATTN', 'NVTE_UNFUSED_ATTN']:
-            if env_var in os.environ:
-                del os.environ[env_var]
         src = f'hf:///home/TestData/nemo2_ckpt/{config_name}'
         import_from_hf(config_name, src)
diff --git a/tests/collections/llm/hf/peft_nemorun.py b/tests/collections/llm/hf/peft_nemorun.py
index 3a135b2346be..ef34d4d39a11 100644
--- a/tests/collections/llm/hf/peft_nemorun.py
+++ b/tests/collections/llm/hf/peft_nemorun.py
@@ -28,6 +28,7 @@ def local_executor_torchrun(nodes: int = 1, devices: int = 2) -> run.LocalExecut
         "NCCL_NVLS_ENABLE": "0",
         "NVTE_DP_AMAX_REDUCE_INTERVAL": "0",
         "NVTE_ASYNC_AMAX_REDUCTION": "1",
+        "NVTE_FUSED_ATTN": "0",
     }

     executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars)
diff --git a/tests/collections/llm/hf/sft_nemorun.py b/tests/collections/llm/hf/sft_nemorun.py
index b559c04f6cbd..a3daa66ca774 100644
--- a/tests/collections/llm/hf/sft_nemorun.py
+++ b/tests/collections/llm/hf/sft_nemorun.py
@@ -29,6 +29,7 @@ def local_executor_torchrun(nodes: int = 1, devices: int = 2) -> run.LocalExecut
         "NCCL_NVLS_ENABLE": "0",
         "NVTE_DP_AMAX_REDUCE_INTERVAL": "0",
         "NVTE_ASYNC_AMAX_REDUCTION": "1",
+        "NVTE_FUSED_ATTN": "0",
     }

     executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars)
diff --git a/tests/collections/llm/megatron_mixtral_pretraining.py b/tests/collections/llm/megatron_mixtral_pretraining.py
index 2a7b1fdfdad6..4123c7b37987 100644
--- a/tests/collections/llm/megatron_mixtral_pretraining.py
+++ b/tests/collections/llm/megatron_mixtral_pretraining.py
@@ -18,7 +18,6 @@
 import torch
 from megatron.core.distributed import DistributedDataParallelConfig as McoreDDPConfig
-from megatron.core.transformer.enums import AttnBackend

 from nemo.collections.llm import MixtralConfig8x3B, MixtralModel, PreTrainingDataModule
 from nemo.collections.llm.api import train
@@ -118,7 +117,6 @@ def main(args):
         bf16=True,
         params_dtype=torch.bfloat16,
         pipeline_dtype=torch.bfloat16,
-        attention_backend=AttnBackend.unfused,
     )

     mixtral_config.overlap_param_gather_with_optimizer_step = True
diff --git a/tests/conftest.py b/tests/conftest.py
index 989c937ab499..118e978e63c7 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
-import os
 import os.path
 import shutil
 import tarfile
@@ -123,19 +122,6 @@ def reset_singletons():
     Singleton._Singleton__instances = {}


-@pytest.fixture(autouse=True)
-def reset_env_vars():
-    # Store the original environment variables before the test
-    original_env = dict(os.environ)
-
-    # Run the test
-    yield
-
-    # After the test, restore the original environment
-    os.environ.clear()
-    os.environ.update(original_env)
-
-
 @pytest.fixture(scope="session")
 def test_data_dir():
     """
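
With the autouse reset_env_vars fixture above removed, NVTE_* variables set by one test now persist for the rest of the session; test_nemo_resume_from_ckpt.py further below asserts them explicitly for that reason. A test that still wants isolation can opt in per test with pytest's built-in monkeypatch, a hedged sketch:

    import os

    def test_with_scoped_env(monkeypatch):
        # pytest undoes setenv when the test finishes, giving the isolation
        # the removed autouse fixture used to provide globally.
        monkeypatch.setenv("NVTE_FUSED_ATTN", "0")
        assert os.environ["NVTE_FUSED_ATTN"] == "0"
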
diff --git a/tests/core/test_exp_manager.py b/tests/core/test_exp_manager.py
index 9dbdaa66a25e..32d401b2051f 100644
--- a/tests/core/test_exp_manager.py
+++ b/tests/core/test_exp_manager.py
@@ -280,7 +280,7 @@ def test_log_dir_overrides(self, monkeypatch, tmp_path):
         assert Path(tmp_path).exists()
         assert Path(tmp_path / "test_no_name" / "default" / "957").exists()

-        monkeypatch.delenv(NEMO_ENV_VARNAME_VERSION, raising=False)
+        monkeypatch.delenv(NEMO_ENV_VARNAME_VERSION)
         # Checks that use_datetime_version False toggle works
         test_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False)
         log_dir = exp_manager(test_trainer, {"exp_dir": str(tmp_path / "test_no_name"), "use_datetime_version": False})
@@ -288,7 +288,7 @@ def test_log_dir_overrides(self, monkeypatch, tmp_path):
         assert Path(tmp_path).exists()
         assert Path(tmp_path / "test_no_name" / "default" / "version_0").exists()

-        monkeypatch.delenv(NEMO_ENV_VARNAME_VERSION, raising=False)
+        monkeypatch.delenv(NEMO_ENV_VARNAME_VERSION)
         # Checks that use_datetime_version False toggle works and version increments
         test_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False)
         log_dir = exp_manager(test_trainer, {"exp_dir": str(tmp_path / "test_no_name"), "use_datetime_version": False})
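
The delenv change above tightens the test: with the default raising=True, monkeypatch.delenv fails with KeyError if the variable is absent, so the test now also verifies that exp_manager actually exported NEMO_ENV_VARNAME_VERSION earlier. Behavior sketch:

    import pytest

    def test_delenv_semantics(monkeypatch):
        # "SOME_UNSET_VAR" is a hypothetical name assumed absent from the environment.
        with pytest.raises(KeyError):
            monkeypatch.delenv("SOME_UNSET_VAR")  # raising=True is the default
        monkeypatch.delenv("SOME_UNSET_VAR", raising=False)  # tolerated when absent
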
diff --git a/tests/lightning/test_nemo_resume_from_ckpt.py b/tests/lightning/test_nemo_resume_from_ckpt.py
index 37ea326ad621..e876e6965000 100644
--- a/tests/lightning/test_nemo_resume_from_ckpt.py
+++ b/tests/lightning/test_nemo_resume_from_ckpt.py
@@ -12,12 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
-from typing import List, Optional

 import pytest


 def set_env():
+    os.environ['NVTE_FLASH_ATTN'] = '0'
+    os.environ['NVTE_FUSED_ATTN'] = '0'
     os.environ['NVTE_APPLY_QK_LAYER_SCALING'] = '0'


@@ -27,7 +28,6 @@ def set_env():
 import pytest
 import torch
 from megatron.core.optimizer import OptimizerConfig
-from megatron.core.transformer.enums import AttnBackend

 import nemo.lightning as nl
 from nemo.collections import llm
@@ -68,8 +68,7 @@ def load_dcp(ckpt_dir, torch_tensor=True):
     return state_dict


-def compare_ckpts(a, b, path: Optional[List[str]] = None):
-    path = path if path is not None else []
+def compare_ckpts(a, b, path=[]):
     if isinstance(a, dict):
         assert isinstance(b, dict)
         assert set(a.keys()) == set(b.keys())
@@ -126,7 +125,6 @@ def setup_model_optim(log_dir, n_steps, tokenizer, gbs=2, mbs=1):
         make_vocab_size_divisible_by=128,
         normalization='RMSNorm',
         masked_softmax_fusion=False,
-        attention_backend=AttnBackend.local,
     )

     model = llm.GPTModel(gpt_config, tokenizer=tokenizer)
@@ -271,6 +269,8 @@ def train(n_steps, resume):
         trainer._teardown()

     set_env()
+    assert os.environ['NVTE_FLASH_ATTN'] == '0'
+    assert os.environ['NVTE_FUSED_ATTN'] == '0'
     assert os.environ['NVTE_APPLY_QK_LAYER_SCALING'] == '0'

     # Train for 40 steps
diff --git a/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb b/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb
index 3895c3b74757..b3393d133a45 100644
--- a/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb
+++ b/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb
@@ -341,6 +341,7 @@
     "        \"NCCL_NVLS_ENABLE\": \"0\",\n",
     "        \"NVTE_DP_AMAX_REDUCE_INTERVAL\": \"0\",\n",
     "        \"NVTE_ASYNC_AMAX_REDUCTION\": \"1\",\n",
+    "        \"NVTE_FUSED_ATTN\": \"0\",\n",
     "    }\n",
     "\n",
     "    executor = run.LocalExecutor(ntasks_per_node=devices, launcher=\"torchrun\", env_vars=env_vars)\n",
@@ -456,6 +457,7 @@
     "        \"NCCL_NVLS_ENABLE\": \"0\",\n",
     "        \"NVTE_DP_AMAX_REDUCE_INTERVAL\": \"0\",\n",
     "        \"NVTE_ASYNC_AMAX_REDUCTION\": \"1\",\n",
+    "        \"NVTE_FUSED_ATTN\": \"0\",\n",
     "    }\n",
     "\n",
     "    executor = run.LocalExecutor(ntasks_per_node=devices, launcher=\"torchrun\", env_vars=env_vars)\n",
diff --git a/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-sft.ipynb b/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-sft.ipynb
index 0bb4367d50e9..e84ff916fc4e 100644
--- a/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-sft.ipynb
+++ b/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-sft.ipynb
@@ -482,6 +482,7 @@
     "        \"NCCL_NVLS_ENABLE\": \"0\",\n",
     "        \"NVTE_DP_AMAX_REDUCE_INTERVAL\": \"0\",\n",
     "        \"NVTE_ASYNC_AMAX_REDUCTION\": \"1\",\n",
+    "        \"NVTE_FUSED_ATTN\": \"0\",\n",
     "    }\n",
     "\n",
     "    executor = run.LocalExecutor(ntasks_per_node=devices, launcher=\"torchrun\", env_vars=env_vars)\n",
@@ -564,6 +565,7 @@
     "        \"NCCL_NVLS_ENABLE\": \"0\",\n",
     "        \"NVTE_DP_AMAX_REDUCE_INTERVAL\": \"0\",\n",
     "        \"NVTE_ASYNC_AMAX_REDUCTION\": \"1\",\n",
+    "        \"NVTE_FUSED_ATTN\": \"0\",\n",
     "    }\n",
     "\n",
     "    executor = run.LocalExecutor(ntasks_per_node=devices, launcher=\"torchrun\", env_vars=env_vars)\n",
diff --git a/tutorials/llm/mamba/mamba.rst b/tutorials/llm/mamba/mamba.rst
index 7f5e901659a4..197825c27d58 100644
--- a/tutorials/llm/mamba/mamba.rst
+++ b/tutorials/llm/mamba/mamba.rst
@@ -103,6 +103,9 @@ Run Fine-Tuning
    CONFIG_NAME="megatron_mamba_finetuning_config"
    SAVE_DIR=

+   export NVTE_FUSED_ATTN=1
+   export NVTE_FLASH_ATTN=0
+
    torchrun --nproc_per_node=${NUM_DEVICES} \
    /opt/NeMo/examples/nlp/language_modeling/tuning/megatron_mamba_finetuning.py \
    --config-path=${CONFIG_PATH} \
@@ -126,7 +129,6 @@
    model.peft.peft_scheme='none' \
    model.megatron_amp_O2=True \
    model.encoder_seq_length=${SEQ_LEN} \
-   model.attention_backend='fused' \
    model.data.validation_ds.pad_to_max_length=True \
    model.data.train_ds.pad_to_max_length=True \
    model.optim.name="distributed_fused_adam" \
@@ -160,6 +162,10 @@ Evaluating the Fine-Tuned Model
    CONFIG_NAME="megatron_mamba_finetuning_config"
    SAVE_DIR=

+   export NVTE_FUSED_ATTN=1
+   export NVTE_FLASH_ATTN=0
+
+
    CONFIG_PATH="/opt/NeMo/examples/nlp/language_modeling/tuning/conf/"
    CONFIG_NAME="megatron_mamba_generate_config"

@@ -179,7 +185,6 @@
    exp_manager.exp_dir=${SAVE_DIR} \
    exp_manager.resume_if_exists=False \
    exp_manager.create_wandb_logger=False \
-   model.attention_backend='fused' \
    model.megatron_amp_O2=True \
    model.peft.restore_from_path=False \
    +model.peft.restore_from_ckpt.checkpoint_dir=False \