diff --git a/.github/workflows/_test_template.yml b/.github/workflows/_test_template.yml index ebdc99cef847..4ef6c5a9f9df 100644 --- a/.github/workflows/_test_template.yml +++ b/.github/workflows/_test_template.yml @@ -34,30 +34,18 @@ on: description: Last 2000 characters of the test step's log value: ${{ jobs.main.outputs.log }} jobs: - runner-auto-clean: - runs-on: ${{ inputs.RUNNER }} - steps: - - name: Docker system cleanup - run: | - docker system prune -a --filter "until=48h" --force - main: runs-on: ${{ inputs.RUNNER }} outputs: conclusion: ${{ steps.main.conclusion }} log: ${{ steps.main.outputs.log }} - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData + permissions: + actions: write # Required for cancelling workflows steps: - - name: Checkout repository - uses: actions/checkout@v4 + - name: Docker system cleanup + run: | + docker system prune -a --filter "until=48h" --force + - id: main name: Run main script timeout-minutes: ${{ inputs.TIMEOUT }} @@ -66,7 +54,7 @@ jobs: ( set -e - ${{ inputs.SCRIPT }} + docker run --rm --device=/dev/nvidia0 --gpus all --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_container_${{ github.run_id }} bash -c '${{ inputs.SCRIPT }}' ) 2> >(tee err.log) EXIT_CODE=$? @@ -77,6 +65,9 @@ jobs: - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" if: failure() && inputs.IS_OPTIONAL == false + - name: after_script if: always() && inputs.AFTER_SCRIPT != ':' - run: ${{ inputs.AFTER_SCRIPT }} \ No newline at end of file + run: | + docker run --rm --device=/dev/nvidia0 --gpus all --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_container_${{ github.run_id }} bash -c '${{ inputs.AFTER_SCRIPT }}' + \ No newline at end of file diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 49b26bf13cdd..253e114c78f3 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -3098,44 +3098,29 @@ jobs: L2_Megatron_GPT_Reranker: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - rm -rf /home/TestData/nlp/megatron_ir/working_dir - - python examples/nlp/information_retrieval/megatron_gpt_reranker_finetuning.py \ - exp_manager.exp_dir='/home/TestData/nlp/megatron_ir/working_dir' \ - model.global_batch_size=4 \ - model.micro_batch_size=4 \ - trainer.devices=1 \ - trainer.num_nodes=1 \ - trainer.max_epochs=null \ - trainer.max_steps=20 \ - trainer.val_check_interval=10 \ - model.restore_from_path='/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo' \ - model.peft.lora_tuning.adapter_dim=8 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl] \ - model.data.validation_ds.write_embeddings_to_file=True \ - model.data.validation_ds.output_file_path_prefix='/home/TestData/nlp/megatron_ir/working_dir/val_embs' \ - 
model.data.train_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl] - + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + rm -rf /home/TestData/nlp/megatron_ir/working_dir - rm -rf /home/TestData/nlp/megatron_ir/working_dir - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + python examples/nlp/information_retrieval/megatron_gpt_reranker_finetuning.py \ + exp_manager.exp_dir='/home/TestData/nlp/megatron_ir/working_dir' \ + model.global_batch_size=4 \ + model.micro_batch_size=4 \ + trainer.devices=1 \ + trainer.num_nodes=1 \ + trainer.max_epochs=null \ + trainer.max_steps=20 \ + trainer.val_check_interval=10 \ + model.restore_from_path='/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo' \ + model.peft.lora_tuning.adapter_dim=8 \ + model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl] \ + model.data.validation_ds.write_embeddings_to_file=True \ + model.data.validation_ds.output_file_path_prefix='/home/TestData/nlp/megatron_ir/working_dir/val_embs' \ + model.data.train_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl] + AFTER_SCRIPT: | + rm -rf /home/TestData/nlp/megatron_ir/working_dir L2_Megatron_GPT_Embedding: needs: [cicd-test-container-setup] diff --git a/docs/source/asr/speech_intent_slot/api.rst b/docs/source/asr/speech_intent_slot/api.rst index d45f24f807f6..4a45715f78f7 100644 --- a/docs/source/asr/speech_intent_slot/api.rst +++ b/docs/source/asr/speech_intent_slot/api.rst @@ -15,10 +15,10 @@ Mixins .. autoclass:: nemo.collections.asr.parts.mixins.ASRModuleMixin :show-inheritance: :members: - :no-index: + :noindex: .. autoclass:: nemo.collections.asr.parts.mixins.ASRBPEMixin :show-inheritance: :members: - :no-index: + :noindex: diff --git a/docs/source/asr/ssl/api.rst b/docs/source/asr/ssl/api.rst index 8e6f83986032..77614e9ad5e3 100644 --- a/docs/source/asr/ssl/api.rst +++ b/docs/source/asr/ssl/api.rst @@ -15,12 +15,12 @@ Mixins .. autoclass:: nemo.collections.asr.parts.mixins.mixins.ASRModuleMixin :show-inheritance: :members: - :no-index: + :noindex: .. 
autoclass:: nemo.core.classes.mixins.access_mixins.AccessMixin :show-inheritance: :members: - :no-index: + :noindex: diff --git a/docs/source/core/adapters/api.rst b/docs/source/core/adapters/api.rst index 8922c72d63eb..dee215ba0ed8 100644 --- a/docs/source/core/adapters/api.rst +++ b/docs/source/core/adapters/api.rst @@ -9,7 +9,7 @@ Core :members: :member-order: bysource :undoc-members: adapter_module_names - :no-index: + :noindex: ----- @@ -18,7 +18,7 @@ Core :members: :member-order: bysource :undoc-members: adapter_module_names - :no-index: + :noindex: ----- @@ -30,7 +30,7 @@ Adapter Networks :show-inheritance: :members: :member-order: bysource - :no-index: + :noindex: ----- @@ -38,7 +38,7 @@ Adapter Networks :show-inheritance: :members: :member-order: bysource - :no-index: + :noindex: ----- @@ -51,7 +51,7 @@ Adapter Strategies :members: :member-order: bysource :undoc-members: adapter_module_names - :no-index: + :noindex: ----- @@ -60,7 +60,7 @@ Adapter Strategies :members: :member-order: bysource :undoc-members: adapter_module_names - :no-index: + :noindex: ----- @@ -69,4 +69,4 @@ Adapter Strategies :members: :member-order: bysource :undoc-members: adapter_module_names - :no-index: + :noindex: diff --git a/docs/source/core/adapters/components.rst b/docs/source/core/adapters/components.rst index d8bed1b23a75..d4b38bc147b2 100644 --- a/docs/source/core/adapters/components.rst +++ b/docs/source/core/adapters/components.rst @@ -28,7 +28,7 @@ Adapter modules represent the functional form of the adapter. We discuss an exam :show-inheritance: :members: :member-order: bysource - :no-index: + :noindex: ----- @@ -36,7 +36,7 @@ Adapter modules represent the functional form of the adapter. We discuss an exam :show-inheritance: :members: :member-order: bysource - :no-index: + :noindex: Insertion Form - Module Adapters @@ -72,7 +72,7 @@ We discuss a simple residual additional connection strategy below - that accepts :members: :member-order: bysource :undoc-members: adapter_module_names - :no-index: + :noindex: ----- @@ -81,7 +81,7 @@ We discuss a simple residual additional connection strategy below - that accepts :members: :member-order: bysource :undoc-members: adapter_module_names - :no-index: + :noindex: ----- diff --git a/docs/source/core/exp_manager.rst b/docs/source/core/exp_manager.rst index 1664fe59d52f..6daa5070a16e 100644 --- a/docs/source/core/exp_manager.rst +++ b/docs/source/core/exp_manager.rst @@ -193,131 +193,132 @@ and stability. To use EMA, simply set the following via YAML or :class:`~nemo.ut every_n_steps: 1 # How often to update EMA weights validate_original_weights: False # Whether to use original weights for validation calculation or EMA weights -Support for Preemption ----------------------- +.. Support for Preemption + ---------------------- -.. _exp_manager_preemption_support-label: + .. _exp_manager_preemption_support-label: -NeMo adds support for a callback upon preemption while running the models on clusters. The callback takes care of saving the current state of training via the ``.ckpt`` -file followed by a graceful exit from the run. The checkpoint saved upon preemption has the ``*last.ckpt`` suffix and replaces the previously saved last checkpoints. -This feature is useful to increase utilization on clusters. -The ``PreemptionCallback`` is enabled by default. To disable it simply add ``create_preemption_callback: False`` under exp_manager in the config YAML file. + NeMo adds support for a callback upon preemption while running the models on clusters. 
The callback takes care of saving the current state of training via the ``.ckpt`` + file followed by a graceful exit from the run. The checkpoint saved upon preemption has the ``*last.ckpt`` suffix and replaces the previously saved last checkpoints. + This feature is useful to increase utilization on clusters. + The ``PreemptionCallback`` is enabled by default. To disable it simply add ``create_preemption_callback: False`` under exp_manager in the config YAML file. -Stragglers Detection ----------------------- + Stragglers Detection + ---------------------- -.. _exp_manager_straggler_det_support-label: + .. _exp_manager_straggler_det_support-label: -.. note:: - Stragglers Detection feature is included in the optional NeMo resiliency package. + .. note:: + Stragglers Detection feature is included in the optional NeMo resiliency package. -Distributed training can be affected by stragglers, which are slow workers that slow down the overall training process. -NeMo provides a straggler detection feature that can identify slower GPUs. + Distributed training can be affected by stragglers, which are slow workers that slow down the overall training process. + NeMo provides a straggler detection feature that can identify slower GPUs. -This feature is implemented in the ``StragglerDetectionCallback``, which is disabled by default. + This feature is implemented in the ``StragglerDetectionCallback``, which is disabled by default. -The callback computes normalized GPU performance scores, which are scalar values ranging from 0.0 (worst) to 1.0 (best). -A performance score can be interpreted as the ratio of current performance to reference performance. + The callback computes normalized GPU performance scores, which are scalar values ranging from 0.0 (worst) to 1.0 (best). + A performance score can be interpreted as the ratio of current performance to reference performance. -There are two types of performance scores provided by the callback: - - Relative GPU performance score: The best-performing GPU in the workload is used as a reference. - - Individual GPU performance score: The best historical performance of the GPU is used as a reference. + There are two types of performance scores provided by the callback: + - Relative GPU performance score: The best-performing GPU in the workload is used as a reference. + - Individual GPU performance score: The best historical performance of the GPU is used as a reference. -Examples: - - If the relative performance score is 0.5, it means that a GPU is twice slower than the fastest GPU. - - If the individual performance score is 0.5, it means that a GPU is twice slower than its best observed performance. + Examples: + - If the relative performance score is 0.5, it means that a GPU is twice slower than the fastest GPU. + - If the individual performance score is 0.5, it means that a GPU is twice slower than its best observed performance. -If a GPU performance score drops below the specified threshold, it is identified as a straggler. + If a GPU performance score drops below the specified threshold, it is identified as a straggler. -To enable straggler detection, add ``create_straggler_detection_callback: True`` under exp_manager in the config YAML file. -You might also want to adjust the callback parameters: + To enable straggler detection, add ``create_straggler_detection_callback: True`` under exp_manager in the config YAML file. + You might also want to adjust the callback parameters: -.. code-block:: yaml + .. code-block:: yaml - exp_manager: - ... 
- create_straggler_detection_callback: True - straggler_detection_callback_params: - report_time_interval: 300 # Interval [seconds] of the straggler check - calc_relative_gpu_perf: True # Calculate relative GPU performance - calc_individual_gpu_perf: True # Calculate individual GPU performance - num_gpu_perf_scores_to_log: 5 # Log 5 best and 5 worst GPU performance scores, even if no stragglers are detected - gpu_relative_perf_threshold: 0.7 # Threshold for relative GPU performance scores - gpu_individual_perf_threshold: 0.7 # Threshold for individual GPU performance scores - stop_if_detected: True # Terminate the workload if stragglers are detected - -Straggler detection might involve inter-rank synchronization, and should be invoked with reasonable frequency (e.g. every few minutes). - -Fault Tolerance ---------------- + exp_manager: + ... + create_straggler_detection_callback: True + straggler_detection_callback_params: + report_time_interval: 300 # Interval [seconds] of the straggler check + calc_relative_gpu_perf: True # Calculate relative GPU performance + calc_individual_gpu_perf: True # Calculate individual GPU performance + num_gpu_perf_scores_to_log: 5 # Log 5 best and 5 worst GPU performance scores, even if no stragglers are detected + gpu_relative_perf_threshold: 0.7 # Threshold for relative GPU performance scores + gpu_individual_perf_threshold: 0.7 # Threshold for individual GPU performance scores + stop_if_detected: True # Terminate the workload if stragglers are detected -.. _exp_manager_fault_tolerance_support-label: + Straggler detection might involve inter-rank synchronization, and should be invoked with reasonable frequency (e.g. every few minutes). -.. note:: - Fault Tolerance feature is included in the optional NeMo resiliency package. -When training DNN models, faults may occur, hindering the progress of the entire training process. -This is particularly common in distributed, multi-node training scenarios, with many nodes and GPUs involved. +.. Fault Tolerance + --------------- -NeMo incorporates a fault tolerance mechanism to detect training halts. -In response, it can terminate a hung workload and, if requested, restart it from the last checkpoint. + .. _exp_manager_fault_tolerance_support-label: -Fault tolerance ("FT") relies on a special launcher (``ft_launcher``), which is a modified ``torchrun``. -The FT launcher runs background processes called rank monitors. **You need to use ft_launcher to start -your workload if you are using FT**. I.e., `NeMo-Framework-Launcher `_ -can be used to generate SLURM batch scripts with FT support. + .. note:: + Fault Tolerance feature is included in the optional NeMo resiliency package. -Each training process (rank) sends `heartbeats` to its monitor during training and validation steps. -If a rank monitor stops receiving `heartbeats`, a training failure is detected. + When training DNN models, faults may occur, hindering the progress of the entire training process. + This is particularly common in distributed, multi-node training scenarios, with many nodes and GPUs involved. -Fault detection is implemented in the ``FaultToleranceCallback`` and is disabled by default. -To enable it, add a ``create_fault_tolerance_callback: True`` option under ``exp_manager`` in the -config YAML file. Additionally, you can customize FT parameters by adding ``fault_tolerance`` section: + NeMo incorporates a fault tolerance mechanism to detect training halts. 
+ In response, it can terminate a hung workload and, if requested, restart it from the last checkpoint. -.. code-block:: yaml + Fault tolerance ("FT") relies on a special launcher (``ft_launcher``), which is a modified ``torchrun``. + The FT launcher runs background processes called rank monitors. **You need to use ft_launcher to start + your workload if you are using FT**. I.e., `NeMo-Framework-Launcher `_ + can be used to generate SLURM batch scripts with FT support. - exp_manager: - ... - create_fault_tolerance_callback: True - fault_tolerance: - initial_rank_heartbeat_timeout: 600 # wait for 10 minutes for the initial heartbeat - rank_heartbeat_timeout: 300 # wait for 5 minutes for subsequent heartbeats - calculate_timeouts: True # estimate more accurate timeouts based on observed intervals - -Timeouts for fault detection need to be adjusted for a given workload: - * ``initial_rank_heartbeat_timeout`` should be long enough to allow for workload initialization. - * ``rank_heartbeat_timeout`` should be at least as long as the longest possible interval between steps. - -**Importantly, `heartbeats` are not sent during checkpoint loading and saving**, so time for -checkpointing related operations should be taken into account. - -If ``calculate_timeouts: True`` timeouts will be automatically estimated based on observed intervals. -Estimated timeouts take precedence over timeouts defined in the config file. **Timeouts are estimated -at the end of a training run, when checkpoint loading and saving were observed**. Hence, in a multi-part -training started from scratch, estimated timeouts won't be available during initial two runs. -Estimated timeouts are stored in a separate JSON file. - -``max_subsequent_job_failures`` allows for the automatic continuation of training on a SLURM cluster. -This feature requires SLURM job to be scheduled with ``NeMo-Framework-Launcher``. If ``max_subsequent_job_failures`` -value is `>0` continuation job is prescheduled. It will continue the work until ``max_subsequent_job_failures`` -subsequent jobs failed (SLURM job exit code is `!= 0`) or the training is completed successfully -("end of training" marker file is produced by the ``FaultToleranceCallback``, i.e. due to iters or time limit reached). - -All FT configuration items summary: - * ``workload_check_interval`` (float, default=5.0) Periodic workload check interval [seconds] in the workload monitor. - * ``initial_rank_heartbeat_timeout`` (Optional[float], default=60.0 * 60.0) Timeout [seconds] for the first heartbeat from a rank. - * ``rank_heartbeat_timeout`` (Optional[float], default=45.0 * 60.0) Timeout [seconds] for subsequent heartbeats from a rank. - * ``calculate_timeouts`` (bool, default=True) Try to calculate ``rank_heartbeat_timeout`` and ``initial_rank_heartbeat_timeout`` - based on the observed heartbeat intervals. - * ``safety_factor``: (float, default=5.0) When calculating the timeouts, multiply the maximum observed heartbeat interval - by this factor to obtain the timeout estimate. Can be made smaller for stable environments and larger for unstable ones. - * ``rank_termination_signal`` (signal.Signals, default=signal.SIGKILL) Signal used to terminate the rank when failure is detected. - * ``log_level`` (str, default='INFO') Log level for the FT client and server(rank monitor). - * ``max_rank_restarts`` (int, default=0) Used by FT launcher. Max number of restarts for a rank. - If ``>0`` ranks will be restarted on existing nodes in case of a failure. 
- * ``max_subsequent_job_failures`` (int, default=0) Used by FT launcher. How many subsequent job failures are allowed until stopping autoresuming. - ``0`` means do not autoresume. - * ``additional_ft_launcher_args`` (str, default='') Additional FT launcher params (for advanced use). + Each training process (rank) sends `heartbeats` to its monitor during training and validation steps. + If a rank monitor stops receiving `heartbeats`, a training failure is detected. + + Fault detection is implemented in the ``FaultToleranceCallback`` and is disabled by default. + To enable it, add a ``create_fault_tolerance_callback: True`` option under ``exp_manager`` in the + config YAML file. Additionally, you can customize FT parameters by adding ``fault_tolerance`` section: + + .. code-block:: yaml + + exp_manager: + ... + create_fault_tolerance_callback: True + fault_tolerance: + initial_rank_heartbeat_timeout: 600 # wait for 10 minutes for the initial heartbeat + rank_heartbeat_timeout: 300 # wait for 5 minutes for subsequent heartbeats + calculate_timeouts: True # estimate more accurate timeouts based on observed intervals + + Timeouts for fault detection need to be adjusted for a given workload: + * ``initial_rank_heartbeat_timeout`` should be long enough to allow for workload initialization. + * ``rank_heartbeat_timeout`` should be at least as long as the longest possible interval between steps. + + **Importantly, `heartbeats` are not sent during checkpoint loading and saving**, so time for + checkpointing related operations should be taken into account. + + If ``calculate_timeouts: True`` timeouts will be automatically estimated based on observed intervals. + Estimated timeouts take precedence over timeouts defined in the config file. **Timeouts are estimated + at the end of a training run, when checkpoint loading and saving were observed**. Hence, in a multi-part + training started from scratch, estimated timeouts won't be available during initial two runs. + Estimated timeouts are stored in a separate JSON file. + + ``max_subsequent_job_failures`` allows for the automatic continuation of training on a SLURM cluster. + This feature requires SLURM job to be scheduled with ``NeMo-Framework-Launcher``. If ``max_subsequent_job_failures`` + value is `>0` continuation job is prescheduled. It will continue the work until ``max_subsequent_job_failures`` + subsequent jobs failed (SLURM job exit code is `!= 0`) or the training is completed successfully + ("end of training" marker file is produced by the ``FaultToleranceCallback``, i.e. due to iters or time limit reached). + + All FT configuration items summary: + * ``workload_check_interval`` (float, default=5.0) Periodic workload check interval [seconds] in the workload monitor. + * ``initial_rank_heartbeat_timeout`` (Optional[float], default=60.0 * 60.0) Timeout [seconds] for the first heartbeat from a rank. + * ``rank_heartbeat_timeout`` (Optional[float], default=45.0 * 60.0) Timeout [seconds] for subsequent heartbeats from a rank. + * ``calculate_timeouts`` (bool, default=True) Try to calculate ``rank_heartbeat_timeout`` and ``initial_rank_heartbeat_timeout`` + based on the observed heartbeat intervals. + * ``safety_factor``: (float, default=5.0) When calculating the timeouts, multiply the maximum observed heartbeat interval + by this factor to obtain the timeout estimate. Can be made smaller for stable environments and larger for unstable ones. 
+ * ``rank_termination_signal`` (signal.Signals, default=signal.SIGKILL) Signal used to terminate the rank when failure is detected. + * ``log_level`` (str, default='INFO') Log level for the FT client and server(rank monitor). + * ``max_rank_restarts`` (int, default=0) Used by FT launcher. Max number of restarts for a rank. + If ``>0`` ranks will be restarted on existing nodes in case of a failure. + * ``max_subsequent_job_failures`` (int, default=0) Used by FT launcher. How many subsequent job failures are allowed until stopping autoresuming. + ``0`` means do not autoresume. + * ``additional_ft_launcher_args`` (str, default='') Additional FT launcher params (for advanced use). .. _nemo_multirun-label: @@ -493,4 +494,4 @@ ExpManagerConfig :show-inheritance: :members: :member-order: bysource - :no-index: + :noindex: diff --git a/docs/source/core/neural_types.rst b/docs/source/core/neural_types.rst index ec7d94336c05..989cc8d998f4 100644 --- a/docs/source/core/neural_types.rst +++ b/docs/source/core/neural_types.rst @@ -24,7 +24,7 @@ Types are implemented in ``nemo.core.neural_types.NeuralType`` class. When you i are expected to include both *axes* information and *element type* information. .. autoclass:: nemo.core.neural_types.NeuralType - :no-index: + :noindex: Type Comparison Results ----------------------- @@ -32,7 +32,7 @@ Type Comparison Results When comparing two neural types, the following comparison results are generated. .. autoclass:: nemo.core.neural_types.NeuralTypeComparisonResult - :no-index: + :noindex: Examples -------- @@ -115,7 +115,7 @@ Custom element types It is possible to create user-defined element types to express the semantics of elements in your tensors. To do so, the user will need to inherit and implement abstract methods of the ``nemo.core.neural_types.elements.ElementType`` class .. autoclass:: nemo.core.neural_types.elements.ElementType - :no-index: + :noindex: Note that element types can be parametrized. Consider this example where it distinguishes between audio sampled at 8Khz and 16Khz. diff --git a/docs/source/features/moe.rst b/docs/source/features/moe.rst index e986465131d6..4c935f9f16a7 100644 --- a/docs/source/features/moe.rst +++ b/docs/source/features/moe.rst @@ -4,7 +4,7 @@ Mixture of Experts Overview -------- -NeMo supports Mixture of Experts (MoE) in the transformer layer for NLP models. +NeMo Framework supports Mixture of Experts (MoE) in the feedforward block of the transformer layer. MoE is a machine learning technique where multiple specialized models (experts, usually multi-layer perceptrons) are combined to solve a complex task. Each expert @@ -12,6 +12,9 @@ focuses on a specific subtask or domain, while a gating network dynamically acti the most appropriate expert based on the current input. +Use MoE +------- + To use MoE in the NeMo Framework, adjust the ``num_moe_experts`` parameter in the model configuration: 1. Set ``num_moe_experts`` to `8` to leverage 8 experts in the MoE module. @@ -26,6 +29,9 @@ To use MoE in the NeMo Framework, adjust the ``num_moe_experts`` parameter in t moe_router_topk: 2 # Processes each token using 2 experts. +Configure MoE-specific Loss Functions +------------------------------------- + In addition, NeMo provides options to configure MoE-specific loss function. To balance token distribution across experts: @@ -35,7 +41,7 @@ To balance token distribution across experts: moe_router_load_balancing_type: aux_loss # to use the auxilary loss, other options include "sinkhorn". -2. 
Set ``moe_aux_loss_coeff`` to specify the weight of the auxilary loss. Values in the 1e-2 range are a good start, as follows: +2. Set ``moe_aux_loss_coeff`` to specify the weight of the auxilary loss. The auxiliary loss is added to encourage distributing tokens equally among all experts. Values in the 1e-2 range are a good start, as follows: .. code-block:: yaml @@ -52,16 +58,18 @@ Other options include: 1. ``moe_input_jitter_eps`` adds noise to the input tensor by applying jitter with a specified epsilon value. 2. ``moe_token_dropping`` enables selectively dropping and padding tokens for each expert to achieve - a specified capacity. + a specified capacity, similar to GShard, Switch-Transformer, and DeepSpeed-MoE. Briefly, if the number + of tokens routed to an expert exceeds its capacity, then the exceeding tokens are dropped. Note that this is + currently unsupported so should remain False. -3. ``moe_token_dropping`` specifies the token dispatcher type, options include 'allgather' and 'alltoall'. +3. ``moe_token_dispatcher_type`` specifies the token dispatcher type, options include 'allgather' and 'alltoall'. 4. ``moe_per_layer_logging`` enables per-layer logging for MoE, currently support aux-loss and z-loss. -5. ``moe_expert_capacity_factor`` the capacity factor for each expert, None means no token will be dropped. The default is None. +5. ``moe_expert_capacity_factor`` the capacity factor determines the maximum number of tokens that can be routed to each expert in any MoE layer. None means no token will be dropped. The default is None. -6. ``moe_pad_expert_input_to_capacity`` if True, pads the input for each expert to match the expert capacity length, effective only after the moe_expert_capacity_factor is set. The default setting is False. +6. ``moe_pad_expert_input_to_capacity`` if True, pads the input for each expert to match the expert capacity length. It is effective only after the moe_expert_capacity_factor is set. The default setting is False. -7. ``moe_token_drop_policy`` the policy to drop tokens. Can be either "probs" or "position". If "probs", the tokens with the lowest probabilities will be dropped. If "position", tokens at the end of each batch will be dropped. Default value is "probs". +7. ``moe_token_drop_policy`` the policy to drop tokens. Can be either "probs" or "position". If "probs", the tokens with the lowest probabilities will be dropped. If "position", tokens at the end of each batch will be dropped. The default value is "probs". -8. ``moe_layer_recompute`` if True, checkpointing moe_layer to save activation memory, default is False. \ No newline at end of file +8. ``moe_layer_recompute`` if True, checkpointing moe_layer to save activation memory. The default is False. diff --git a/docs/source/features/optimizations/communication_overlap.rst b/docs/source/features/optimizations/communication_overlap.rst index c4bbbc0bbc75..0ff93fe80604 100644 --- a/docs/source/features/optimizations/communication_overlap.rst +++ b/docs/source/features/optimizations/communication_overlap.rst @@ -26,7 +26,8 @@ The TP communication and computation are chunked and the chunks are overlapped i In the pipelined overlap, the activation (gradient) tensor all-gather is replaced with multiple steps of input P2P ring exchanges, and reduce-scatter is replaced with multiple steps of GEMM output P2P ring exchanges followed by a reduction of the received outputs. 
In case of the reduce-scatter overlap, NeMo also provides the option to pipeline-overlap using chunks of reduce-scatter, which exposes one reduce-scatter chunk. -.. image:: ../nlp/nemo_megatron/images/tp_comm_overlap.png + +.. image:: ../../nlp/nemo_megatron/images/tp_comm_overlap.png :align: center :width: 600px :alt: Tensor-parallel communication overlap @@ -44,7 +45,7 @@ This increasing PP communication overhead and it cancels off the reduced the pip NeMo supports the overlap of the PP communications with non-dependant computations in the 1F1B stage (the body of pipelining, where 1X forward and 1X backward micro-batch executions are interleaved). The PP communications in pipeline fill and flush are still exposed. -.. image:: ../nlp/nemo_megatron/images/pp_comm_overlap.png +.. image:: ../../nlp/nemo_megatron/images/pp_comm_overlap.png :align: center :width: 600px :alt: Pipeline-parallel communication overlap in 1F1B pipelining phase diff --git a/docs/source/multimodal/api.rst b/docs/source/multimodal/api.rst index 7a9fe2822d07..2ba9978b7640 100644 --- a/docs/source/multimodal/api.rst +++ b/docs/source/multimodal/api.rst @@ -8,7 +8,7 @@ Model Classes :show-inheritance: :no-members: :members: __init__, configure_optimizers - :no-index: + :noindex: .. autoclass:: nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.ddpm.MegatronLatentDiffusion diff --git a/docs/source/multimodal/speech_llm/api.rst b/docs/source/multimodal/speech_llm/api.rst index 142190fd411d..c2415f29c720 100644 --- a/docs/source/multimodal/speech_llm/api.rst +++ b/docs/source/multimodal/speech_llm/api.rst @@ -8,7 +8,7 @@ Model Classes :show-inheritance: :no-members: :members: __init__, configure_optimizers - :no-index: + :noindex: .. autoclass:: nemo.collections.multimodal.speech_llm.models.modular_models.ModularAudioGPTModel diff --git a/nemo/lightning/fabric/strategies.py b/nemo/lightning/fabric/strategies.py index a662386a9119..5c2b634ea282 100644 --- a/nemo/lightning/fabric/strategies.py +++ b/nemo/lightning/fabric/strategies.py @@ -23,7 +23,6 @@ from lightning_fabric.plugins.precision import Precision from lightning_fabric.strategies import DDPStrategy from lightning_fabric.strategies.strategy import _validate_keys_for_strict_loading -from lightning_fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_1 from lightning_fabric.utilities.types import _PATH, _Stateful from megatron.core.distributed import DistributedDataParallelConfig from pytorch_lightning.loops.fetchers import _DataFetcher @@ -208,7 +207,7 @@ def module_init_context(self, empty_init: Optional[bool] = None) -> ContextManag precision_init_ctx = self.precision.module_init_context() module_sharded_ctx = self.megatron_context() stack = ExitStack() - if _TORCH_GREATER_EQUAL_2_1 and empty_init: + if empty_init: # Materialization happens in `setup`. When modules get wrapped by FSDP, the sequence of operations is: # 1) materialize module 2) call `reset_parameters()` 3) shard the module. # These operations are applied to each submodule 'bottom up' in the module hierarchy.
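For reference, a job in ``cicd-main.yml`` now invokes the reusable ``_test_template.yml`` workflow instead of declaring its own container, following the ``L2_Megatron_GPT_Reranker`` job converted above. A minimal sketch of the calling pattern (the job name, test script, and cleanup path below are illustrative placeholders, not jobs added by this patch):

.. code-block:: yaml

    L2_Example_Test:   # hypothetical job name
      needs: [cicd-test-container-setup]
      uses: ./.github/workflows/_test_template.yml
      with:
        RUNNER: self-hosted-azure
        # SCRIPT runs inside the per-run container: the template wraps it in
        # docker run ... nemoci.azurecr.io/nemo_container_${{ github.run_id }} bash -c '<SCRIPT>'
        SCRIPT: |
          python examples/some_test.py trainer.devices=1 trainer.max_steps=20
        # AFTER_SCRIPT uses the same image and is skipped when left as ':'
        AFTER_SCRIPT: |
          rm -rf /home/TestData/some_working_dir

This keeps the Docker cleanup step and the cancel-workflow action on the host runner, while the test itself executes in the per-run container image.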
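The ``exp_manager.rst`` changes above comment out the preemption, straggler detection, and fault tolerance sections by indenting them under ``..``. For readers of this patch, a condensed sketch of how those callbacks are enabled together, using only the keys and example values quoted in the commented-out text (adjust thresholds and timeouts for your workload):

.. code-block:: yaml

    exp_manager:
      # Preemption checkpointing: enabled by default, set to False to disable
      create_preemption_callback: True

      # Straggler detection (optional NeMo resiliency package)
      create_straggler_detection_callback: True
      straggler_detection_callback_params:
        report_time_interval: 300            # seconds between straggler checks
        calc_relative_gpu_perf: True
        calc_individual_gpu_perf: True
        num_gpu_perf_scores_to_log: 5
        gpu_relative_perf_threshold: 0.7
        gpu_individual_perf_threshold: 0.7
        stop_if_detected: True

      # Fault tolerance: requires launching the workload with ft_launcher
      create_fault_tolerance_callback: True
      fault_tolerance:
        initial_rank_heartbeat_timeout: 600  # wait 10 minutes for the first heartbeat
        rank_heartbeat_timeout: 300          # wait 5 minutes for subsequent heartbeats
        calculate_timeouts: True             # refine timeouts from observed intervals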
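Similarly, the ``moe.rst`` edits describe the MoE options one at a time; a consolidated sketch combining them into one config is shown below. The values are the examples given in the text, placing the keys under a ``model`` section is an assumption (the doc snippets omit the parent key), and ``moe_per_layer_logging`` is shown with an assumed default of False:

.. code-block:: yaml

    model:
      num_moe_experts: 8                        # 8 experts per MoE layer
      moe_router_topk: 2                        # route each token to 2 experts
      moe_router_load_balancing_type: aux_loss  # or "sinkhorn"
      moe_aux_loss_coeff: 1e-2                  # weight of the load-balancing auxiliary loss
      moe_token_dispatcher_type: allgather      # or "alltoall"
      moe_token_dropping: False                 # currently unsupported; keep False
      moe_expert_capacity_factor: null          # null/None means no tokens are dropped
      moe_pad_expert_input_to_capacity: False   # only effective once a capacity factor is set
      moe_token_drop_policy: probs              # or "position"
      moe_per_layer_logging: False              # per-layer aux-loss and z-loss logging
      moe_layer_recompute: False                # checkpoint MoE layers to save activation memory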