diff --git a/.github/workflows/cherry-pick-release-commit.yml b/.github/workflows/cherry-pick-release-commit.yml index 5f29832f0c0f..67bc69b1f8a5 100644 --- a/.github/workflows/cherry-pick-release-commit.yml +++ b/.github/workflows/cherry-pick-release-commit.yml @@ -120,7 +120,7 @@ jobs: "type": "section", "text": { "type": "mrkdwn", - "text": ":alert: Cherrypick bot 🤖: Hey <@'$USERNAME'>: Cherry-pick of <'$URL'|#'$PR_ID'> failed (3-way merge impossible). Please resolve manually and create a PR.\n\ncc: " + "text": ":alert: Cherrypick bot 🤖: Hey <@'$USERNAME'>: Cherry-pick of <'$URL'|#'$PR_ID'> failed (3-way merge impossible). Please resolve manually and create a PR.\n\ncc: " } } ] diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 2021c7d93136..55a952c21eb6 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -217,15 +217,14 @@ jobs: SCRIPT: | NEMO_NUMBA_MINVER=0.53 pytest tests/hydra -m "not pleasefixme" --with_downloads - OPTIONAL_L0_Unit_Tests_GPU_Lightning: + L0_Unit_Tests_GPU_Lightning: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L0_Unit_Tests_GPU_Lightning') || needs.cicd-test-container-setup.outputs.all == 'true' + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_Lightning') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | NEMO_NUMBA_MINVER=0.53 pytest tests/lightning -m "not pleasefixme" --with_downloads - IS_OPTIONAL: true L0_Unit_Tests_GPU_Others: needs: [cicd-test-container-setup] @@ -1333,275 +1332,6 @@ jobs: pretrained_model=${OUTPUT_DIR}/HeteronymClassification/test/checkpoints/HeteronymClassification.nemo \ output_manifest=preds.json - # L2: Duplex Text Normalization - L2_Duplex_Text_Normalization_with_Tarred_dataset: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Duplex_Text_Normalization_with_Tarred_dataset') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - cd examples/nlp/duplex_text_normalization && \ - python duplex_text_normalization_train.py \ - data.validation_ds.data_path=/home/TestData/nlp/duplex_text_norm/small_test.tsv \ - mode=tn \ - lang=en \ - tagger_model.do_training=false \ - decoder_model.transformer=t5-small \ - data.validation_ds.batch_size=2 \ - data.train_ds.use_cache=false \ - data.validation_ds.use_cache=false \ - data.test_ds.batch_size=2 \ - data.train_ds.decoder_data_augmentation=false \ - data.train_ds.num_workers=2 \ - decoder_trainer.devices=[0,1] \ - decoder_trainer.accelerator="gpu" \ - data.train_ds.use_tarred_dataset=true \ - +decoder_trainer.fast_dev_run=true \ - decoder_exp_manager.create_checkpoint_callback=false \ - data.train_ds.tar_metadata_file=/home/TestData/nlp/duplex_text_norm/tarred_small/metadata.json \ - data.test_ds.use_cache=false \ - data.test_ds.data_path=/home/TestData/nlp/duplex_text_norm/small_test.tsv - - # L2: Intent and Slot Classification Tasks - L2_Intent_and_Slot_Classification_Tasks_Intent_and_Slot_Classification: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Intent_and_Slot_Classification_Tasks_Intent_and_Slot_Classification') || 
needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/intent_slot_classification && \ - python intent_slot_classification.py \ - model.data_dir=/home/TestData/nlp/retail \ - model.validation_ds.prefix=dev \ - model.test_ds.prefix=dev \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - exp_manager.exp_dir=checkpoints - AFTER_SCRIPT: | - rm -rf checkpoints - - L2_Intent_and_Slot_Classification_Tasks_Multi-Label_Intent_and_Slot_Classification: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Intent_and_Slot_Classification_Tasks_Multi-Label_Intent_and_Slot_Classification') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/intent_slot_classification && \ - python multi_label_intent_slot_classification.py \ - model.data_dir=/home/TestData/nlp/new_multiatis \ - model.validation_ds.prefix=dev \ - model.test_ds.prefix=dev \ - trainer.devices=1 \ - +trainer.fast_dev_run=true \ - exp_manager.exp_dir=checkpoints2 - AFTER_SCRIPT: | - rm -rf checkpoints2 - - # TODO: add when megatron-bert is supported again - # stage("L2: Model Parallel Size 2 Megatron Text Classification") { - # when { - # anyOf{ - # branch "main" - # changeRequest target: "main" - # } - # } - # failFast true - # steps{ - # cd examples/nlp/text_classification && \ - # python text_classification_with_bert.py \ - # trainer.devices=[0,1] \ - # trainer.accelerator="gpu" \ - # trainer.num_nodes=1 \ - # trainer.precision=16 \ - # trainer.gradient_clip_val=1.0 \ - # +trainer.fast_dev_run=true \ - # model.dataset.num_classes=6 \ - # model.train_ds.file_path=/home/TestData/nlp/retail_text_classification/train.tsv \ - # model.train_ds.batch_size=4 \ - # model.language_model.pretrained_model_name=megatron-bert-uncased \ - # model.language_model.config_file=/home/TestData/nlp/mp_2_bert_toy/config.json \ - # model.language_model.lm_checkpoint=/home/TestData/nlp/mp_2_bert_toy/iter_2000000 \ - # model.nemo_path=null \ - # ~model.infer_samples \ - # exp_manager=null - # } - # } - - # stage("L2: Model Parallel Size 2 Megatron Autoresume") { - # when { - # anyOf{ - # branch "main" - # changeRequest target: "main" - # } - # } - # failFast true - # steps{ - # cd examples/nlp/text_classification && \ - # python text_classification_with_bert.py \ - # trainer.devices=[0,1] \ - # trainer.accelerator="gpu" \ - # trainer.num_nodes=1 \ - # trainer.precision=16 \ - # trainer.gradient_clip_val=1.0 \ - # trainer.max_epochs=1 \ - # +trainer.fast_dev_run=true \ - # model.dataset.num_classes=6 \ - # model.train_ds.file_path=/home/TestData/nlp/retail_text_classification/train.tsv \ - # model.train_ds.batch_size=4 \ - # model.language_model.pretrained_model_name=megatron-bert-uncased \ - # model.language_model.config_file=/home/TestData/nlp/mp_2_bert_toy/config.json \ - # model.language_model.lm_checkpoint=/home/TestData/nlp/mp_2_bert_toy/iter_2000000 \ - # model.nemo_path=null \ - # ~model.infer_samples \ - # +exp_manager.explicit_log_dir=/home/TestData/nlp/mp_autoresume \ - # +exp_manager.resume_if_exists=true - # } - # } - - # stage("L2: Model Parallel Size 2 Megatron Evaluation from .nemo") { - # when { - # anyOf{ - # branch "main" - # changeRequest target: "main" - # } - # } - # failFast true - # steps{ - # cd examples/nlp/text_classification && \ - # python 
model_parallel_text_classification_evaluation.py \ - # trainer.devices=[0,1] \ - # trainer.accelerator="gpu" \ - # trainer.num_nodes=1 \ - # model.dataset.num_classes=6 \ - # model.test_ds.file_path=/home/TestData/nlp/retail_text_classification/dev.tsv \ - # model.nemo_path=/home/TestData/nlp/mp_2_nemo/retail_text_class_350M.nemo \ - # exp_manager=null - # } - # } - - # stage("L2: Model Parallel Size 2 Megatron Train from .nemo") { - # when { - # anyOf{ - # branch "main" - # changeRequest target: "main" - # } - # } - # failFast true - # steps{ - # cd examples/nlp/token_classification && \ - # python token_classification_train.py \ - # pretrained_model=/home/TestData/nlp/mp_2_nemo/ner_350M.nemo \ - # model.dataset.data_dir=/home/TestData/nlp/ner/ \ - # model.train_ds.batch_size=2 \ - # model.dataset.use_cache=false \ - # trainer.devices=[0,1] \ - # trainer.accelerator="gpu" \ - # +trainer.fast_dev_run=true \ - # model.dataset.class_balancing="weighted_loss" \ - # exp_manager=null - # } - # } - - - # L2: Parallel NLP Examples 2 - L2_Parallel_NLP_Examples2_NER_finetuning_from_pretrained_Test: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Parallel_NLP_Examples2_NER_finetuning_from_pretrained_Test') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/token_classification && \ - python token_classification_train.py \ - pretrained_model=ner_en_bert \ - model.dataset.data_dir=/home/TestData/nlp/ner/ \ - model.train_ds.batch_size=2 \ - model.dataset.use_cache=false \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - model.dataset.class_balancing="weighted_loss" \ - exp_manager.exp_dir=null - - L2_Parallel_NLP_Examples2_Punctuation_and_capitalization_finetuning_from_pretrained_test: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Parallel_NLP_Examples2_Punctuation_and_capitalization_finetuning_from_pretrained_test') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/token_classification && \ - data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}"/ && \ - python punctuation_capitalization_train_evaluate.py \ - pretrained_model=punctuation_en_bert \ - model.train_ds.ds_item="${data_dir}" \ - model.validation_ds.ds_item="${data_dir}" \ - model.test_ds.ds_item="${data_dir}" \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - exp_manager.exp_dir=null; - - rm -rf "${data_dir}" - - L2_Parallel_NLP_Examples2_NER_with_TurkuNLP__bert-base-finnish-cased-v1: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Parallel_NLP_Examples2_NER_with_TurkuNLP__bert-base-finnish-cased-v1') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/token_classification && \ - python token_classification_train.py \ - model.dataset.data_dir=/home/TestData/nlp/token_classification_punctuation/ \ - trainer.devices=1 
\ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name="TurkuNLP/bert-base-finnish-cased-v1" \ - exp_manager.exp_dir=null - - L2_Parallel_NLP_Examples2_Evaluation_script_for_Token_Classification: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Parallel_NLP_Examples2_Evaluation_script_for_Token_Classification') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/token_classification/token_classification_evaluate.py \ - model.dataset.data_dir=/home/TestData/nlp/ner/ \ - model.dataset.use_cache=false \ - pretrained_model=/home/TestData/nlp/pretrained_models/NER_Model_with_BERT_base_uncased.nemo - - L2_Parallel_NLP_Examples2_Evaluation_script_for_Punctuation: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Parallel_NLP_Examples2_Evaluation_script_for_Punctuation') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}"/ && \ - python examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py \ - +do_training=false \ - +do_testing=true \ - model.test_ds.ds_item="${data_dir}" \ - ~model.train_ds \ - ~model.validation_ds \ - +model.test_ds.use_cache=false \ - pretrained_model=/home/TestData/nlp/pretrained_models/Punctuation_Capitalization_with_DistilBERT_base_uncased.nemo; - - rm -rf "${data_dir}" - - # L2: Parallel Pretraining BERT pretraining from Text/Preprocessed L2_Pretraining_BERT_pretraining_from_Text: needs: [cicd-test-container-setup] @@ -1978,313 +1708,6 @@ jobs: model.decoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model AFTER_SCRIPT: | rm -rf examples/nlp/machine_translation/megatron_nmt_results - - L2_Megatron_BART_Perceiver_MIM_Training_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_BART_Perceiver_MIM_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/megatron_mim_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.arch=perceiver \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation="swiglu" \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method="block" \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation="swiglu" \ - model.decoder.masked_softmax_fusion=False 
\ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method="block" \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.micro_batch_size=2 \ - model.global_batch_size=4 \ - model.data.data_impl=text_mmap \ - model.data.data_prefix=[1.0,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src] \ - model.data.splits_string="\"800,100,100\"" \ - model.data.whole_word_masking=False \ - model.tokenizer.library=sentencepiece \ - model.tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ - ++model.hiddens.enc_output_name=z \ - ++model.hiddens.transform.q_z_given_x.cls_name=cond_gaussian \ - ++model.hiddens.transform.q_z_given_x.hidden_size=64 \ - ++model.hiddens.loss.mim.cls_name=a_mim \ - ++model.hiddens.loss.mim.loss_weight=0.5 - # Change val_check_interval to 1 for resume as the len(dataloder) is 1 due to max_steps being the same as that of training and Lightning 2.0 raises an error - # if val_check_interval > len(dataloder: https://github.com/Lightning-AI/lightning/blob/2.0.6/src/lightning/pytorch/loops/fit_loop.py#L259 at the beginning of fit_loop.run() - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/megatron_mim_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.arch=perceiver \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation="swiglu" \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method="block" \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation="swiglu" \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method="block" \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.micro_batch_size=2 \ - model.global_batch_size=4 \ - model.data.data_impl=text_mmap \ - model.data.data_prefix=[1.0,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src] \ - model.data.splits_string="\"800,100,100\"" \ - model.data.whole_word_masking=False \ - model.tokenizer.library=sentencepiece \ - model.tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ - ++model.hiddens.enc_output_name=z \ - ++model.hiddens.transform.q_z_given_x.cls_name=cond_gaussian \ - ++model.hiddens.transform.q_z_given_x.hidden_size=64 \ - ++model.hiddens.loss.mim.cls_name=a_mim \ - ++model.hiddens.loss.mim.loss_weight=0.5 - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/megatron_mim_results - - # stage("L2: NMT Bottleneck Fallback") { - # when { - # anyOf { - # branch "main" - # changeRequest target: "main" - # } - # } - # failFast true - # parallel { - # stage("L2: seq2seq (no bottleneck)") { - # steps { - # cd examples/nlp/machine_translation && \ - # enc_dec_nmt-bottleneck.py \ - # --config-path=conf \ - # --config-name=aayn_bottleneck \ - # do_testing=true \ - # model.model_type=nll \ - # model.encoder.arch=seq2seq \ - # model.encoder.hidden_steps=1 \ - # 
model.encoder.hidden_blocks=1 \ - # model.encoder.hidden_init_method=params \ - # model.encoder.hidden_size=64 \ - # model.encoder.inner_size=128 \ - # model.encoder.num_attention_heads=2 \ - # model.encoder.num_layers=2 \ - # model.decoder.hidden_size=64 \ - # model.decoder.inner_size=128 \ - # model.decoder.num_attention_heads=2 \ - # model.decoder.num_layers=2 \ - # model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src \ - # model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref \ - # model.validation_ds.src_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src] \ - # model.validation_ds.tgt_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref] \ - # model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src \ - # model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref \ - # model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # trainer.devices=1 \ - # trainer.accelerator="gpu" \ - # +trainer.fast_dev_run=true \ - # +trainer.limit_test_batches=2 \ - # exp_manager=null \ - # } - # } - # } - # } - # stage("L2: NMT Bottleneck Architecture") { - # when { - # anyOf { - # branch "main" - # changeRequest target: "main" - # } - # } - # failFast true - # parallel { - # stage("Bridge Encoder (identity)") { - # steps { - # cd examples/nlp/machine_translation && \ - # enc_dec_nmt-bottleneck.py \ - # --config-path=conf \ - # --config-name=aayn_bottleneck \ - # do_testing=true \ - # model.model_type=nll \ - # model.encoder.arch=bridge \ - # model.encoder.hidden_steps=1 \ - # model.encoder.hidden_blocks=1 \ - # model.encoder.hidden_init_method=identity \ - # model.encoder.hidden_size=64 \ - # model.encoder.inner_size=128 \ - # model.encoder.num_attention_heads=2 \ - # model.encoder.num_layers=2 \ - # model.decoder.hidden_size=64 \ - # model.decoder.inner_size=128 \ - # model.decoder.num_attention_heads=2 \ - # model.decoder.num_layers=2 \ - # model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - # model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # trainer.devices=1 \ - # trainer.accelerator="gpu" \ - # +trainer.fast_dev_run=true \ - # +trainer.limit_test_batches=2 \ - # exp_manager=null - # } - # } - # stage("Perceiver Encoder (params)") { - # steps { - # cd examples/nlp/machine_translation && \ - # enc_dec_nmt-bottleneck.py \ - # --config-path=conf \ - # --config-name=aayn_bottleneck \ - # do_testing=true \ - # model.model_type=nll \ - # model.encoder.arch=perceiver \ - # model.encoder.hidden_steps=1 \ - # model.encoder.hidden_blocks=1 \ - # model.encoder.hidden_init_method=params \ - # model.encoder.hidden_size=64 \ - # model.encoder.inner_size=128 \ - # 
model.encoder.num_attention_heads=2 \ - # model.encoder.num_layers=2 \ - # model.decoder.hidden_size=64 \ - # model.decoder.inner_size=128 \ - # model.decoder.num_attention_heads=2 \ - # model.decoder.num_layers=2 \ - # model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - # model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # trainer.devices=1 \ - # trainer.accelerator="gpu" \ - # +trainer.fast_dev_run=true \ - # +trainer.limit_test_batches=2 \ - # exp_manager=null - # } - # } - # } - # } - # stage("L2: NMT Bottleneck LVM") { - # when { - # anyOf { - # branch "main" - # changeRequest target: "main" - # } - # } - # failFast true - # parallel { - # stage("VAE") { - # steps { - # cd examples/nlp/machine_translation && \ - # enc_dec_nmt-bottleneck.py \ - # --config-path=conf \ - # --config-name=aayn_bottleneck \ - # do_testing=true \ - # model.model_type=vae \ - # model.encoder.arch=perceiver \ - # model.encoder.hidden_steps=1 \ - # model.encoder.hidden_blocks=1 \ - # model.encoder.hidden_init_method=params \ - # model.encoder.hidden_size=64 \ - # model.encoder.inner_size=128 \ - # model.encoder.num_attention_heads=2 \ - # model.encoder.num_layers=2 \ - # model.decoder.hidden_size=64 \ - # model.decoder.inner_size=128 \ - # model.decoder.num_attention_heads=2 \ - # model.decoder.num_layers=2 \ - # model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - # model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # trainer.devices=1 \ - # trainer.accelerator="gpu" \ - # +trainer.fast_dev_run=true \ - # +trainer.limit_test_batches=2 \ - # exp_manager=null - # } - # } - # stage("MIM") { - # steps { - # cd examples/nlp/machine_translation && \ - # enc_dec_nmt-bottleneck.py \ - # --config-path=conf \ - # --config-name=aayn_bottleneck \ - # do_testing=true \ - # model.model_type=mim \ - # model.encoder.arch=perceiver \ - # model.encoder.hidden_steps=1 \ - # model.encoder.hidden_blocks=1 \ - # model.encoder.hidden_init_method=params \ - # model.encoder.hidden_size=64 \ - # model.encoder.inner_size=128 \ - # model.encoder.num_attention_heads=2 \ - # model.encoder.num_layers=2 \ - # model.decoder.hidden_size=64 \ - # model.decoder.inner_size=128 \ - # model.decoder.num_attention_heads=2 \ - # model.decoder.num_layers=2 \ - # model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # 
model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - # model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # trainer.devices=1 \ - # trainer.accelerator="gpu" \ - # +trainer.fast_dev_run=true \ - # +trainer.limit_test_batches=2 \ - # exp_manager=null - # } - # } - # } - # } L2_Megatron_Bert_Pretraining_and_Resume_Training_with_Pipeline_Parallelism: needs: [cicd-test-container-setup] @@ -2354,114 +1777,42 @@ jobs: model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - L2_Megatron_Bert_Pretraining_and_Resume_Training: + L2_Megatron_Core_Bert_Pretraining_and_Resume_Training: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_Bert_Pretraining_and_Resume_Training') || needs.cicd-test-container-setup.outputs.all == 'true' + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_Core_Bert_Pretraining_and_Resume_Training') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=bf16 \ - model.megatron_amp_O2=True \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.sequence_parallel=True \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - - python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=20 \ - trainer.precision=bf16 \ - model.megatron_amp_O2=True \ - trainer.gradient_clip_val=1.0 \ - 
exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/bert_pretrain_results - rm -rf examples/nlp/language_modeling/bert_index_mappings - - L2_Megatron_Core_Bert_Pretraining_and_Resume_Training: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_Core_Bert_Pretraining_and_Resume_Training') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - model.mcore_bert=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.sequence_parallel=True \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method="block" \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + model.mcore_bert=True \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.sequence_parallel=True \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + 
model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method="block" \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings python examples/nlp/language_modeling/megatron_bert_pretraining.py \ trainer.devices=2 \ @@ -2496,228 +1847,6 @@ jobs: rm -rf examples/nlp/language_modeling/bert_pretrain_results rm -rf examples/nlp/language_modeling/bert_index_mappings - L2_Megatron_RETRO_Pretraining_and_Resume_Training: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_RETRO_Pretraining_and_Resume_Training') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_retro_pretraining.py \ - trainer.num_nodes=1 \ - trainer.devices=2 \ - trainer.precision=bf16 \ - trainer.accelerator=gpu \ - model.data.data_prefix=["none"] \ - exp_manager.exp_dir=examples/nlp/language_modeling/mcore_retro_results \ - model.mcore_gpt=True \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=1 \ - model.optim.name=distributed_fused_adam \ - model.retro.retro_project_dir=/home/TestData/nlp/megatron_retro/mcore_retro/micro-wiki-core \ - model.data.num_workers=4 \ - model.micro_batch_size=1 \ - model.data.shuffle_documents=False \ - trainer.val_check_interval=30 \ - +trainer.num_sanity_val_steps=0 \ - model.init_method_std=0.023 \ - model.optim.lr=6.0e-4 \ - model.megatron_amp_O2=True \ - model.data.splits_string="\"98,2,0\"" \ - model.data.dataloader_type=cyclic \ - trainer.max_steps=10 - - python examples/nlp/language_modeling/megatron_retro_pretraining.py \ - trainer.num_nodes=1 \ - trainer.devices=2 \ - trainer.precision=bf16 \ - trainer.accelerator=gpu \ - model.data.data_prefix=["none"] \ - exp_manager.exp_dir=examples/nlp/language_modeling/mcore_retro_results \ - model.mcore_gpt=True \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=1 \ - model.optim.name=distributed_fused_adam \ - model.retro.retro_project_dir=/home/TestData/nlp/megatron_retro/mcore_retro/micro-wiki-core \ - model.data.num_workers=4 \ - model.micro_batch_size=1 \ - model.data.shuffle_documents=False \ - trainer.val_check_interval=30 \ - +trainer.num_sanity_val_steps=0 \ - model.init_method_std=0.023 \ - model.optim.lr=6.0e-4 \ - model.megatron_amp_O2=True \ - model.data.splits_string="\"98,2,0\"" \ - model.data.dataloader_type=cyclic \ - trainer.max_steps=20 - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/mcore_retro_results - - L2_Legacy_Megatron_RETRO_Pretraining_and_Resume_Training: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Legacy_Megatron_RETRO_Pretraining_and_Resume_Training') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py \ - trainer.devices=2 \ - trainer.num_nodes=1 \ - trainer.accelerator=gpu \ - trainer.accumulate_grad_batches=1 \ - trainer.limit_val_batches=2 \ - 
exp_manager.resume_if_exists=True \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - trainer.val_check_interval=10 \ - exp_manager.exp_dir=examples/nlp/language_modeling/retro_legacy_results \ - model.data.data_prefix= \ - model.data.knn_index= \ - model.data.retrieval_prefix= \ - model.tensor_model_parallel_size=2 \ - model.micro_batch_size=4 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.chunk_size=32 \ - model.enc_num_layers=2 \ - model.dec_num_layers=2 \ - model.enc_cross_attention=[1] \ - model.dec_cross_attention=[1] \ - +model.data.mock=True - - python examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py \ - trainer.devices=2 \ - trainer.num_nodes=1 \ - trainer.accelerator=gpu \ - trainer.accumulate_grad_batches=1 \ - trainer.limit_val_batches=2 \ - exp_manager.resume_if_exists=True \ - trainer.max_steps=20 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - trainer.val_check_interval=10 \ - exp_manager.exp_dir=examples/nlp/language_modeling/retro_legacy_results \ - model.data.data_prefix= \ - model.data.knn_index= \ - model.data.retrieval_prefix= \ - model.tensor_model_parallel_size=2 \ - model.micro_batch_size=4 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.chunk_size=32 \ - model.enc_num_layers=2 \ - model.dec_num_layers=2 \ - model.enc_cross_attention=[1] \ - model.dec_cross_attention=[1] \ - +model.data.mock=True - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/retro_legacy_results - - # L2_Megatron_RETRO_muTransfer_Pretraining_Performance: - # needs: [cicd-test-container-setup] - # runs-on: self-hosted-azure - # container: - # image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - # options: - # # --user 0:128 - # --device=/dev/nvidia0 - # --gpus all - # --shm-size=8g - # --env TRANSFORMERS_OFFLINE=0 - # --env HYDRA_FULL_ERROR=1 - # --volume /mnt/datadrive/TestData:/home/TestData - # steps: - # - name: Checkout repository - # uses: actions/checkout@v4 - # - run: | - # python examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py \ - # trainer.devices=2 \ - # trainer.num_nodes=1 \ - # trainer.accelerator=gpu \ - # trainer.accumulate_grad_batches=1 \ - # trainer.max_steps=100 \ - # trainer.log_every_n_steps=1 \ - # trainer.precision=16 \ - # trainer.val_check_interval=100 \ - # trainer.limit_val_batches=0 \ - # trainer.gradient_clip_val=1.0 \ - # +trainer.num_sanity_val_steps=0 \ - # exp_manager.exp_dir=examples/nlp/language_modeling/retro_results/ \ - # +exp_manager.version=smalltest \ - # model.data.neighbors=2 \ - # model.megatron_amp_O2=False \ - # model.apply_query_key_layer_scaling=False \ - # model.tensor_model_parallel_size=1 \ - # model.optim.name=muadamw \ - # model.optim.weight_decay=0.1 \ - # model.optim.betas=[0.9,0.95] \ - # model.optim.lr=6e-4 \ - # model.optim.sched.warmup_steps=1000 \ - # model.optim.sched.constant_steps=0 \ - # model.optim.sched.min_lr=6e-5 \ - # model.add_position_embedding=False \ - # model.enc_num_layers=2 \ - # model.dec_num_layers=6 \ - # model.enc_cross_attention=[0] \ - # model.dec_cross_attention=[3,5] \ - # model.hidden_size=96 \ - # 
model.ffn_hidden_size=384 \ - # model.init_method_std=0.023 \ - # model.num_attention_heads=12 \ - # model.max_position_embeddings=1024 \ - # model.encoder_seq_length=1024 \ - # model.tokenizer.library=megatron \ - # model.tokenizer.type=GPT2BPETokenizer \ - # model.tokenizer.merge_file=/home/TestData/nlp/megatron_retro/gpt2-merges.txt \ - # model.tokenizer.vocab_file=/home/TestData/nlp/megatron_retro/gpt2-vocab.json \ - # model.data.data_prefix=[/home/TestData/nlp/megatron_retro/retro_wiki_test_text_document] \ - # model.data.knn_index=[/home/TestData/nlp/megatron_retro/knn2_map_wiki_test.idx] \ - # model.data.retrieval_prefix=/home/TestData/nlp/megatron_retro/retro_wiki_test_text_document \ - # model.data.index_mapping_dir=/home/TestData/nlp/megatron_retro \ - # model.data.num_workers=8 \ - # model.micro_batch_size=8 \ - # model.normalization=rmsnorm \ - # model.transformer_block_type=pre_ln \ - # model.bias_activation_fusion=True \ - # model.bias_dropout_add_fusion=False \ - # model.masked_softmax_fusion=True \ - # model.hidden_dropout=0 \ - # model.attention_dropout=0 \ - # model.fp32_residual_connection=True \ - # model.shape_file=/home/TestData/nlp/megatron_retro/o1_rel_shape_info_tiny.yaml - - # python -c "import pandas as pd - # import pathlib - # from pandas.testing import assert_frame_equal - # from tensorboard.backend.event_processing.event_accumulator import EventAccumulator - # import torch - # if not (torch.cuda.is_available() and "A100" in torch.cuda.get_device_name()): - # import sys - # sys.exit(0) - # event_file = list(pathlib.Path("examples/nlp/language_modeling/retro_results/megatron_retro/smalltest").glob("events.out.tfevents*"))[0] - # ea = EventAccumulator(str(event_file)).Reload() - # vals = [] - # for i in ea.Scalars("reduced_train_loss"): - # vals.append(i.value) - # training_curve = pd.DataFrame({"loss": vals}) - # gt_curve = pd.read_csv("/home/TestData/nlp/megatron_retro/expected_learning_curve.csv") - # assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)" - - # rm -rf examples/nlp/language_modeling/retro_results - # - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - # if: "failure()" - L2_RAG_Pipeline_Indexing: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -2754,22 +1883,6 @@ jobs: generating.inference.temperature=1.0 \ generating.query="Which art schools did I applied to?" 
- L2_BioMegatron_Bert_NER_Task: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_BioMegatron_Bert_NER_Task') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/token_classification/token_classification_train.py \ - exp_manager.exp_dir=examples/nlp/language_modeling/token_classification_results \ - trainer.max_epochs=1 \ - model.dataset.data_dir=/home/TestData/nlp/ner \ - model.language_model.pretrained_model_name=biomegatron345m_biovocab_30k_cased \ - model.tokenizer.tokenizer_name=null - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/token_classification_results - L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -3354,10 +2467,10 @@ jobs: rm -rf examples/nlp/language_modeling/gpt_pretrain_results rm -rf examples/nlp/language_modeling/gpt_index_mappings - L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2: + Optional_L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2') || needs.cicd-test-container-setup.outputs.all == 'true' + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'Optional_L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-gpus-2-h100 SCRIPT: | @@ -3464,6 +2577,7 @@ jobs: AFTER_SCRIPT: | rm -rf examples/nlp/language_modeling/gpt_pretrain_results rm -rf examples/nlp/language_modeling/gpt_index_mappings + IS_OPTIONAL: true OPTIONAL_L2_Megatron_GPT_Auto_Configurator_TP1_PP1_MBS124: needs: [cicd-test-container-setup] @@ -3924,103 +3038,6 @@ jobs: AFTER_SCRIPT: | rm /home/TestData/nlp/megatron_gpt/TP2-Temp/test-increase.nemo - L2_Megatron_T5_Pretraining_and_Resume_Training_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_T5_Pretraining_and_Resume_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=bf16 \ - model.megatron_amp_O2=True \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=swiglu \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=relative \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=fast-swiglu \ - model.decoder.masked_softmax_fusion=False \ - 
model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=pre_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False - - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=bf16 \ - model.megatron_amp_O2=True \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=swiglu \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=relative \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=fast-swiglu \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=pre_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/t5_pretrain_results - rm -rf examples/nlp/language_modeling/t5_index_mappings - L2_Megatron_Core_T5_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -4476,18 +3493,6 @@ jobs: rm -rf examples/nlp/language_modeling/t5_pretrain_results rm -rf examples/nlp/language_modeling/t5_index_mappings - L2_Megatron_T5_Eval: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_T5_Eval') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_t5_eval.py \ - --model_file 
/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \ - --prompt "How do I fix my GPU memory issue? I am seeing out of memory." \ - --tensor_model_parallel_size 1 - L2_Megatron_Core_T5_Eval: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -4500,196 +3505,6 @@ jobs: --prompt "How do I fix my GPU memory issue? I am seeing out of memory." \ --tensor_model_parallel_size 1 - L2_Megatron_BART_Pretraining_and_Resume_Training_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_BART_Pretraining_and_Resume_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation="reglu" \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method="block" \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation="reglu" \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method="block" \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.data.data_prefix="{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}" - - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=5 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation="reglu" \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method="block" \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation="reglu" \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method="block" \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.data.data_prefix="{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], 
validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}" - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/bart_pretrain_results - - L2_Megatron_BART_Pretraining_and_Resume_Training_PP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_BART_Pretraining_and_Resume_Training_PP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ - model.pipeline_model_parallel_size=2 \ - model.pipeline_model_parallel_split_rank=1 \ - model.seq_length=256 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=geglu \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=geglu \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.data.respect_document_boundaries=False \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] - - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.pipeline_model_parallel_size=2 \ - model.pipeline_model_parallel_split_rank=1 \ - model.seq_length=256 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=geglu \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=geglu \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.data.respect_document_boundaries=False \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/bart_pretrain_results - - - L2_Megatron_T5_PEFT_Lora_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: 
contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_T5_PEFT_Lora_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - - python examples/nlp/language_modeling/tuning/megatron_t5_finetuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.max_epochs=9999 \ - trainer.max_steps=3 \ - trainer.val_check_interval=3 \ - ++trainer.limit_val_batches=2 \ - trainer.precision=16 \ - exp_manager.exp_dir=/tmp/nlp_t5_lora_tuning_tp2 \ - model.pipeline_model_parallel_size=1 \ - model.tensor_model_parallel_size=2 \ - model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo \ - model.peft.peft_scheme=lora \ - model.answer_only_loss=True \ - model.micro_batch_size=1 \ - model.global_batch_size=1 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[1.0] \ - model.data.train_ds.num_workers=0 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel] - - python examples/nlp/language_modeling/tuning/megatron_t5_generate.py \ - model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo \ - model.peft.restore_from_path=/tmp/nlp_t5_lora_tuning_tp2/megatron_t5_peft_lora_tuning/checkpoints/megatron_t5_peft_lora_tuning.nemo \ - model.peft.restore_from_ckpt_name=null \ - model.peft.restore_from_hparams_path=null \ - model.tensor_model_parallel_size=2 \ - trainer.devices=2 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \ - model.data.test_ds.names=[quarel4] \ - model.global_batch_size=2 \ - model.micro_batch_size=1 \ - model.data.test_ds.tokens_to_generate=10 \ - model.data.test_ds.write_predictions_to_file=True \ - model.data.test_ds.output_file_path_prefix=/tmp/nlp_t5_lora_tuning_tp2/out \ - inference.greedy=True \ - inference.repetition_penalty=1.0 \ - inference.outfile_path=/tmp/nlp_t5_lora_tuning_tp2/out.jsonl - L2_Megatron_Core_T5_PEFT_Lora_TP2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -5072,7 +3887,7 @@ jobs: rm -rf tests/collections/llm/gpt_pretrain_results rm -rf tests/collections/llm/gpt_index_mappings - OPTIONAL_L2_NeMo_2_GPT_DDP_Param_Parity_check: + L2_NeMo_2_GPT_DDP_Param_Parity_check: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L2_NeMo_2_GPT_DDP_Param_Parity_check') || needs.cicd-test-container-setup.outputs.all == 'true' @@ -5080,7 +3895,7 @@ jobs: RUNNER: self-hosted-azure SCRIPT: | - python tests/lightning/test_ddp_parity_checker.py \ + TORCHDYNAMO_DISABLE=1 python tests/lightning/test_ddp_parity_checker.py \ --vocab-path=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ --merges-path=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ --data-path=/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document @@ -5088,8 +3903,7 @@ jobs: AFTER_SCRIPT: | rm -rf tests/collections/llm/gpt_pretrain_results rm -rf tests/collections/llm/gpt_index_mappings - IS_OPTIONAL: true - + L2_NeMo_2_SSM_Pretraining: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -5170,6 +3984,22 @@ jobs: AFTER_SCRIPT: | rm -rf tests/collections/llm/t5_finetune_results/${{ github.run_id }} + L2_NeMo_2_T5_LoRA: + needs: 
[cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_T5_LoRA') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python tests/collections/llm/megatron_t5_finetuning.py \ + --devices=2 \ + --max-steps=250 \ + --peft=lora \ + --experiment-dir=tests/collections/llm/t5_peft_results/${{ github.run_id }} \ + --checkpoint-path=/home/TestData/nlp/megatron_t5/220m/nemo2.0_t5_220m_150steps + AFTER_SCRIPT: | + rm -rf tests/collections/llm/t5_peft_results/${{ github.run_id }} + L2_NeMo_2_Mixtral_Pretraining: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -5209,8 +4039,6 @@ jobs: --pp_size 1 \ --mbs 1 - AFTER_SCRIPT: | - rm -rf /tmp/nemo2_gpt_finetune/${{ github.run_id }} L2_NeMo_2_GPT_SFT_TP1PP1_MBS2: needs: [cicd-test-container-setup] @@ -5240,8 +4068,6 @@ jobs: --pp_size 1 \ --mbs 2 - AFTER_SCRIPT: | - rm -rf /tmp/nemo2_gpt_finetune/${{ github.run_id }} L2_NeMo_2_GPT_SFT_TP1PP2_MBS2: needs: [cicd-test-container-setup] @@ -5271,8 +4097,6 @@ jobs: --pp_size 2 \ --mbs 2 - AFTER_SCRIPT: | - rm -rf /tmp/nemo2_gpt_finetune/${{ github.run_id }} L2_NeMo_2_GPT_SFT_TP2PP1_MBS2: needs: [cicd-test-container-setup] @@ -5302,8 +4126,35 @@ jobs: --pp_size 1 \ --mbs 2 - AFTER_SCRIPT: | - rm -rf /tmp/nemo2_gpt_finetune/${{ github.run_id }} + + L2_NeMo_2_GPT_SFT_TP1PP1_MBS1_PACKED: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_GPT_SFT_TP1PP1_MBS1_PACKED') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + + python tests/collections/llm/gpt_finetuning.py \ + --restore_path /home/TestData/nemo2_ckpt/llama_68M \ + --devices 2 \ + --max_steps 3 \ + --experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \ + --peft none \ + --tp_size 1 \ + --pp_size 1 \ + --mbs 1 --packed + + python tests/collections/llm/gpt_finetuning.py \ + --restore_path /home/TestData/nemo2_ckpt/llama_68M \ + --devices 2 \ + --max_steps 6 \ + --experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \ + --peft none \ + --tp_size 1 \ + --pp_size 1 \ + --mbs 1 --packed + L2_NeMo_2_GPT_LoRA_TP1PP1_MBS1: needs: [cicd-test-container-setup] @@ -5333,8 +4184,6 @@ jobs: --pp_size 1 \ --mbs 1 - AFTER_SCRIPT: | - rm -rf /tmp/nemo2_gpt_finetune/${{ github.run_id }} L2_NeMo_2_GPT_LoRA_TP1PP1_MBS2: needs: [cicd-test-container-setup] @@ -5364,8 +4213,6 @@ jobs: --pp_size 1 \ --mbs 2 - AFTER_SCRIPT: | - rm -rf /tmp/nemo2_gpt_finetune/${{ github.run_id }} L2_NeMo_2_GPT_LoRA_TP1PP2_MBS2: needs: [cicd-test-container-setup] @@ -5395,8 +4242,6 @@ jobs: --pp_size 2 \ --mbs 2 - AFTER_SCRIPT: | - rm -rf /tmp/nemo2_gpt_finetune/${{ github.run_id }} L2_NeMo_2_GPT_LoRA_TP2PP1_MBS2: needs: [cicd-test-container-setup] @@ -5426,8 +4271,33 @@ jobs: --pp_size 1 \ --mbs 2 - AFTER_SCRIPT: | - rm -rf /tmp/nemo2_gpt_finetune/${{ github.run_id }} + L2_NeMo_2_GPT_LoRA_TP1PP1_MBS1_PACKED: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_GPT_LoRA_TP1PP1_MBS1_PACKED') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + + python tests/collections/llm/gpt_finetuning.py \ + --restore_path 
/home/TestData/nemo2_ckpt/llama_68M \ + --devices 2 \ + --max_steps 3 \ + --experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \ + --peft lora \ + --tp_size 1 \ + --pp_size 1 \ + --mbs 1 --packed + + python tests/collections/llm/gpt_finetuning.py \ + --restore_path /home/TestData/nemo2_ckpt/llama_68M \ + --devices 2 \ + --max_steps 6 \ + --experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \ + --peft lora \ + --tp_size 1 \ + --pp_size 1 \ + --mbs 1 --packed L2_NeMo_2_NeMo_Mcore_Mixtral_bitexact: needs: [cicd-test-container-setup] @@ -5453,7 +4323,7 @@ jobs: - L0_Unit_Tests_GPU_TTS #- OPTIONAL_L0_Unit_Tests_GPU_Core - L0_Unit_Tests_GPU_Hydra - #- OPTIONAL_L0_Unit_Tests_GPU_Lightning + - L0_Unit_Tests_GPU_Lightning - L0_Unit_Tests_GPU_Others - L0_Unit_Tests_CPU_ASR @@ -5499,14 +4369,6 @@ jobs: - L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Ru_QN_with_mp3 - L2_G2P_Models_G2P_Conformer_training_evaluation_and_inference - L2_G2P_Models_HeteronymClassificationModel_training_evaluation_and_inference - - L2_Duplex_Text_Normalization_with_Tarred_dataset - - L2_Intent_and_Slot_Classification_Tasks_Intent_and_Slot_Classification - - L2_Intent_and_Slot_Classification_Tasks_Multi-Label_Intent_and_Slot_Classification - - L2_Parallel_NLP_Examples2_NER_finetuning_from_pretrained_Test - - L2_Parallel_NLP_Examples2_Punctuation_and_capitalization_finetuning_from_pretrained_test - - L2_Parallel_NLP_Examples2_NER_with_TurkuNLP__bert-base-finnish-cased-v1 - - L2_Parallel_NLP_Examples2_Evaluation_script_for_Token_Classification - - L2_Parallel_NLP_Examples2_Evaluation_script_for_Punctuation - L2_Pretraining_BERT_pretraining_from_Text - L2_Pretraining_BERT_from_Preprocessed - L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Post-LN @@ -5517,15 +4379,10 @@ jobs: - L2_NMT_Tarred_Dataset_Creation_Auto_Tarred_Dataset_Creation - L2_NMT_Tarred_Dataset_Creation_Script_Tarred_Dataset_Creation - L2_Megatron_NMT_Training_TP2 - - L2_Megatron_BART_Perceiver_MIM_Training_TP2 - L2_Megatron_Bert_Pretraining_and_Resume_Training_with_Pipeline_Parallelism - - L2_Megatron_Bert_Pretraining_and_Resume_Training - L2_Megatron_Core_Bert_Pretraining_and_Resume_Training - - L2_Legacy_Megatron_RETRO_Pretraining_and_Resume_Training - - L2_Megatron_RETRO_Pretraining_and_Resume_Training - L2_RAG_Pipeline_Indexing - L2_RAG_Pipeline_Generating - - L2_BioMegatron_Bert_NER_Task - L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2 - L2_Megatron_GPT_Skip_Train - L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2 @@ -5533,7 +4390,7 @@ jobs: - L2_Megatron_GPT_with_Drop_Optimizer_States_TP2 - L2_Megatron_GPT_with_ALiBi_Pretraining_and_Resume_Training_TP2 - L2_Megatron_GPT_with_KERPLE_Pretraining_and_Resume_Training_TP2 - - L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2 + # - Optional_L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2 #- OPTIONAL_L2_Megatron_GPT_Auto_Configurator_TP1_PP1_MBS124 - L2_Megatron_GPT_Finetuning_PP2 - L2_Megatron_GPT_Finetuning_StarCoder_PP1 @@ -5546,18 +4403,13 @@ jobs: - L2_Megatron_GPT_SFT_Eval_inference_seq_len_greaterThan_training_seq_len - L2_Megatron_Change_Partitions_Reduce_TP_Num_Partitions_-2_to_1-_and_PP_Num_Partitions_-1_to_2 - L2_Megatron_Change_Partitions_Increase_TP_Num_Partitions_-2_to_4-_and_PP_Num_Partitions_-1_to_2 - - L2_Megatron_T5_Pretraining_and_Resume_Training_TP2 - L2_Megatron_Core_T5_Pretraining_and_Resume_Training_TP2 - L2_Megatron_T5_with_ALiBi_Pretraining_and_Resume_Training_TP2 - L2_Megatron_T5_with_KERPLE_Pretraining_and_Resume_Training_TP2 - 
L2_Megatron_T5_Pretraining_and_Resume_Training_PP2 - L2_Megatron_T5_w_Mixture_of_Expert_Pretraining - L2_Megatron_UL2_Pretraining_and_Resume_Training_TP2 - - L2_Megatron_T5_Eval - L2_Megatron_Core_T5_Eval - - L2_Megatron_BART_Pretraining_and_Resume_Training_TP2 - - L2_Megatron_BART_Pretraining_and_Resume_Training_PP2 - - L2_Megatron_T5_PEFT_Lora_TP2 - L2_Megatron_Core_T5_PEFT_Lora_TP2 - L2_Megatron_Mock_Data_Generation_MockGPTDataset - L2_Megatron_Mock_Data_Generation_MockT5Dataset @@ -5570,20 +4422,23 @@ jobs: - Speech_Checkpoints_tests - L2_Stable_Diffusion_Training - L2_NeMo_2_GPT_Pretraining_no_transformer_engine - #- OPTIONAL_L2_NeMo_2_GPT_DDP_Param_Parity_check + - L2_NeMo_2_GPT_DDP_Param_Parity_check - L2_NeMo_2_HF_MODEL_IMPORT - L2_NeMo_2_SSM_Pretraining - L2_NeMo_2_SSM_Finetuning - L2_NeMo_2_T5_Pretraining - L2_NeMo_2_T5_Finetuning + - L2_NeMo_2_T5_LoRA - L2_NeMo_2_GPT_SFT_TP1PP1_MBS1 - L2_NeMo_2_GPT_SFT_TP1PP1_MBS2 - L2_NeMo_2_GPT_SFT_TP1PP2_MBS2 - L2_NeMo_2_GPT_SFT_TP2PP1_MBS2 + - L2_NeMo_2_GPT_SFT_TP1PP1_MBS1_PACKED - L2_NeMo_2_GPT_LoRA_TP1PP1_MBS1 - L2_NeMo_2_GPT_LoRA_TP1PP1_MBS2 - L2_NeMo_2_GPT_LoRA_TP1PP2_MBS2 - L2_NeMo_2_GPT_LoRA_TP2PP1_MBS2 + - L2_NeMo_2_GPT_LoRA_TP1PP1_MBS1_PACKED - L2_NeMo_2_Mixtral_Pretraining - L2_PTQ_Llama2_INT8_SQ - L2_PTQ_Llama2_FP8 @@ -5731,4 +4586,4 @@ jobs: - name: "Pipeline not successful, set exit code to 1" if: ${{ always() && steps.pipeline-conclusion.outputs.SUCCESS == 'false' }} - run: exit 1 \ No newline at end of file + run: exit 1 diff --git a/Dockerfile.ci b/Dockerfile.ci index dbcd92cfcb65..6ef99a35ae82 100644 --- a/Dockerfile.ci +++ b/Dockerfile.ci @@ -53,7 +53,7 @@ RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMO_RUN_T # Install NeMo requirements ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea ARG MODELOPT_VERSION=0.17.0 -ARG MCORE_TAG=0d89fc4c0d4394f915fffff11212d6957652337f +ARG MCORE_TAG=425cdd48d5ef5d360d8033288ff7cb0d378f535f ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c RUN \ diff --git a/docs/source/asr/ssl/api.rst b/docs/source/asr/ssl/api.rst index 77614e9ad5e3..16b21bdfb12e 100644 --- a/docs/source/asr/ssl/api.rst +++ b/docs/source/asr/ssl/api.rst @@ -4,6 +4,10 @@ NeMo SSL collection API Model Classes ------------- +.. autoclass:: nemo.collections.asr.models.EncDecDenoiseMaskedTokenPredModel + :show-inheritance: + :members: + .. autoclass:: nemo.collections.asr.models.SpeechEncDecSelfSupervisedModel :show-inheritance: :members: diff --git a/docs/source/asr/ssl/intro.rst b/docs/source/asr/ssl/intro.rst index 76a3a75dcf37..89002711be97 100644 --- a/docs/source/asr/ssl/intro.rst +++ b/docs/source/asr/ssl/intro.rst @@ -19,6 +19,10 @@ encoder module of neural ASR models. Here too, majority of SSL effort is focused While it is common that AM is the focus of SSL in ASR, it can also be utilized in improving other parts of ASR models (e.g., predictor module in transducer based ASR models). +In NeMo, we provide two types of SSL models, `Wav2Vec-BERT `_ and `NEST `_. +The training script for them can be found in `https://github.com/NVIDIA/NeMo/tree/main/examples/asr/speech_pretraining`. + + The full documentation tree is as follows: .. 
toctree:: diff --git a/docs/source/performance/performance_long_sequence.md b/docs/source/performance/performance_long_sequence.md new file mode 100644 index 000000000000..d9f26dcf0d61 --- /dev/null +++ b/docs/source/performance/performance_long_sequence.md @@ -0,0 +1,134 @@ +# Long Sequence Performance + +## LLAMA2-7B (FP8) + +- The table below shows the pre-training performance of the LLAMA2-7B with CP (context parallelism) and compares it against the results without CP at various input sequence lengths. The detailed model-parallel configurations and the achieved performance are shown in the training results with CP. In non-CP training runs, we use the most performant model- and data-parallel configurations without CP given the memory capacity constraint of the H100 GPU system. + + - Container: [NeMo24.03.01.framework](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo/tags) + - System: DGX-H100 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| SeqLen (K) | # of GPUs | Without CP: TFLOPS / GPU | With CP: TP | With CP: PP | With CP: DP | With CP: CP | With CP: TFLOPS / GPU | Speedup with CP / without CP |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- |
+| 4 | 4 | 768 | 1 | 1 | 4 | 1 | 768 | 1.00 |
+| 8 | 8 | 730 | 1 | 2 | 4 | 1 | 730 | 1.00 |
+| 16 | 16 | 660 | 2 | 1 | 8 | 1 | 660 | 1.00 |
+| 32 | 32 | 595 | 2 | 1 | 8 | 2 | 610 | 1.03 |
+| 64 | 64 | 534 | 4 | 1 | 8 | 2 | 574 | 1.07 |
+| 128 | 128 | 424 | 4 | 1 | 8 | 4 | 555 | 1.31 |
+| 256 | 256 | 392 | 4 | 1 | 8 | 8 | 549 | 1.40 |
+| 512 | 512 | 104 | 8 | 1 | 4 | 16 | 549 | 5.28 |
+| 1024 | 1024 | 26.5 | 8 | 1 | 4 | 32 | 536 | 20.23 |
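+
+The speedup column is the ratio of per-GPU throughput with CP to that of the best configuration without CP at the same sequence length; for example, at a 256K sequence length it is 549 / 392 ≈ 1.40.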
+ + +### Speedup of LLAMA2 7B training with CP over without CP +![cp_speedup_figure](https://github.com/NVIDIA/NeMo/releases/download/r2.0.0rc1/tutorial_cp_speedup_figure.png) \ No newline at end of file diff --git a/examples/asr/conf/ssl/nest/nest_fast-conformer.yaml b/examples/asr/conf/ssl/nest/nest_fast-conformer.yaml index 054c66830d65..2124e6e6f7f1 100644 --- a/examples/asr/conf/ssl/nest/nest_fast-conformer.yaml +++ b/examples/asr/conf/ssl/nest/nest_fast-conformer.yaml @@ -28,8 +28,8 @@ model: mask_position: pre_conv # position to apply masking, before or after conv subsampling, choices in ['pre_conv', 'post_conv'] train_ds: - manifest_filepath: ??? - noise_manifest: null + manifest_filepath: ??? # path to training manifest, can be a string or list of strings + noise_manifest: ??? # the manifest for noise data, can be a string or list of strings sample_rate: ${model.sample_rate} batch_size: 8 # you may increase batch_size if your memory allows shuffle: true diff --git a/examples/asr/run_helper.py b/examples/asr/run_helper.py index 6e82f1f35ab1..2e6e810e51b1 100644 --- a/examples/asr/run_helper.py +++ b/examples/asr/run_helper.py @@ -82,6 +82,7 @@ def check_missing_values(cfg): check_missing_values(result) return result + def check_config_mount_paths(script_config, cluster_config): # recursively walk all values of the script_config, checking if its a path-like string and if so, check if the path is a mounted path # if it is not, raise an error @@ -154,7 +155,9 @@ def main(cluster_cfg): if 'exp_manager' in merged_config and 'name' in merged_config['exp_manager']: exp_name = merged_config['exp_manager']['name'] else: - raise ValueError("Experiment name not provided in the run config file (`exp_name`)) or the cluster config (inside exp_manager.name)") + raise ValueError( + "Experiment name not provided in the run config file (`exp_name`)) or the cluster config (inside exp_manager.name)" + ) with run.Experiment(exp_name) as exp: cmd = get_execution_script(cluster_script_path, "config.yaml") @@ -166,7 +169,8 @@ def main(cluster_cfg): num_nodes = cluster_cfg.get('num_nodes', merged_config['trainer'].get('num_nodes', 1)) cluster_cfg = OmegaConf.to_object(cluster_cfg) - run_utils.add_task(exp, + run_utils.add_task( + exp, cmd=cmd, task_name=job_name, cluster_config=cluster_cfg, diff --git a/examples/asr/speech_pretraining/README.md b/examples/asr/speech_pretraining/README.md index aeafcf69292b..777ea0602789 100644 --- a/examples/asr/speech_pretraining/README.md +++ b/examples/asr/speech_pretraining/README.md @@ -5,3 +5,11 @@ This directory contains example scripts to self-supervised speech models. 
There are two main types of supported self-supervised learning methods: - [Wav2vec-BERT](https://arxiv.org/abs/2108.06209): `speech_pre_training.py` - [NEST](https://arxiv.org/abs/2408.13106): `masked_token_pred_pretrain.py` + - For downstream tasks that use NEST as multi-layer feature extractor, please refer to `./downstream/speech_classification_mfa_train.py` + + +For their corresponding usage, please refer to the example yaml config: +- Wav2vec-BERT: `examples/asr/conf/ssl/fastconformer/fast-conformer.yaml` +- NEST: `examples/asr/conf/ssl/nest/nest_fast-conformer.yaml` + + diff --git a/examples/asr/speech_pretraining/masked_token_pred_pretrain.py b/examples/asr/speech_pretraining/masked_token_pred_pretrain.py index 83729dfd9d67..1ea88d696643 100644 --- a/examples/asr/speech_pretraining/masked_token_pred_pretrain.py +++ b/examples/asr/speech_pretraining/masked_token_pred_pretrain.py @@ -28,7 +28,9 @@ python pretrain_masked_token_pred.py \ # (Optional: --config-path= --config-name=) \ model.train_ds.manifest_filepath= \ + model.train_ds.noise_manifest= \ model.validation_ds.manifest_filepath= \ + model.validation_ds.noise_manifest= \ trainer.devices=-1 \ trainer.accelerator="gpu" \ strategy="ddp" \ diff --git a/examples/audio/process_audio.py b/examples/audio/process_audio.py index e28fb4e69627..ec88bda34954 100644 --- a/examples/audio/process_audio.py +++ b/examples/audio/process_audio.py @@ -159,8 +159,8 @@ def main(cfg: ProcessConfig) -> ProcessConfig: audio_to_audio_model.set_trainer(trainer) audio_to_audio_model = audio_to_audio_model.eval() - # override sampler - if cfg.sampler is not None: + # override sampler if necessary + if cfg.sampler: logging.info('Overriding sampler with %s', cfg.sampler) if hasattr(audio_to_audio_model, 'sampler'): diff --git a/examples/llm/peft/hf.py b/examples/llm/peft/hf.py new file mode 100644 index 000000000000..c6dbbf90bf29 --- /dev/null +++ b/examples/llm/peft/hf.py @@ -0,0 +1,105 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import fiddle as fdl +from pytorch_lightning.loggers import WandbLogger +from nemo import lightning as nl +from nemo.collections import llm + + +def mk_hf_dataset(tokenizer): + EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN + + def formatting_prompts_func(examples): + alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. 
+ + ### Instruction: + {} + + ### Input: + {} + + ### Response: + {}""" + instruction = examples["context"] + input = examples["question"] + output = examples["answers"]['text'] + if isinstance(output, list): + output = output[0] + text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN + ans = tokenizer(text) + tokens = ans['input_ids'] + return { + 'tokens': tokens, + 'labels': tokens[1:] + [tokens[-1]], + } + + from datasets import load_dataset + + dataset = load_dataset("rajpurkar/squad", split="train") + dataset = dataset.map(formatting_prompts_func, batched=False, batch_size=2) + return dataset + + +if __name__ == '__main__': + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument('--model', default='meta-llama/Llama-3.2-1B') + parser.add_argument('--strategy', type=str, default='auto', choices=['auto', 'ddp', 'fsdp']) + parser.add_argument('--devices', default=1) + parser.add_argument('--accelerator', default='gpu', choices=['gpu']) + parser.add_argument('--max-steps', type=int, default=100) + parser.add_argument('--wandb-project', type=str, default=None) + args = parser.parse_args() + + wandb = None + if args.wandb_project is not None: + model = '_'.join(args.model.split('/')[-2:]) + wandb = WandbLogger( + project=args.wandb_project, + name=f'{model}_dev{args.devices}_strat_{args.strategy}', + ) + grad_clip = 0.5 + if args.strategy == 'fsdp': + # See: https://github.com/Lightning-AI/pytorch-lightning/blob/8ad3e29816a63d8ce5c00ac104b14729a4176f4f/src/lightning/pytorch/plugins/precision/fsdp.py#L81 + grad_clip = None + use_dist_samp = False + tokenizer = llm.HfAutoModelForCausalLM.configure_tokenizer(args.model) + + llm.api.finetune( + model=llm.HfAutoModelForCausalLM(args.model), + data=llm.HfDatasetDataModule( + mk_hf_dataset(tokenizer.tokenizer), pad_token_id=tokenizer.tokenizer.eos_token_id + ), + trainer=nl.Trainer( + devices=args.devices, + max_steps=args.max_steps, + accelerator=args.accelerator, + strategy=args.strategy, + log_every_n_steps=1, + limit_val_batches=0.0, + num_sanity_val_steps=0, + accumulate_grad_batches=10, + gradient_clip_val=grad_clip, + use_distributed_sampler=use_dist_samp, + logger=wandb, + ), + optim=fdl.build(llm.adam.pytorch_adam_with_flat_lr(max_lr=1e-5, clip_grad=0.5)), + log=None, + peft=llm.peft.LoRA( + target_modules=['*_proj'], + dim=32, + ), + ) diff --git a/examples/llm/pretrain/README.md b/examples/llm/pretrain/README.md index c9bb7331f972..61f64d7792bb 100644 --- a/examples/llm/pretrain/README.md +++ b/examples/llm/pretrain/README.md @@ -3,7 +3,7 @@ ### Listing the available recipes for pretraining ```bash -nemorun llm pretrain --help +nemo llm pretrain --help ``` ![recipe-listing](https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/list-recipes.png) @@ -12,7 +12,7 @@ nemorun llm pretrain --help ### Run pre-training with a default recipe ```bash -nemorun llm pretrain --factory llama3_8b +nemo llm pretrain --factory llama3_8b ``` ![llama3_70b](https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/llama3_70b.png) @@ -20,7 +20,7 @@ nemorun llm pretrain --factory llama3_8b We can also call the factory function with custom parameters: ```bash -nemorun llm pretrain --factory "llama3_70b(num_nodes=128)" +nemo llm pretrain --factory "llama3_70b(num_nodes=128)" ``` ![llama3_70b-128-nodes](https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/llama3_70b_128nodes.png) @@ -29,13 +29,13 @@ nemorun llm pretrain --factory "llama3_70b(num_nodes=128)" The CLI allows you to overwrite any parameter. 
For example, to run the recipe with 2000 steps: ```bash -nemorun llm pretrain --factory llama3_70b trainer.max_steps=2000 +nemo llm pretrain --factory llama3_70b trainer.max_steps=2000 ``` The syntax of the CLI is the same as the Python code. Which is great but in some cases you might want to inspect & edit a recipe interactively. An easy way to do this using the cli is the use the `--repl` flag. ```bash -nemorun llm pretrain --factory llama3_70b --repl +nemo llm pretrain --factory llama3_70b --repl ``` ![repl](https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/repl.gif) diff --git a/examples/llm/sft/hf.py b/examples/llm/sft/hf.py new file mode 100644 index 000000000000..b7e12d8fb2de --- /dev/null +++ b/examples/llm/sft/hf.py @@ -0,0 +1,91 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import fiddle as fdl +import pytorch_lightning as pl +from pytorch_lightning.loggers import WandbLogger +from torch.utils.data import DataLoader + +from nemo import lightning as nl +from nemo.collections import llm + + +class SquadDataModuleWithPthDataloader(llm.SquadDataModule): + def _create_dataloader(self, dataset, **kwargs) -> DataLoader: + return DataLoader( + dataset, + num_workers=self.num_workers, + pin_memory=self.pin_memory, + persistent_workers=self.persistent_workers, + collate_fn=dataset.collate_fn, + batch_size=self.micro_batch_size, + **kwargs, + ) + + +def squad(tokenizer) -> pl.LightningDataModule: + return SquadDataModuleWithPthDataloader( + tokenizer=tokenizer, + seq_length=2048, + micro_batch_size=2, + global_batch_size=128, # assert gbs == mbs * accumulate_grad_batches + num_workers=0, + sanity_check_dist_workers=False, + ) + + +if __name__ == '__main__': + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument('--model', default='meta-llama/Llama-3.2-1B') + parser.add_argument('--strategy', type=str, default='auto', choices=['auto', 'ddp', 'fsdp']) + parser.add_argument('--devices', default=1) + parser.add_argument('--accelerator', default='gpu', choices=['gpu']) + parser.add_argument('--max-steps', type=int, default=100) + parser.add_argument('--wandb-project', type=str, default=None) + args = parser.parse_args() + + wandb = None + if args.wandb_project is not None: + model = '_'.join(args.model.split('/')[-2:]) + wandb = WandbLogger( + project=args.wandb_project, + name=f'{model}_dev{args.devices}_strat_{args.strategy}', + ) + grad_clip = 0.5 + if args.strategy == 'fsdp': + # See: https://github.com/Lightning-AI/pytorch-lightning/blob/8ad3e29816a63d8ce5c00ac104b14729a4176f4f/src/lightning/pytorch/plugins/precision/fsdp.py#L81 + grad_clip = None + use_dist_samp = False + + llm.api.finetune( + model=llm.HfAutoModelForCausalLM(args.model), + data=squad(llm.HfAutoModelForCausalLM.configure_tokenizer(args.model)), + trainer=nl.Trainer( + devices=args.devices, + max_steps=args.max_steps, + accelerator=args.accelerator, + strategy=args.strategy, + log_every_n_steps=1, + limit_val_batches=0.0, + 
num_sanity_val_steps=0, + accumulate_grad_batches=10, + gradient_clip_val=grad_clip, + use_distributed_sampler=use_dist_samp, + logger=wandb, + ), + optim=fdl.build(llm.adam.pytorch_adam_with_flat_lr(max_lr=1e-5, clip_grad=0.5)), + log=None, + ) diff --git a/examples/nlp/duplex_text_normalization/README.md b/examples/nlp/duplex_text_normalization/README.md new file mode 100644 index 000000000000..808ed2856fb2 --- /dev/null +++ b/examples/nlp/duplex_text_normalization/README.md @@ -0,0 +1,2 @@ +> [!IMPORTANT] +> This section is no longer supported in NeMo and is scheduled for removal in the 23.11 release. diff --git a/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py b/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py index 40ba35f819ef..c81119489582 100644 --- a/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py +++ b/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py @@ -21,7 +21,9 @@ --checkpoint_name \ --nemo_file_path \ --tensor_model_parallel_size \ - --pipeline_model_parallel_size + --pipeline_model_parallel_size \ + --gpus_per_node \ + --model_type """ import dis @@ -100,7 +102,7 @@ def get_args(): default="gpt", choices=["gpt", "sft", "t5", "bert", "nmt", "bart", "retro"], ) - parser.add_argument("--local_rank", type=int, required=False, default=os.getenv('LOCAL_RANK', -1)) + parser.add_argument("--local-rank", type=int, required=False, default=os.getenv('LOCAL_RANK', -1)) parser.add_argument("--bcp", action="store_true", help="Whether on BCP platform") parser.add_argument( "--precision", @@ -134,7 +136,7 @@ def convert(local_rank, rank, world_size, args): 'accelerator': 'gpu', 'precision': args.precision, }, - 'model': {'native_amp_init_scale': 2 ** 32, 'native_amp_growth_interval': 1000, 'hysteresis': 2}, + 'model': {'native_amp_init_scale': 2**32, 'native_amp_growth_interval': 1000, 'hysteresis': 2}, } cfg = OmegaConf.create(cfg) @@ -142,7 +144,7 @@ def convert(local_rank, rank, world_size, args): # If FP16 create a GradScaler as the build_model_parallel_config of MegatronBaseModel expects it if cfg.trainer.precision == '16-mixed': scaler = GradScaler( - init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), + init_scale=cfg.model.get('native_amp_init_scale', 2**32), growth_interval=cfg.model.get('native_amp_growth_interval', 1000), hysteresis=cfg.model.get('hysteresis', 2), ) diff --git a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml index 06551f46486c..79a07ce4e2c0 100644 --- a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml +++ b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml @@ -126,6 +126,13 @@ model: tunable_base_param_names: ["self_attention", "word_embeddings"] # TODO: regex support @adithyre data: + chat: False # whether use chatbot data or not + chat_prompt_tokens: # special tokens for the chat prompts, a dictionary of {token_type: token}. note that some tokenizer may combine the characters at the junction between {end_of_turn}{turn_start}. e.g. '', the '><' sometimes is merged to be a single token. 
This is not supported, try to avoid + system_turn_start: "\x00" + turn_start: "\x11" + label_start: "\x12" + end_of_turn: "\x0A" # \0x0A is '\n' + end_of_name: "\x0A" # \0x0A is '\n' train_ds: # Example of how to specify paths to multiple datasets # file_names: diff --git a/examples/nlp/token_classification/README.md b/examples/nlp/token_classification/README.md new file mode 100644 index 000000000000..808ed2856fb2 --- /dev/null +++ b/examples/nlp/token_classification/README.md @@ -0,0 +1,2 @@ +> [!IMPORTANT] +> This section is no longer supported in NeMo and is scheduled for removal in the 23.11 release. diff --git a/examples/vlm/llava_next_energon_training.py b/examples/vlm/llava_next_energon_training.py new file mode 100644 index 000000000000..fcc598886615 --- /dev/null +++ b/examples/vlm/llava_next_energon_training.py @@ -0,0 +1,162 @@ +import argparse +import os +import sys + +import requests +import torch +from megatron.core.optimizer import OptimizerConfig +from megatron.energon import VQASample +from PIL import Image +from transformers import AutoProcessor + +from nemo import lightning as nl +from nemo.collections import llm, vlm +from nemo.collections.multimodal.data.energon import SimpleMultiModalDataModule +from nemo.collections.multimodal.data.energon.config import MultiModalSampleConfig +from nemo.collections.vlm import ImageDataConfig, Llava1_5Config7B, LlavaModel, LlavaNextTaskEncoder +from nemo.lightning.pytorch.optim import CosineAnnealingScheduler +from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule +from nemo.utils import logging +from nemo.utils.exp_manager import TimingCallback + + +def main(args): + # Global and micro batch sizes + gbs = 32 + mbs = 4 + seq_length = 256 + + processor = AutoProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf") + data_path = args.data_path + # data_path = '/lustre/fsw/coreai_dlalgo_genai/datasets/energon_datasets/LLaVA-Pretrain-LCS-558K' + image_processor = processor.image_processor + tokenizer = processor.tokenizer + + multimodal_sample_config = MultiModalSampleConfig() + + task_encoder = LlavaNextTaskEncoder( + tokenizer=tokenizer, image_processor=image_processor, multimodal_sample_config=multimodal_sample_config + ) + data = SimpleMultiModalDataModule( + path=data_path, + tokenizer=tokenizer, + image_processor=image_processor, + num_workers=8, + micro_batch_size=mbs, + global_batch_size=gbs, + multimodal_sample_config=multimodal_sample_config, + task_encoder=task_encoder, + ) + + # Training strategy setup + strategy = nl.MegatronStrategy( + tensor_model_parallel_size=args.tp_size, + pipeline_model_parallel_size=args.pp_size, + pipeline_dtype=torch.bfloat16, + ) + + # Checkpoint callback setup + checkpoint_callback = nl.ModelCheckpoint( + save_last=True, + monitor="reduced_train_loss", + save_top_k=2, + every_n_train_steps=500, + dirpath=args.log_dir, + ) + + trainer = nl.Trainer( + devices=args.devices, + max_steps=10000, + accelerator="gpu", + strategy=strategy, + plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), + callbacks=[checkpoint_callback, TimingCallback()], + val_check_interval=100, + limit_val_batches=gbs, + log_every_n_steps=1, + num_sanity_val_steps=0, + ) + + language_transformer_config = llm.Llama2Config7B() + vision_transformer_config = vlm.HFCLIPVisionConfig( + pretrained_model_name_or_path="openai/clip-vit-large-patch14-336" + ) + vision_projection_config = vlm.MultimodalProjectorConfig( + projector_type=args.projector_type, + input_size=1024, + hidden_size=4096, + 
ffn_hidden_size=4096, + ) + + # NEVA model configuration + neva_config = vlm.NevaConfig( + language_transformer_config=language_transformer_config, + vision_transformer_config=vision_transformer_config, + vision_projection_config=vision_projection_config, + language_model_from_pretrained=args.language_model_path, + freeze_language_model=True, + ) + + model = vlm.NevaModel(neva_config, tokenizer=data.tokenizer) + + # Logger setup + from pytorch_lightning.loggers import WandbLogger + + nemo_logger = nl.NeMoLogger( + log_dir=args.log_dir, + name=args.name, + wandb=WandbLogger(project=args.wandb_project, name=args.name) if args.wandb_project is not None else None, + ) + nemo_logger.setup( + trainer, + resume_if_exists=True, + ) + + # Auto resume setup + resume = nl.AutoResume( + resume_if_exists=True, + resume_ignore_no_checkpoint=True, + resume_from_directory=args.log_dir, + restore_config=nl.RestoreConfig(path=args.restore_path) if args.restore_path is not None else None, + ) + resume.setup(trainer, model) + + # Optimizer and scheduler setup + opt_config = OptimizerConfig( + optimizer='adam', + lr=0.001, + adam_beta1=0.9, + adam_beta2=0.95, + use_distributed_optimizer=False, + bf16=True, + ) + sched = CosineAnnealingScheduler( + max_steps=trainer.max_steps, + warmup_steps=70, + constant_steps=0, + min_lr=2.0e-05, + ) + opt = MegatronOptimizerModule(opt_config, sched) + opt.connect(model) + + trainer.fit(model, data) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="NEVA Model Training Script") + + # Argument parsing + parser.add_argument("--data_path", type=str, required=True, help="Path to the dataset JSON file") + parser.add_argument("--log_dir", type=str, required=True, help="Directory for logging and checkpoints") + parser.add_argument("--devices", type=int, required=False, default=1) + parser.add_argument("--tp_size", type=int, required=False, default=1) + parser.add_argument("--pp_size", type=int, required=False, default=1) + parser.add_argument( + "--language_model_path", type=str, required=False, default=None, help="Path to the pretrained language model" + ) + parser.add_argument("--name", type=str, required=False, default="llava_next_test") + parser.add_argument("--wandb_project", type=str, required=False, default=None) + parser.add_argument("--projector_type", type=str, required=False, default="mlp2x_gelu") + + args = parser.parse_args() + main(args) diff --git a/examples/vlm/llava_next_finetune_energon.py b/examples/vlm/llava_next_finetune_energon.py new file mode 100644 index 000000000000..17a53df6ec92 --- /dev/null +++ b/examples/vlm/llava_next_finetune_energon.py @@ -0,0 +1,201 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse + +import torch +from megatron.core.optimizer import OptimizerConfig +from transformers import AutoProcessor + +from nemo import lightning as nl +from nemo.collections import llm, vlm +from nemo.collections.multimodal.data.energon import SimpleMultiModalDataModule +from nemo.collections.multimodal.data.energon.config import MultiModalSampleConfig +from nemo.collections.vlm import ImageDataConfig, LlavaNextTaskEncoder +from nemo.lightning.pytorch.optim import CosineAnnealingScheduler +from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule + + +def main(args): + # Global and micro batch sizes + gbs = 128 + mbs = 4 + seq_length = 4096 + + # Data configuration + # data_config = ImageDataConfig( + # image_folder=args.image_folder, + # conv_template="v1", + # ) + + # Data module setup + # data = vlm.NevaLazyDataModule( + # paths=args.data_path, + # data_config=data_config, + # seq_length=seq_length, + # global_batch_size=gbs, + # micro_batch_size=mbs, + # tokenizer=None, + # image_processor=None, + # num_workers=8, + # ) + + # energon data config + processor = AutoProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf") + data_path = args.data_path + image_processor = processor.image_processor + tokenizer = processor.tokenizer + + multimodal_sample_config = MultiModalSampleConfig() + + task_encoder = LlavaNextTaskEncoder( + tokenizer=tokenizer, image_processor=image_processor, multimodal_sample_config=multimodal_sample_config + ) + data = SimpleMultiModalDataModule( + path=data_path, + tokenizer=tokenizer, + image_processor=image_processor, + num_workers=8, + micro_batch_size=mbs, + global_batch_size=gbs, + multimodal_sample_config=multimodal_sample_config, + task_encoder=task_encoder, + ) + + # Transformer configurations + language_transformer_config = llm.Llama2Config7B() + vision_transformer_config = vlm.HFCLIPVisionConfig( + pretrained_model_name_or_path="openai/clip-vit-large-patch14-336" + ) + vision_projection_config = vlm.MultimodalProjectorConfig( + projector_type=args.projector_type, + input_size=1024, + hidden_size=4096, + ffn_hidden_size=4096, + ) + + # NEVA model configuration + neva_config = vlm.NevaConfig( + language_transformer_config=language_transformer_config, + vision_transformer_config=vision_transformer_config, + vision_projection_config=vision_projection_config, + language_model_from_pretrained=args.language_model_path, + freeze_language_model=False, + ) + + model = vlm.NevaModel(neva_config, tokenizer=data.tokenizer) + + # Training strategy setup + strategy = nl.MegatronStrategy( + tensor_model_parallel_size=args.tp_size, + pipeline_model_parallel_size=1, + pipeline_dtype=torch.bfloat16, + ) + + # Checkpoint callback setup + checkpoint_callback = nl.ModelCheckpoint( + save_last=True, + monitor="reduced_train_loss", + save_top_k=2, + every_n_train_steps=10, + dirpath=args.log_dir, + ) + + # Trainer setup + trainer = nl.Trainer( + devices=args.devices, + max_steps=5190, + accelerator="gpu", + strategy=strategy, + plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), + callbacks=[checkpoint_callback], + val_check_interval=1000, + limit_val_batches=gbs, + log_every_n_steps=1, + num_sanity_val_steps=0, + ) + + # Logger setup + from pytorch_lightning.loggers import WandbLogger + + nemo_logger = nl.NeMoLogger( + log_dir=args.log_dir, + name=args.name, + wandb=WandbLogger(project=args.wandb_project, name=args.name) if args.wandb_project is not None else None, + ) + nemo_logger.setup( + trainer, + resume_if_exists=True, + ) + + # 
Auto resume setup + from nemo.lightning.pytorch.strategies.utils import RestoreConfig + + resume = nl.AutoResume( + resume_if_exists=False, + resume_ignore_no_checkpoint=True, + resume_from_directory=args.log_dir, + restore_config=( + RestoreConfig( + path=args.restore_path, + ) + if args.restore_path is not None + else None + ), + ) + resume.setup(trainer, model) + + # Optimizer and scheduler setup + opt_config = OptimizerConfig( + optimizer='adam', + lr=2.0e-05, + adam_beta1=0.9, + adam_beta2=0.95, + use_distributed_optimizer=False, + bf16=True, + ) + sched = CosineAnnealingScheduler( + max_steps=trainer.max_steps, + warmup_steps=150, + constant_steps=0, + min_lr=2.0e-07, + ) + opt = MegatronOptimizerModule(opt_config, sched) + opt.connect(model) + + # Start training + trainer.fit(model, data) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="NEVA Model Training Script") + + # Argument parsing + parser.add_argument("--data_path", type=str, required=True, help="Path to the dataset JSON file") + parser.add_argument("--image_folder", type=str, required=True, help="Path to the image folder") + parser.add_argument("--log_dir", type=str, required=True, help="Directory for logging and checkpoints") + parser.add_argument( + "--language_model_path", type=str, required=False, default=None, help="Path to the pretrained language model" + ) + parser.add_argument( + "--restore_path", type=str, required=False, default=None, help="Path to restore model from checkpoint" + ) + parser.add_argument("--devices", type=int, required=False, default=4) + parser.add_argument("--tp_size", type=int, required=False, default=4) + parser.add_argument("--projector_type", type=str, required=False, default="mlp2x_gelu") + parser.add_argument("--name", type=str, required=False, default="neva_finetune") + parser.add_argument("--wandb_project", type=str, required=False, default=None) + + args = parser.parse_args() + main(args) diff --git a/examples/vlm/llava_next_generation_example.py b/examples/vlm/llava_next_generation_example.py new file mode 100644 index 000000000000..de81c1cc181b --- /dev/null +++ b/examples/vlm/llava_next_generation_example.py @@ -0,0 +1,146 @@ +import argparse +import os +import sys + +from nemo.collections.multimodal.data.energon import SimpleMultiModalDataModule +from nemo.utils import logging + +logging.setLevel(logging.DEBUG) +import requests +import torch +from megatron.energon import VQASample +from PIL import Image +from transformers import AutoProcessor + +from nemo import lightning as nl +from nemo.collections.multimodal.data.energon.config import MultiModalSampleConfig +from nemo.collections.vlm import Llava1_5Config7B, LlavaModel, LlavaNextTaskEncoder +from nemo.utils import logging + + +def load_image(image_url: str) -> Image.Image: + try: + response = requests.get(image_url, stream=True) + response.raise_for_status() + image = Image.open(response.raw) + return image + except requests.exceptions.RequestException as e: + print(f"Error loading image from {image_url}: {e}") + return None + + +def main(args) -> None: + strategy = nl.MegatronStrategy( + tensor_model_parallel_size=1, + ckpt_include_optimizer=False, + ) + trainer = nl.Trainer( + devices=1, + max_steps=1000, + accelerator="gpu", + strategy=strategy, + plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), + val_check_interval=1000, + limit_val_batches=50, + ) + + # Tokenize the input texts + # processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf") + processor = 
AutoProcessor.from_pretrained("llava-hf/llava-v1.6-vicuna-7b-hf") + data_path = '/home/ykarnati/Downloads/LLaVA-Pretrain/wds' + image_processor = processor.image_processor + tokenizer = processor.tokenizer + + multimodal_sample_config = MultiModalSampleConfig() + + task_encoder = LlavaNextTaskEncoder( + tokenizer=tokenizer, image_processor=image_processor, multimodal_sample_config=multimodal_sample_config + ) + data_module = SimpleMultiModalDataModule( + path=data_path, + tokenizer=tokenizer, + image_processor=image_processor, + num_workers=0, + micro_batch_size=1, + multimodal_sample_config=multimodal_sample_config, + task_encoder=task_encoder, + ) + + train_loader = data_module.train_dataloader() + one_batch = next(iter(train_loader)) + + fabric = trainer.to_fabric() + + # Decide whether to import or load the model based on the input arguments + if args.load_from_hf: + # model = fabric.import_model("hf://llava-hf/llava-1.5-7b-hf", LlavaModel) + model = fabric.import_model("hf://llava-hf/llava-v1.6-vicuna-7b-hf", LlavaModel) + # + else: + model = LlavaModel(Llava1_5Config7B(), tokenizer=tokenizer) + model = fabric.load_model(args.local_model_path, model) + + model = model.module.cuda() + model.eval() + + # Greedy generation loop + media = one_batch["media"].cuda() + input_ids = one_batch["tokens"].cuda() + position_ids = one_batch["position_ids"].cuda() + num_media_tiles = one_batch["num_media_tiles"] + generated_ids = input_ids.clone() + for _ in range(20): + with torch.no_grad(): + + output = model( + media=media, + input_ids=input_ids, + position_ids=position_ids, + attention_mask=None, + num_media_tiles=num_media_tiles, + ) + + next_token_ids = torch.argmax(output[:, -1], dim=-1, keepdim=True) + + generated_ids = torch.cat([generated_ids, next_token_ids], dim=-1) + + input_ids = generated_ids + position_ids = ( + torch.arange(input_ids.size(1), dtype=torch.long, device=input_ids.device) + .unsqueeze(0) + .expand_as(input_ids) + ) + + # If the generated token is the end of sequence token, stop generating + if next_token_ids.item() == tokenizer.eos_token_id: + break + + generated_ids[generated_ids == -200] = 0 + generated_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=False) + logging.info("======== GENERATED TEXT OUTPUT ========") + logging.info(f"{generated_texts}") + logging.info("=======================================") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="LLaVA Multimodal Inference") + parser.add_argument( + "--load_from_hf", + action="store_true", + help="Flag to indicate whether to load the model from Hugging Face hub.", + ) + parser.add_argument( + "--local_model_path", + type=str, + default=None, + help="Local path to the model if not loading from Hugging Face.", + ) + parser.add_argument( + "--image_url", + type=str, + default="http://images.cocodataset.org/val2017/000000039769.jpg", + help="URL of the image to use for inference.", + ) + args = parser.parse_args() + + main(args) diff --git a/nemo/collections/asr/models/ssl_models.py b/nemo/collections/asr/models/ssl_models.py index 5424ed79e751..633a00d73f5e 100644 --- a/nemo/collections/asr/models/ssl_models.py +++ b/nemo/collections/asr/models/ssl_models.py @@ -996,7 +996,12 @@ def training_step(self, batch: ssl_dataset.AudioNoiseBatch, batch_idx: int): return {'loss': loss_value, 'log': tensorboard_logs} def inference_pass( - self, batch: ssl_dataset.AudioNoiseBatch, batch_idx: int, dataloader_idx: int = 0, mode: str = 'val' + self, + batch: 
ssl_dataset.AudioNoiseBatch, + batch_idx: int, + dataloader_idx: int = 0, + mode: str = 'val', + apply_mask: bool = True, ): log_probs, encoded_len, masks, tokens = self.forward( input_signal=batch.audio, @@ -1005,7 +1010,7 @@ def inference_pass( noise_signal_length=batch.noise_len, noisy_input_signal=batch.noisy_audio, noisy_input_signal_length=batch.noisy_audio_len, - apply_mask=True, + apply_mask=apply_mask, ) loss_value = self.loss(masks=masks, decoder_outputs=log_probs, targets=tokens, decoder_lengths=encoded_len) diff --git a/nemo/collections/asr/parts/utils/transcribe_utils.py b/nemo/collections/asr/parts/utils/transcribe_utils.py index c1e712c44aeb..0d4f4c895bcf 100644 --- a/nemo/collections/asr/parts/utils/transcribe_utils.py +++ b/nemo/collections/asr/parts/utils/transcribe_utils.py @@ -314,7 +314,7 @@ def prepare_audio_data(cfg: DictConfig) -> Tuple[List[str], bool]: with NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f: for item in read_and_maybe_sort_manifest(cfg.dataset_manifest, try_sort=cfg.presort_manifest): audio_file = get_full_path(audio_file=item[audio_key], manifest_file=cfg.dataset_manifest) - item[audio_key] = audio_file + item['audio_filepath'] = audio_file filepaths.append(audio_file) f.write(json.dumps(item) + "\n") sorted_manifest_path = f.name diff --git a/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py b/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py index 76dca1268c3b..439322b8e810 100644 --- a/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py +++ b/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py @@ -13,7 +13,7 @@ # limitations under the License. from collections import OrderedDict -from typing import Optional +from typing import List, Optional from transformers import AutoTokenizer as AUTOTOKENIZER @@ -43,6 +43,7 @@ def __init__( sep_token: Optional[str] = None, cls_token: Optional[str] = None, unk_token: Optional[str] = None, + additional_special_tokens: Optional[List] = [], use_fast: Optional[bool] = False, trust_remote_code: Optional[bool] = False, ): @@ -60,6 +61,7 @@ def __init__( sep_token: token used for separating sequences cls_token: class token. Usually equal to bos_token unk_token: token to use for unknown tokens + additional_special_tokens: list of other tokens beside standard special tokens (bos, eos, pad, etc.). For example, sentinel tokens for T5 (, , etc.) 
use_fast: whether to use fast HuggingFace tokenizer """ try: @@ -124,10 +126,17 @@ def __init__( elif self.tokenizer.cls_token is None and self.tokenizer.bos_token: special_tokens_dict["cls_token"] = self.tokenizer.bos_token + # add additional special tokens (not standard special tokens such as bos, eod, sep) + if additional_special_tokens is not None: + special_tokens_dict["additional_special_tokens"] = additional_special_tokens + new_tokens_in_vocab = [] for token in [mask_token, bos_token, eos_token, pad_token, sep_token, cls_token, unk_token]: if token is not None and token not in self.tokenizer.get_vocab(): new_tokens_in_vocab.append(token) + for token in additional_special_tokens: + if token is not None and token not in self.tokenizer.get_vocab(): + new_tokens_in_vocab.append(token) if len(new_tokens_in_vocab) > 0: """ diff --git a/nemo/collections/diffusion/encoders/__init__.py b/nemo/collections/diffusion/encoders/__init__.py new file mode 100644 index 000000000000..9e3250071955 --- /dev/null +++ b/nemo/collections/diffusion/encoders/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/collections/diffusion/encoders/conditioner.py b/nemo/collections/diffusion/encoders/conditioner.py new file mode 100644 index 000000000000..2bfb008c5d84 --- /dev/null +++ b/nemo/collections/diffusion/encoders/conditioner.py @@ -0,0 +1,199 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Union + +import torch +import torch.nn as nn +from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5Tokenizer + + +class AbstractEmbModel(nn.Module): + def __init__(self, enable_lora_finetune=False, target_block=[], target_module=[]): + super().__init__() + self._is_trainable = None + self._ucg_rate = None + self._input_key = None + + self.TARGET_BLOCK = target_block + self.TARGET_MODULE = target_module + if enable_lora_finetune: + self.lora_layers = [] + + @property + def is_trainable(self) -> bool: + return self._is_trainable + + @property + def ucg_rate(self) -> Union[float, torch.Tensor]: + return self._ucg_rate + + @property + def input_key(self) -> str: + return self._input_key + + @is_trainable.setter + def is_trainable(self, value: bool): + self._is_trainable = value + + @ucg_rate.setter + def ucg_rate(self, value: Union[float, torch.Tensor]): + self._ucg_rate = value + + @input_key.setter + def input_key(self, value: str): + self._input_key = value + + @is_trainable.deleter + def is_trainable(self): + del self._is_trainable + + @ucg_rate.deleter + def ucg_rate(self): + del self._ucg_rate + + @input_key.deleter + def input_key(self): + del self._input_key + + def encode(self, *args, **kwargs): + raise NotImplementedError + + def _enable_lora(self, lora_model): + for module_name, module in lora_model.named_modules(): + if module.__class__.__name__ in self.TARGET_BLOCK: + tmp = {} + for sub_name, sub_module in module.named_modules(): + if sub_module.__class__.__name__ in self.TARGET_MODULE: + if hasattr(sub_module, "input_size") and hasattr( + sub_module, "output_size" + ): # for megatron ParallelLinear + lora = LoraWrapper(sub_module, sub_module.input_size, sub_module.output_size) + else: # for nn.Linear + lora = LoraWrapper(sub_module, sub_module.in_features, sub_module.out_features) + self.lora_layers.append(lora) + if sub_name not in tmp.keys(): + tmp.update({sub_name: lora}) + else: + print(f"Duplicate subnames are found in module {module_name}") + for sub_name, lora_layer in tmp.items(): + lora_name = f'{sub_name}_lora' + module.add_module(lora_name, lora_layer) + + +class FrozenCLIPEmbedder(AbstractEmbModel): + """Uses the CLIP transformer encoder for text (from Hugging Face)""" + + LAYERS = ["last", "pooled", "hidden"] + + def __init__( + self, + version="openai/clip-vit-large-patch14", + device="cuda", + max_length=77, + enable_lora_finetune=False, + layer="last", + layer_idx=None, + always_return_pooled=False, + dtype=torch.float, + ): + super().__init__(enable_lora_finetune, target_block=["CLIPAttention", "CLIPMLP"], target_module=["Linear"]) + self.tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") + self.transformer = CLIPTextModel.from_pretrained(version, torch_dtype=dtype).to(device) + self.device = device + self.max_length = max_length + self.freeze() + if enable_lora_finetune: + self._enable_lora(self.transformer) + print(f"CLIP transformer encoder add {len(self.lora_layers)} lora layers.") + + self.layer = layer + self.layer_idx = layer_idx + self.return_pooled = always_return_pooled + if layer == "hidden": + assert layer_idx is not None + assert 0 <= abs(layer_idx) <= 12 + + def freeze(self): + self.transformer = self.transformer.eval() + for param in self.parameters(): + param.requires_grad = False + + def forward(self, text, max_sequence_length=None): + batch_encoding = self.tokenizer( + text, + truncation=True, + max_length=max_sequence_length if max_sequence_length else self.max_length, + 
return_length=True, + return_overflowing_tokens=False, + padding="max_length", + return_tensors="pt", + ) + tokens = batch_encoding["input_ids"].to(self.transformer.device, non_blocking=True) + outputs = self.transformer(input_ids=tokens, output_hidden_states=(self.layer == "hidden")) + + if self.layer == "last": + z = outputs.last_hidden_state + elif self.layer == "pooled": + z = outputs.pooler_output[:, None, :] + else: + z = outputs.hidden_states[self.layer_idx] + + # Pad the seq length to multiple of 8 + seq_len = (z.shape[1] + 8 - 1) // 8 * 8 + z = torch.nn.functional.pad(z, (0, 0, 0, seq_len - z.shape[1]), value=0.0) + if self.return_pooled: + return z, outputs.pooler_output + return z + + def encode(self, text): + return self(text) + + +class FrozenT5Embedder(AbstractEmbModel): + def __init__( + self, + version="google/t5-v1_1-xxl", + max_length=512, + device="cuda", + dtype=torch.float, + ): + super().__init__() + self.tokenizer = T5Tokenizer.from_pretrained("google/t5-v1_1-xxl", max_length=max_length) + self.transformer = T5EncoderModel.from_pretrained(version, torch_dtype=dtype).to(device) + self.max_length = max_length + self.freeze() + self.device = device + self.dtype = dtype + + def freeze(self): + self.transformer = self.transformer.eval() + for param in self.parameters(): + param.requires_grad = False + + def forward(self, text, max_sequence_length=None): + batch_encoding = self.tokenizer( + text, + truncation=True, + max_length=max_sequence_length if max_sequence_length else self.max_length, + return_length=False, + return_overflowing_tokens=False, + padding="max_length", + return_tensors="pt", + ) + + tokens = batch_encoding["input_ids"].to(self.transformer.device, non_blocking=True) + outputs = self.transformer(input_ids=tokens, output_hidden_states=None) + + return outputs.last_hidden_state diff --git a/nemo/collections/diffusion/flux_infer.py b/nemo/collections/diffusion/flux_infer.py new file mode 100644 index 000000000000..f914dbf50258 --- /dev/null +++ b/nemo/collections/diffusion/flux_infer.py @@ -0,0 +1,113 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse + +import torch + +from nemo.collections.diffusion.models.flux.pipeline import FluxInferencePipeline +from nemo.collections.diffusion.utils.flux_pipeline_utils import configs +from nemo.collections.diffusion.utils.mcore_parallel_utils import Utils + + +def parse_args(): + parser = argparse.ArgumentParser( + description="The flux inference pipeline is utilizing megatron core transformer.\nPlease prepare the necessary checkpoints for flux model on local disk in order to use this script" + ) + + parser.add_argument("--flux_ckpt", type=str, default="", help="Path to Flux transformer checkpoint(s)") + parser.add_argument("--vae_ckpt", type=str, default="/ckpts/ae.safetensors", help="Path to \'ae.safetensors\'") + parser.add_argument( + "--clip_version", + type=str, + default='/ckpts/text_encoder', + help="Clip version, provide either ckpt dir or clip version like openai/clip-vit-large-patch14", + ) + parser.add_argument( + "--t5_version", + type=str, + default='/ckpts/text_encoder_2', + help="Clip version, provide either ckpt dir or clip version like google/t5-v1_1-xxl", + ) + parser.add_argument( + "--do_convert_from_hf", + action='store_true', + default=False, + help="Must be true if provided checkpoint is not already converted to NeMo version", + ) + parser.add_argument( + "--save_converted_model", + action="store_true", + default=False, + help="Whether to save the converted NeMo transformer checkpoint for Flux", + ) + parser.add_argument( + "--version", + type=str, + default='dev', + choices=['dev', 'schnell'], + help="Must align with the checkpoint provided.", + ) + parser.add_argument("--height", type=int, default=1024, help="Image height.") + parser.add_argument("--width", type=int, default=1024, help="Image width.") + parser.add_argument("--inference_steps", type=int, default=10, help="Number of inference steps to run.") + parser.add_argument( + "--num_images_per_prompt", type=int, default=1, help="Number of images to generate for each prompt." + ) + parser.add_argument("--guidance", type=float, default=0.0, help="Guidance scale.") + parser.add_argument( + "--offload", action='store_true', default=False, help="Offload modules to cpu after being called." 
+ ) + parser.add_argument( + "--prompts", + type=str, + default="A cat holding a sign that says hello world", + help="Inference prompts, use \',\' to separate if multiple prompts are provided.", + ) + parser.add_argument("--bf16", action='store_true', default=False, help="Use bf16 in inference.") + args = parser.parse_args() + return args + + +if __name__ == '__main__': + args = parse_args() + print('Initializing model parallel config') + Utils.initialize_distributed(1, 1, 1) + + print('Initializing flux inference pipeline') + params = configs[args.version] + params.vae_params.ckpt = args.vae_ckpt + params.clip_params['version'] = args.clip_version + params.t5_params['version'] = args.t5_version + pipe = FluxInferencePipeline(params) + + print('Loading transformer weights') + pipe.load_from_pretrained( + args.flux_ckpt, + do_convert_from_hf=args.do_convert_from_hf, + save_converted_model=args.save_converted_model, + ) + dtype = torch.bfloat16 if args.bf16 else torch.float32 + text = args.prompts.split(',') + pipe( + text, + max_sequence_length=256, + height=args.height, + width=args.width, + num_inference_steps=args.inference_steps, + num_images_per_prompt=args.num_images_per_prompt, + offload=args.offload, + guidance_scale=args.guidance, + dtype=dtype, + ) diff --git a/nemo/collections/diffusion/models/dit/dit_attention.py b/nemo/collections/diffusion/models/dit/dit_attention.py new file mode 100644 index 000000000000..9e60b11dd1c6 --- /dev/null +++ b/nemo/collections/diffusion/models/dit/dit_attention.py @@ -0,0 +1,428 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +from dataclasses import dataclass +from typing import Union + +import torch +from megatron.core.models.common.embeddings.rotary_pos_embedding import apply_rotary_pos_emb +from megatron.core.transformer.attention import Attention, SelfAttention +from megatron.core.transformer.custom_layers.transformer_engine import SplitAlongDim +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.transformer_config import TransformerConfig + + +@dataclass +class JointSelfAttentionSubmodules: + linear_qkv: Union[ModuleSpec, type] = None + added_linear_qkv: Union[ModuleSpec, type] = None + core_attention: Union[ModuleSpec, type] = None + linear_proj: Union[ModuleSpec, type] = None + q_layernorm: Union[ModuleSpec, type] = None + k_layernorm: Union[ModuleSpec, type] = None + added_q_layernorm: Union[ModuleSpec, type] = None + added_k_layernorm: Union[ModuleSpec, type] = None + + +class JointSelfAttention(Attention): + """Joint Self-attention layer class + + Used for MMDIT-like transformer block. 
+ """ + + def __init__( + self, + config: TransformerConfig, + submodules: JointSelfAttentionSubmodules, + layer_number: int, + attn_mask_type=AttnMaskType.padding, + context_pre_only: bool = False, + ): + super().__init__( + config=config, + submodules=submodules, + layer_number=layer_number, + attn_mask_type=attn_mask_type, + attention_type="self", + ) + + self.linear_qkv = build_module( + submodules.linear_qkv, + self.config.hidden_size, + self.query_projection_size + 2 * self.kv_projection_size, + config=self.config, + init_method=self.config.init_method, + gather_output=False, + bias=self.config.add_bias_linear or self.config.add_qkv_bias, + skip_bias_add=False, + is_expert=False, + tp_comm_buffer_name='qkv', + ) + + if submodules.added_linear_qkv is not None: + self.added_linear_qkv = build_module( + submodules.added_linear_qkv, + self.config.hidden_size, + self.query_projection_size + 2 * self.kv_projection_size, + config=self.config, + init_method=self.config.init_method, + gather_output=False, + bias=self.config.add_qkv_bias, + skip_bias_add=False, + is_expert=False, + tp_comm_buffer_name='qkv', + ) + + if not context_pre_only: + self.added_linear_proj = build_module( + submodules.linear_proj, + self.query_projection_size, + self.config.hidden_size, + config=self.config, + init_method=self.config.output_layer_init_method, + bias=self.config.add_bias_linear, + input_is_parallel=True, + skip_bias_add=True, + is_expert=False, + tp_comm_buffer_name='proj', + ) + + if submodules.q_layernorm is not None: + self.q_layernorm = build_module( + submodules.q_layernorm, + hidden_size=self.hidden_size_per_attention_head, + config=self.config, + eps=self.config.layernorm_epsilon, + ) + else: + self.q_layernorm = None + + if submodules.k_layernorm is not None: + self.k_layernorm = build_module( + submodules.k_layernorm, + hidden_size=self.hidden_size_per_attention_head, + config=self.config, + eps=self.config.layernorm_epsilon, + ) + else: + self.k_layernorm = None + + if submodules.added_q_layernorm is not None: + self.added_q_layernorm = build_module( + submodules.added_q_layernorm, + hidden_size=self.hidden_size_per_attention_head, + config=self.config, + eps=self.config.layernorm_epsilon, + ) + else: + self.added_q_layernorm = None + + if submodules.added_k_layernorm is not None: + self.added_k_layernorm = build_module( + submodules.added_k_layernorm, + hidden_size=self.hidden_size_per_attention_head, + config=self.config, + eps=self.config.layernorm_epsilon, + ) + else: + self.added_k_layernorm = None + + def _split_qkv(self, mixed_qkv): + # [sq, b, hp] --> [sq, b, ng, (np/ng + 2) * hn] + new_tensor_shape = mixed_qkv.size()[:-1] + ( + self.num_query_groups_per_partition, + ( + (self.num_attention_heads_per_partition // self.num_query_groups_per_partition + 2) + * self.hidden_size_per_attention_head + ), + ) + mixed_qkv = mixed_qkv.view(*new_tensor_shape) + + split_arg_list = [ + ( + self.num_attention_heads_per_partition + // self.num_query_groups_per_partition + * self.hidden_size_per_attention_head + ), + self.hidden_size_per_attention_head, + self.hidden_size_per_attention_head, + ] + + if SplitAlongDim is not None: + + # [sq, b, ng, (np/ng + 2) * hn] --> [sq, b, ng, np/ng * hn], [sq, b, ng, hn], [sq, b, ng, hn] + (query, key, value) = SplitAlongDim( + mixed_qkv, + 3, + split_arg_list, + ) + else: + + # [sq, b, ng, (np/ng + 2) * hn] --> [sq, b, ng, np/ng * hn], [sq, b, ng, hn], [sq, b, ng, hn] + (query, key, value) = torch.split( + mixed_qkv, + split_arg_list, + dim=3, + ) + + # [sq, b, 
ng, np/ng * hn] -> [sq, b, np, hn] + query = query.reshape(query.size(0), query.size(1), -1, self.hidden_size_per_attention_head) + return query, key, value + + def get_query_key_value_tensors(self, hidden_states, key_value_states=None): + """ + Derives `query`, `key` and `value` tensors from `hidden_states`. + """ + # Attention heads [sq, b, h] --> [sq, b, ng * (np/ng + 2) * hn)] + mixed_qkv, _ = self.linear_qkv(hidden_states) + + query, key, value = self._split_qkv(mixed_qkv) + + if self.config.test_mode: + self.run_realtime_tests() + + if self.q_layernorm is not None: + query = self.q_layernorm(query) + + if self.k_layernorm is not None: + key = self.k_layernorm(key) + + return query, key, value + + def get_added_query_key_value_tensors(self, added_hidden_states, key_value_states=None): + """ + Derives `query`, `key` and `value` tensors from `hidden_states`. + """ + # Attention heads [sq, b, h] --> [sq, b, ng * (np/ng + 2) * hn)] + mixed_qkv, _ = self.added_linear_qkv(added_hidden_states) + + query, key, value = self._split_qkv(mixed_qkv) + + if self.config.test_mode: + self.run_realtime_tests() + + if self.added_q_layernorm is not None: + query = self.added_q_layernorm(query) + + if self.added_k_layernorm is not None: + key = self.added_k_layernorm(key) + + return query, key, value + + def forward( + self, + hidden_states, + attention_mask, + key_value_states=None, + inference_params=None, + rotary_pos_emb=None, + packed_seq_params=None, + additional_hidden_states=None, + ): + # hidden_states: [sq, b, h] + + # For self attention we just duplicate the rotary_pos_emb if it isn't already + if rotary_pos_emb is not None and not isinstance(rotary_pos_emb, tuple): + rotary_pos_emb = (rotary_pos_emb,) * 2 + + # ===================== + # Query, Key, and Value + # ===================== + # Get the query, key and value tensors based on the type of attention - + # self or cross attn. + + query, key, value = self.get_query_key_value_tensors(hidden_states) + added_query, added_key, added_value = self.get_added_query_key_value_tensors(additional_hidden_states) + + query = torch.cat([added_query, query], dim=0) + key = torch.cat([added_key, key], dim=0) + value = torch.cat([added_value, value], dim=0) + + # =================================================== + # Adjust key, value, and rotary_pos_emb for inference + # =================================================== + key, value, rotary_pos_emb, attn_mask_type = self._adjust_key_value_for_inference( + inference_params, key, value, rotary_pos_emb + ) + + if packed_seq_params is not None: + query = query.squeeze(1) + key = key.squeeze(1) + value = value.squeeze(1) + + # ================================================ + # relative positional embedding (rotary embedding) + # ================================================ + if rotary_pos_emb is not None: + q_pos_emb, k_pos_emb = rotary_pos_emb + + if packed_seq_params is not None: + cu_seqlens_q = packed_seq_params.cu_seqlens_q + cu_seqlens_kv = packed_seq_params.cu_seqlens_kv + else: + cu_seqlens_q = cu_seqlens_kv = None + query = apply_rotary_pos_emb( + query, + q_pos_emb, + config=self.config, + cu_seqlens=cu_seqlens_q, + ) + key = apply_rotary_pos_emb( + key, + k_pos_emb, + config=self.config, + cu_seqlens=cu_seqlens_kv, + ) + + # TODO, can apply positional embedding to value_layer so it has + # absolute positional embedding. 
+ # otherwise, only relative positional embedding takes effect + # value_layer = apply_rotary_pos_emb(value_layer, k_pos_emb) + + # ================================== + # core attention computation + # ================================== + if self.checkpoint_core_attention and self.training: + core_attn_out = self._checkpointed_attention_forward( + query, + key, + value, + attention_mask, + attn_mask_type=attn_mask_type, + packed_seq_params=packed_seq_params, + ) + else: + core_attn_out = self.core_attention( + query, + key, + value, + attention_mask, + attn_mask_type=attn_mask_type, + packed_seq_params=packed_seq_params, + ) + + if packed_seq_params is not None: + # reshape to same output shape as unpacked case + # (t, np, hn) -> (t, b=1, h=np*hn) + # t is the pack size = sum (sq_i) + # note that batch is a dummy dimension in the packed case + core_attn_out = core_attn_out.reshape(core_attn_out.size(0), 1, -1) + + # ================= + # Output. [sq, b, h] + # ================= + encoder_attention_output = core_attn_out[: additional_hidden_states.shape[0], :, :] + attention_output = core_attn_out[additional_hidden_states.shape[0] :, :, :] + + output, bias = self.linear_proj(attention_output) + encoder_output, encoder_bias = self.added_linear_proj(encoder_attention_output) + + output = output + bias + encoder_output = encoder_output + encoder_bias + + return output, encoder_output + + +class FluxSingleAttention(SelfAttention): + """Self-attention layer class + + Self-attention layer takes input with size [s, b, h] + and returns output of the same size. + """ + + def forward( + self, + hidden_states, + attention_mask, + key_value_states=None, + inference_params=None, + rotary_pos_emb=None, + packed_seq_params=None, + ): + # hidden_states: [sq, b, h] + + # For self attention we just duplicate the rotary_pos_emb if it isn't already + if rotary_pos_emb is not None and not isinstance(rotary_pos_emb, tuple): + rotary_pos_emb = (rotary_pos_emb,) * 2 + + # ===================== + # Query, Key, and Value + # ===================== + # Get the query, key and value tensors based on the type of attention - + # self or cross attn. 
+ query, key, value = self.get_query_key_value_tensors(hidden_states, key_value_states) + # print(f'megatron q before ln: {query.transpose(0, 1).contiguous()}, {query.transpose(0, 1).contiguous().shape}') + # print(f'megatron k before ln: {key.transpose(0, 1).contiguous()}, {key.transpose(0, 1).contiguous().shape}') + # print(f'megatron v before ln: {value.transpose(0, 1).contiguous()}, {value.transpose(0, 1).contiguous().shape}') + + # =================================================== + # Adjust key, value, and rotary_pos_emb for inference + # =================================================== + key, value, rotary_pos_emb, attn_mask_type = self._adjust_key_value_for_inference( + inference_params, key, value, rotary_pos_emb + ) + + if packed_seq_params is not None: + query = query.squeeze(1) + key = key.squeeze(1) + value = value.squeeze(1) + + # ================================================ + # relative positional embedding (rotary embedding) + # ================================================ + if rotary_pos_emb is not None: + q_pos_emb, k_pos_emb = rotary_pos_emb + + if packed_seq_params is not None: + cu_seqlens_q = packed_seq_params.cu_seqlens_q + cu_seqlens_kv = packed_seq_params.cu_seqlens_kv + else: + cu_seqlens_q = cu_seqlens_kv = None + query = apply_rotary_pos_emb( + query, + q_pos_emb, + config=self.config, + cu_seqlens=cu_seqlens_q, + ) + key = apply_rotary_pos_emb( + key, + k_pos_emb, + config=self.config, + cu_seqlens=cu_seqlens_kv, + ) + + # TODO, can apply positional embedding to value_layer so it has + # absolute positional embedding. + # otherwise, only relative positional embedding takes effect + # value_layer = apply_rotary_pos_emb(value_layer, k_pos_emb) + + # ================================== + # core attention computation + # ================================== + + if self.checkpoint_core_attention and self.training: + core_attn_out = self._checkpointed_attention_forward( + query, + key, + value, + attention_mask, + attn_mask_type=attn_mask_type, + packed_seq_params=packed_seq_params, + ) + else: + core_attn_out = self.core_attention( + query, + key, + value, + attention_mask, + attn_mask_type=attn_mask_type, + packed_seq_params=packed_seq_params, + ) + + if packed_seq_params is not None: + # reshape to same output shape as unpacked case + # (t, np, hn) -> (t, b=1, h=np*hn) + # t is the pack size = sum (sq_i) + # note that batch is a dummy dimension in the packed case + core_attn_out = core_attn_out.reshape(core_attn_out.size(0), 1, -1) + + return core_attn_out diff --git a/nemo/collections/diffusion/models/dit/dit_layer_spec.py b/nemo/collections/diffusion/models/dit/dit_layer_spec.py index 672dcff3ba00..cb7c520493f0 100644 --- a/nemo/collections/diffusion/models/dit/dit_layer_spec.py +++ b/nemo/collections/diffusion/models/dit/dit_layer_spec.py @@ -42,6 +42,12 @@ from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules from megatron.core.utils import make_viewless_tensor +from nemo.collections.diffusion.models.dit.dit_attention import ( + FluxSingleAttention, + JointSelfAttention, + JointSelfAttentionSubmodules, +) + @dataclass class DiTWithAdaLNSubmodules(TransformerLayerSubmodules): @@ -75,7 +81,14 @@ class AdaLN(MegatronModule): Adaptive Layer Normalization Module for DiT. 
""" - def __init__(self, config: TransformerConfig, n_adaln_chunks=9, norm=nn.LayerNorm): + def __init__( + self, + config: TransformerConfig, + n_adaln_chunks=9, + norm=nn.LayerNorm, + modulation_bias=False, + use_second_norm=False, + ): super().__init__(config) if norm == TENorm: self.ln = norm(config, config.hidden_size, config.layernorm_epsilon) @@ -83,8 +96,11 @@ def __init__(self, config: TransformerConfig, n_adaln_chunks=9, norm=nn.LayerNor self.ln = norm(config.hidden_size, elementwise_affine=False, eps=self.config.layernorm_epsilon) self.n_adaln_chunks = n_adaln_chunks self.adaLN_modulation = nn.Sequential( - nn.SiLU(), nn.Linear(config.hidden_size, self.n_adaln_chunks * config.hidden_size, bias=False) + nn.SiLU(), nn.Linear(config.hidden_size, self.n_adaln_chunks * config.hidden_size, bias=modulation_bias) ) + self.use_second_norm = use_second_norm + if self.use_second_norm: + self.ln2 = nn.LayerNorm(config.hidden_size, elementwise_affine=False, eps=1e-6) nn.init.constant_(self.adaLN_modulation[-1].weight, 0) setattr(self.adaLN_modulation[-1].weight, "sequence_parallel", config.sequence_parallel) @@ -92,29 +108,59 @@ def __init__(self, config: TransformerConfig, n_adaln_chunks=9, norm=nn.LayerNor def forward(self, timestep_emb): return self.adaLN_modulation(timestep_emb).chunk(self.n_adaln_chunks, dim=-1) - @jit_fuser + # @jit_fuser def modulate(self, x, shift, scale): return x * (1 + scale) + shift - @jit_fuser + # @jit_fuser def scale_add(self, residual, x, gate): return residual + gate * x - @jit_fuser - def modulated_layernorm(self, x, shift, scale): + # @jit_fuser + def modulated_layernorm(self, x, shift, scale, layernorm_idx=0): + if self.use_second_norm and layernorm_idx == 1: + layernorm = self.ln2 + else: + layernorm = self.ln # Optional Input Layer norm - input_layernorm_output = self.ln(x).type_as(x) + input_layernorm_output = layernorm(x).type_as(x) # DiT block specific return self.modulate(input_layernorm_output, shift, scale) # @jit_fuser - def scaled_modulated_layernorm(self, residual, x, gate, shift, scale): + def scaled_modulated_layernorm(self, residual, x, gate, shift, scale, layernorm_idx=0): hidden_states = self.scale_add(residual, x, gate) - shifted_pre_mlp_layernorm_output = self.modulated_layernorm(hidden_states, shift, scale) + shifted_pre_mlp_layernorm_output = self.modulated_layernorm(hidden_states, shift, scale, layernorm_idx) return hidden_states, shifted_pre_mlp_layernorm_output +class AdaLNContinuous(MegatronModule): + def __init__( + self, + config: TransformerConfig, + conditioning_embedding_dim: int, + modulation_bias: bool = True, + norm_type: str = "layer_norm", + ): + super().__init__(config) + self.adaLN_modulation = nn.Sequential( + nn.SiLU(), nn.Linear(conditioning_embedding_dim, config.hidden_size * 2, bias=modulation_bias) + ) + if norm_type == "layer_norm": + self.norm = nn.LayerNorm(config.hidden_size, elementwise_affine=False, eps=1e-6, bias=modulation_bias) + elif norm_type == "rms_norm": + self.norm = RMSNorm(config.hidden_size, eps=1e-6) + else: + raise ValueError("Unknown normalization type {}".format(norm_type)) + + def forward(self, x: torch.Tensor, conditioning_embedding: torch.Tensor) -> torch.Tensor: + emb = self.adaLN_modulation(conditioning_embedding) + scale, shift = torch.chunk(emb, 2, dim=1) + x = self.norm(x) * (1 + scale) + shift + return x + + class STDiTLayerWithAdaLN(TransformerLayer): """A single transformer layer. 
@@ -407,6 +453,225 @@ def forward( return output, context +class DiTLayer(TransformerLayer): + """A single transformer layer. + + Transformer layer takes input with size [s, b, h] and returns an + output of the same size. + + Original DiT layer implementation from [https://arxiv.org/pdf/2212.09748]. + """ + + def __init__( + self, + config: TransformerConfig, + submodules: TransformerLayerSubmodules, + layer_number: int = 1, + mlp_ratio: int = 4, + n_adaln_chunks: int = 6, + modulation_bias: bool = True, + ): + # Modify the mlp layer hidden_size of a dit layer according to mlp_ratio + config.ffn_hidden_size = int(mlp_ratio * config.hidden_size) + super().__init__(config=config, submodules=submodules, layer_number=layer_number) + + self.adaLN = AdaLN( + config=config, n_adaln_chunks=n_adaln_chunks, modulation_bias=modulation_bias, use_second_norm=True + ) + + def forward( + self, + hidden_states, + attention_mask, + context=None, + context_mask=None, + rotary_pos_emb=None, + inference_params=None, + packed_seq_params=None, + ): + # passing in conditioning information via attention mask here + c = attention_mask + + shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN(c) + + shifted_input_layernorm_output = self.adaLN.modulated_layernorm( + hidden_states, shift=shift_msa, scale=scale_msa, layernorm_idx=0 + ) + + x, bias = self.self_attention(shifted_input_layernorm_output, attention_mask=None) + + hidden_states = self.adaLN.scale_add(hidden_states, x=(x + bias), gate=gate_msa) + + residual = hidden_states + + shited_pre_mlp_layernorm_output = self.adaLN.modulated_layernorm( + hidden_states, shift=shift_mlp, scale=scale_mlp, layernorm_idx=1 + ) + + x, bias = self.mlp(shited_pre_mlp_layernorm_output) + + hidden_states = self.adaLN.scale_add(residual, x=(x + bias), gate=gate_mlp) + + return hidden_states, context + + +class MMDiTLayer(TransformerLayer): + """A single transformer layer. + + Transformer layer takes input with size [s, b, h] and returns an + output of the same size. + + MMDiT layer implementation from [https://arxiv.org/pdf/2403.03206]. + """ + + def __init__( + self, + config: TransformerConfig, + submodules: TransformerLayerSubmodules, + layer_number: int = 1, + context_pre_only: bool = False, + ): + + hidden_size = config.hidden_size + super().__init__(config=config, submodules=submodules, layer_number=layer_number) + + self.adaln = AdaLN(config, modulation_bias=True, n_adaln_chunks=6, use_second_norm=True) + + self.context_pre_only = context_pre_only + context_norm_type = "ada_norm_continous" if context_pre_only else "ada_norm_zero" + + if context_norm_type == "ada_norm_continous": + self.adaln_context = AdaLNContinous(config, hidden_size, modulation_bias=True, norm_type="layer_norm") + elif context_norm_type == "ada_norm_zero": + self.adaln_context = AdaLN(config, modulation_bias=True, n_adaln_chunks=6, use_second_norm=True) + else: + raise ValueError( + f"Unknown context_norm_type: {context_norm_type}, currently only support `ada_norm_continous`, `ada_norm_zero`" + ) + # Override Cross Attention to disable CP. + # Disable TP Comm overlap as well. Not disabling will attempt re-use of buffer size same as Q and lead to incorrect tensor shapes. 
+ cp_override_config = copy.deepcopy(config) + cp_override_config.context_parallel_size = 1 + cp_override_config.tp_comm_overlap = False + + if not context_pre_only: + self.context_mlp = build_module( + submodules.mlp, + config=cp_override_config, + ) + else: + self.context_mlp = None + + def forward( + self, + hidden_states, + encoder_hidden_states, + attention_mask=None, + context=None, + context_mask=None, + rotary_pos_emb=None, + inference_params=None, + packed_seq_params=None, + emb=None, + ): + shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaln(emb) + + norm_hidden_states = self.adaln.modulated_layernorm( + hidden_states, shift=shift_msa, scale=scale_msa, layernorm_idx=0 + ) + if self.context_pre_only: + norm_encoder_hidden_states = self.adaln_context(encoder_hidden_states, emb) + else: + c_shift_msa, c_scale_msa, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.adaln_context(emb) + norm_encoder_hidden_states = self.adaln_context.modulated_layernorm( + encoder_hidden_states, shift=c_shift_msa, scale=c_scale_msa, layernorm_idx=0 + ) + + attention_output, encoder_attention_output = self.self_attention( + norm_hidden_states, + attention_mask=attention_mask, + key_value_states=None, + additional_hidden_states=norm_encoder_hidden_states, + rotary_pos_emb=rotary_pos_emb, + ) + hidden_states = self.adaln.scale_add(hidden_states, x=attention_output, gate=gate_msa) + norm_hidden_states = self.adaln.modulated_layernorm( + hidden_states, shift=shift_mlp, scale=scale_mlp, layernorm_idx=1 + ) + + mlp_output, mlp_output_bias = self.mlp(norm_hidden_states) + hidden_states = self.adaln.scale_add(hidden_states, x=(mlp_output + mlp_output_bias), gate=gate_mlp) + + if self.context_pre_only: + encoder_hidden_states = None + else: + encoder_hidden_states = self.adaln_context.scale_add( + encoder_hidden_states, x=encoder_attention_output, gate=c_gate_msa + ) + norm_encoder_hidden_states = self.adaln_context.modulated_layernorm( + encoder_hidden_states, shift=c_shift_mlp, scale=c_scale_mlp, layernorm_idx=1 + ) + + context_mlp_output, context_mlp_output_bias = self.context_mlp(norm_encoder_hidden_states) + encoder_hidden_states = self.adaln.scale_add( + encoder_hidden_states, x=(context_mlp_output + context_mlp_output_bias), gate=c_gate_mlp + ) + + return hidden_states, encoder_hidden_states + + +class FluxSingleTransformerBlock(TransformerLayer): + def __init__( + self, + config: TransformerConfig, + submodules: TransformerLayerSubmodules, + layer_number: int = 1, + mlp_ratio: int = 4, + n_adaln_chunks: int = 3, + modulation_bias: bool = True, + ): + super().__init__(config=config, submodules=submodules, layer_number=layer_number) + hidden_size = config.hidden_size + self.adaln = AdaLN( + config=config, n_adaln_chunks=n_adaln_chunks, modulation_bias=modulation_bias, use_second_norm=False + ) + self.mlp_hidden_dim = int(hidden_size * mlp_ratio) + self.proj_in = nn.Linear(hidden_size, self.mlp_hidden_dim) + self.activation = nn.GELU(approximate="tanh") + self.proj_out = nn.Linear(hidden_size + self.mlp_hidden_dim, hidden_size) + + def forward( + self, + hidden_states, + attention_mask=None, + context=None, + context_mask=None, + rotary_pos_emb=None, + inference_params=None, + packed_seq_params=None, + emb=None, + ): + residual = hidden_states + + shift, scale, gate = self.adaln(emb) + + norm_hidden_states = self.adaln.modulated_layernorm(hidden_states, shift=shift, scale=scale) + + mlp_hidden_states = self.activation(self.proj_in(norm_hidden_states)) + + attention_output = 
self.self_attention( + norm_hidden_states, attention_mask=attention_mask, rotary_pos_emb=rotary_pos_emb + ) + + hidden_states = torch.cat((attention_output, mlp_hidden_states), dim=2) + + hidden_states = self.proj_out(hidden_states) + + hidden_states = self.adaln.scale_add(residual, x=hidden_states, gate=gate) + + return hidden_states + + def get_stdit_adaln_block_with_transformer_engine_spec() -> ModuleSpec: params = {"attn_mask_type": AttnMaskType.padding} return ModuleSpec( @@ -530,3 +795,77 @@ def get_official_dit_adaln_block_with_transformer_engine_spec() -> ModuleSpec: ), ), ) + + +def get_mm_dit_block_with_transformer_engine_spec() -> ModuleSpec: + + return ModuleSpec( + module=MMDiTLayer, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=JointSelfAttention, + params={"attn_mask_type": AttnMaskType.no_mask}, + submodules=JointSelfAttentionSubmodules( + linear_qkv=TEColumnParallelLinear, + added_linear_qkv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + ), + ), + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TEColumnParallelLinear, + linear_fc2=TERowParallelLinear, + ), + ), + ), + ) + + +def get_flux_single_transformer_engine_spec() -> ModuleSpec: + return ModuleSpec( + module=FluxSingleTransformerBlock, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=FluxSingleAttention, + params={"attn_mask_type": AttnMaskType.no_mask}, + submodules=SelfAttentionSubmodules( + linear_qkv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + q_layernorm=RMSNorm, + k_layernorm=RMSNorm, + linear_proj=IdentityOp, + ), + ), + ), + ) + + +def get_flux_double_transformer_engine_spec() -> ModuleSpec: + return ModuleSpec( + module=MMDiTLayer, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=JointSelfAttention, + params={"attn_mask_type": AttnMaskType.no_mask}, + submodules=JointSelfAttentionSubmodules( + q_layernorm=RMSNorm, + k_layernorm=RMSNorm, + added_q_layernorm=RMSNorm, + added_k_layernorm=RMSNorm, + linear_qkv=TEColumnParallelLinear, + added_linear_qkv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + ), + ), + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TEColumnParallelLinear, + linear_fc2=TERowParallelLinear, + ), + ), + ), + ) diff --git a/nemo/collections/diffusion/models/flux/__init__.py b/nemo/collections/diffusion/models/flux/__init__.py new file mode 100644 index 000000000000..9e3250071955 --- /dev/null +++ b/nemo/collections/diffusion/models/flux/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
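FluxSingleTransformerBlock above uses the single-stream "parallel" layout: the same AdaLN-modulated input feeds both the attention path and an MLP path (proj_in followed by tanh-approximate GELU), the two results are concatenated along the hidden dimension, projected back to the model width by proj_out, and gated into the residual. A shape-level sketch of that data flow in plain PyTorch (the attention here is a stand-in, not the Megatron/Transformer Engine spec):

import torch
from torch import nn

hidden, mlp_ratio, seq, batch = 64, 4, 16, 2
proj_in = nn.Linear(hidden, mlp_ratio * hidden)
act = nn.GELU(approximate="tanh")
proj_out = nn.Linear(hidden + mlp_ratio * hidden, hidden)
attn = nn.Identity()                      # stand-in for FluxSingleAttention

x = torch.randn(seq, batch, hidden)       # [s, b, h], as in Megatron layers
residual, norm_x = x, x                   # AdaLN modulation omitted for brevity
mlp_hidden = act(proj_in(norm_x))         # [s, b, 4h]
attn_out = attn(norm_x)                   # [s, b, h]
out = proj_out(torch.cat((attn_out, mlp_hidden), dim=2))  # [s, b, h]
out = residual + 1.0 * out                # gate comes from AdaLN, fixed to 1 here
print(out.shape)                          # torch.Size([16, 2, 64])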
diff --git a/nemo/collections/diffusion/models/flux/layers.py b/nemo/collections/diffusion/models/flux/layers.py new file mode 100644 index 000000000000..222a9a1d67ae --- /dev/null +++ b/nemo/collections/diffusion/models/flux/layers.py @@ -0,0 +1,173 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import torch +from torch import Tensor, nn + + +def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor: + """ + Different from the original ROPE used for flux. + Megatron attention takes the out product and calculate sin/cos inside, so we only need to get the freqs here + in the shape of [seq, ..., dim] + """ + assert dim % 2 == 0, "The dimension must be even." + + scale = torch.arange(0, dim, 2, dtype=torch.float64, device=pos.device) / dim + omega = 1.0 / (theta**scale) + + out = torch.einsum("...n,d->...nd", pos, omega) + + return out.float() + + +class EmbedND(nn.Module): + def __init__(self, dim: int, theta: int, axes_dim: list[int]): + super().__init__() + self.dim = dim + self.theta = theta + self.axes_dim = axes_dim + + def forward(self, ids: torch.Tensor) -> torch.Tensor: + n_axes = ids.shape[-1] + emb = torch.cat( + [rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)], + dim=-1, + ) + emb = emb.unsqueeze(1).permute(2, 0, 1, 3) + return torch.stack([emb, emb], dim=-1).reshape(*emb.shape[:-1], -1) + + +class MLPEmbedder(nn.Module): + def __init__(self, in_dim: int, hidden_dim: int): + super().__init__() + self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True) + self.silu = nn.SiLU() + self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True) + + def forward(self, x: Tensor) -> Tensor: + return self.out_layer(self.silu(self.in_layer(x))) + + +def get_timestep_embedding( + timesteps: torch.Tensor, + embedding_dim: int, + flip_sin_to_cos: bool = True, + downscale_freq_shift: float = 0, + scale: float = 1, + max_period: int = 10000, +): + """ + This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings. + + Args + timesteps (torch.Tensor): + a 1-D Tensor of N indices, one per batch element. These may be fractional. + embedding_dim (int): + the dimension of the output. + flip_sin_to_cos (bool): + Whether the embedding order should be `cos, sin` (if True) or `sin, cos` (if False) + downscale_freq_shift (float): + Controls the delta between frequencies between dimensions + scale (float): + Scaling factor applied to the embeddings. + max_period (int): + Controls the maximum frequency of the embeddings + Returns + torch.Tensor: an [N x dim] Tensor of positional embeddings. 
+ """ + assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array" + + half_dim = embedding_dim // 2 + exponent = -math.log(max_period) * torch.arange( + start=0, end=half_dim, dtype=torch.float32, device=timesteps.device + ) + exponent = exponent / (half_dim - downscale_freq_shift) + + emb = torch.exp(exponent) + emb = timesteps[:, None].float() * emb[None, :] + + # scale embeddings + emb = scale * emb + + # concat sine and cosine embeddings + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1) + + # flip sine and cosine embeddings + if flip_sin_to_cos: + emb = torch.cat([emb[:, half_dim:], emb[:, :half_dim]], dim=-1) + + # zero pad + if embedding_dim % 2 == 1: + emb = torch.nn.functional.pad(emb, (0, 1, 0, 0)) + return emb + + +class Timesteps(nn.Module): + def __init__( + self, + embedding_dim: int, + flip_sin_to_cos: bool = True, + downscale_freq_shift: float = 0, + scale: float = 1, + max_period: int = 10000, + ): + super().__init__() + self.embedding_dim = embedding_dim + self.flip_sin_to_cos = flip_sin_to_cos + self.downscale_freq_shift = downscale_freq_shift + self.scale = scale + self.max_period = max_period + + def forward(self, timesteps: torch.Tensor) -> torch.Tensor: + t_emb = get_timestep_embedding( + timesteps, + self.embedding_dim, + flip_sin_to_cos=self.flip_sin_to_cos, + downscale_freq_shift=self.downscale_freq_shift, + scale=self.scale, + max_period=self.max_period, + ) + return t_emb + + +class TimeStepEmbedder(nn.Module): + def __init__( + self, + embedding_dim: int, + hidden_dim: int, + flip_sin_to_cos: bool = True, + downscale_freq_shift: float = 0, + scale: float = 1, + max_period: int = 10000, + ): + + super().__init__() + + self.time_proj = Timesteps( + embedding_dim=embedding_dim, + flip_sin_to_cos=flip_sin_to_cos, + downscale_freq_shift=downscale_freq_shift, + scale=scale, + max_period=max_period, + ) + self.time_embedder = MLPEmbedder(in_dim=embedding_dim, hidden_dim=hidden_dim) + + def forward(self, timesteps: torch.Tensor) -> torch.Tensor: + timesteps_proj = self.time_proj(timesteps) + timesteps_emb = self.time_embedder(timesteps_proj) + + return timesteps_emb diff --git a/nemo/collections/diffusion/models/flux/model.py b/nemo/collections/diffusion/models/flux/model.py new file mode 100644 index 000000000000..4d42c80a75a1 --- /dev/null +++ b/nemo/collections/diffusion/models/flux/model.py @@ -0,0 +1,156 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from dataclasses import dataclass +from typing import Callable + +import torch +from megatron.core.models.common.vision_module.vision_module import VisionModule +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.transformer.utils import openai_gelu +from torch import nn + +from nemo.collections.diffusion.models.dit.dit_layer_spec import ( + AdaLNContinuous, + FluxSingleTransformerBlock, + MMDiTLayer, + get_flux_double_transformer_engine_spec, + get_flux_single_transformer_engine_spec, +) +from nemo.collections.diffusion.models.flux.layers import EmbedND, MLPEmbedder, TimeStepEmbedder + + +@dataclass +class FluxParams: + num_joint_layers: int = 19 + num_single_layers: int = 38 + hidden_size: int = 3072 + num_attention_heads: int = 24 + activation_func: Callable = openai_gelu + add_qkv_bias: bool = True + ffn_hidden_size: int = 16384 + in_channels: int = 64 + context_dim: int = 4096 + model_channels: int = 256 + patch_size: int = 1 + guidance_embed: bool = False + vec_in_dim: int = 768 + + +class Flux(VisionModule): + def __init__(self, config: FluxParams): + + self.out_channels = config.in_channels + self.hidden_size = config.hidden_size + self.num_attention_heads = config.num_attention_heads + self.patch_size = config.patch_size + self.in_channels = config.in_channels + self.guidance_embed = config.guidance_embed + transformer_config = TransformerConfig( + num_layers=1, + hidden_size=self.hidden_size, + num_attention_heads=self.num_attention_heads, + use_cpu_initialization=True, + activation_func=config.activation_func, + hidden_dropout=0, + attention_dropout=0, + layernorm_epsilon=1e-6, + add_qkv_bias=config.add_qkv_bias, + rotary_interleaved=True, + ) + super().__init__(transformer_config) + + self.pos_embed = EmbedND(dim=self.hidden_size, theta=10000, axes_dim=[16, 56, 56]) + self.img_embed = nn.Linear(config.in_channels, self.hidden_size) + self.txt_embed = nn.Linear(config.context_dim, self.hidden_size) + self.timestep_embedding = TimeStepEmbedder(config.model_channels, self.hidden_size) + self.vector_embedding = MLPEmbedder(in_dim=config.vec_in_dim, hidden_dim=self.hidden_size) + if config.guidance_embed: + self.guidance_embedding = ( + MLPEmbedder(in_dim=config.model_channels, hidden_dim=self.hidden_size) + if config.guidance_embed + else nn.Identity() + ) + + self.double_blocks = nn.ModuleList( + [ + MMDiTLayer( + config=transformer_config, + submodules=get_flux_double_transformer_engine_spec().submodules, + layer_number=i, + context_pre_only=False, + ) + for i in range(config.num_joint_layers) + ] + ) + + self.single_blocks = nn.ModuleList( + [ + FluxSingleTransformerBlock( + config=transformer_config, + submodules=get_flux_single_transformer_engine_spec().submodules, + layer_number=i, + ) + for i in range(config.num_single_layers) + ] + ) + + self.norm_out = AdaLNContinuous(config=transformer_config, conditioning_embedding_dim=self.hidden_size) + self.proj_out = nn.Linear(self.hidden_size, self.patch_size * self.patch_size * self.out_channels, bias=True) + + def forward( + self, + img: torch.Tensor, + txt: torch.Tensor = None, + y: torch.Tensor = None, + timesteps: torch.LongTensor = None, + img_ids: torch.Tensor = None, + txt_ids: torch.Tensor = None, + guidance: torch.Tensor = None, + ): + hidden_states = self.img_embed(img) + encoder_hidden_states = self.txt_embed(txt) + + timesteps = timesteps.to(img.dtype) * 1000 + vec_emb = self.timestep_embedding(timesteps) + + if guidance is not None: + vec_emb = vec_emb + 
self.guidance_embedding(self.timestep_embedding.time_proj(guidance * 1000)) + vec_emb = vec_emb + self.vector_embedding(y) + + ids = torch.cat((txt_ids, img_ids), dim=1) + rotary_pos_emb = self.pos_embed(ids) + for id_block, block in enumerate(self.double_blocks): + hidden_states, encoder_hidden_states = block( + hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + rotary_pos_emb=rotary_pos_emb, + emb=vec_emb, + ) + + hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=0) + + for id_block, block in enumerate(self.single_blocks): + hidden_states = block( + hidden_states=hidden_states, + rotary_pos_emb=rotary_pos_emb, + emb=vec_emb, + ) + + hidden_states = hidden_states[encoder_hidden_states.shape[0] :, ...] + + hidden_states = self.norm_out(hidden_states, vec_emb) + output = self.proj_out(hidden_states) + + return output diff --git a/nemo/collections/diffusion/models/flux/pipeline.py b/nemo/collections/diffusion/models/flux/pipeline.py new file mode 100644 index 000000000000..e460f8f115bd --- /dev/null +++ b/nemo/collections/diffusion/models/flux/pipeline.py @@ -0,0 +1,342 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from typing import List, Optional, Union + +import numpy as np +import torch +from PIL import Image +from safetensors.torch import load_file as load_safetensors +from safetensors.torch import save_file as save_safetensors +from torch import nn +from tqdm import tqdm + +from nemo.collections.diffusion.encoders.conditioner import FrozenCLIPEmbedder, FrozenT5Embedder +from nemo.collections.diffusion.models.flux.model import Flux, FluxParams +from nemo.collections.diffusion.sampler.flow_matching.flow_match_euler_discrete import FlowMatchEulerDiscreteScheduler +from nemo.collections.diffusion.utils.flux_ckpt_converter import flux_transformer_converter +from nemo.collections.diffusion.utils.flux_pipeline_utils import FluxModelParams +from nemo.collections.diffusion.vae.autoencoder import AutoEncoder + + +class FluxInferencePipeline(nn.Module): + def __init__(self, params: FluxModelParams): + super().__init__() + self.device = params.device + params.clip_params['device'] = self.device + params.t5_params['device'] = self.device + + self.vae = AutoEncoder(params.vae_params).to(self.device).eval() + self.clip_encoder = FrozenCLIPEmbedder(**params.clip_params) + self.t5_encoder = FrozenT5Embedder(**params.t5_params) + self.transformer = Flux(params.flux_params).to(self.device).eval() + self.vae_scale_factor = 2 ** (len(self.vae.params.ch_mult)) + self.scheduler = FlowMatchEulerDiscreteScheduler(**params.scheduler_params) + self.params = params + + def load_from_pretrained(self, ckpt_path, do_convert_from_hf=True, save_converted_model=None): + if do_convert_from_hf: + ckpt = flux_transformer_converter(ckpt_path, self.transformer.config) + if save_converted_model: + save_path = os.path.join(ckpt_path, 'nemo_flux_transformer.safetensors') + save_safetensors(ckpt, 
save_path) + print(f'saving converted transformer checkpoint to {save_path}') + else: + ckpt = load_safetensors(ckpt_path) + missing, unexpected = self.transformer.load_state_dict(ckpt, strict=False) + missing = [ + k for k in missing if not k.endswith('_extra_state') + ] # These keys are mcore specific and should not affect the model performance + if len(missing) > 0: + print( + f"The folloing keys are missing during checkpoint loading, please check the ckpt provided or the image quality may be compromised.\n {missing}" + ) + print(f"Found unexepected keys: \n {unexpected}") + + def encoder_prompt( + self, + prompt: Union[str, List[str]], + num_images_per_prompt: int = 1, + prompt_embeds: Optional[torch.FloatTensor] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + max_sequence_length: int = 512, + device: Optional[torch.device] = 'cuda', + dtype: Optional[torch.dtype] = torch.float, + ): + if prompt is not None: + batch_size = len(prompt) + elif prompt_embeds is not None: + batch_size = prompt_embeds.shape[0] + else: + raise ValueError("Either prompt or prompt_embeds must be provided.") + if device == 'cuda' and self.t5_encoder.device != device: + self.t5_encoder.to(device) + if prompt_embeds is None: + prompt_embeds = self.t5_encoder(prompt, max_sequence_length=max_sequence_length) + seq_len = prompt_embeds.shape[1] + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1).to(dtype=dtype) + + if device == 'cuda' and self.clip_encoder.device != device: + self.clip_encoder.to(device) + if pooled_prompt_embeds is None: + _, pooled_prompt_embeds = self.clip_encoder(prompt) + + pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt, 1) + pooled_prompt_embeds = pooled_prompt_embeds.view(batch_size * num_images_per_prompt, -1).to(dtype=dtype) + + dtype = dtype if dtype is not None else self.t5_encoder.dtype + text_ids = torch.zeros(batch_size, prompt_embeds.shape[1], 3).to(device=device, dtype=dtype) + text_ids = text_ids.repeat(num_images_per_prompt, 1, 1) + + return prompt_embeds.transpose(0, 1), pooled_prompt_embeds, text_ids + + @staticmethod + def _prepare_latent_image_ids(batch_size: int, height: int, width: int, device: torch.device, dtype: torch.dtype): + latent_image_ids = torch.zeros(height // 2, width // 2, 3) + latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height // 2)[:, None] + latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width // 2)[None, :] + + latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape + + latent_image_ids = latent_image_ids[None, :].repeat(batch_size, 1, 1, 1) + latent_image_ids = latent_image_ids.reshape( + batch_size, latent_image_id_height * latent_image_id_width, latent_image_id_channels + ) + + return latent_image_ids.to(device=device, dtype=dtype) + + @staticmethod + def _pack_latents(latents, batch_size, num_channels_latents, height, width): + latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2) + latents = latents.permute(0, 2, 4, 1, 3, 5) + latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4) + + return latents + + @staticmethod + def _unpack_latents(latents, height, width, vae_scale_factor): + batch_size, num_patches, channels = latents.shape + + height = height // vae_scale_factor + width = width // vae_scale_factor + + latents = latents.view(batch_size, 
height, width, channels // 4, 2, 2)
+        latents = latents.permute(0, 3, 1, 4, 2, 5)
+
+        latents = latents.reshape(batch_size, channels // (2 * 2), height * 2, width * 2)
+
+        return latents
+
+    @staticmethod
+    def _calculate_shift(
+        image_seq_len,
+        base_seq_len: int = 256,
+        max_seq_len: int = 4096,
+        base_shift: float = 0.5,
+        max_shift: float = 1.16,
+    ):
+        m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
+        b = base_shift - m * base_seq_len
+        mu = image_seq_len * m + b
+        return mu
+
+    def prepare_latents(
+        self,
+        batch_size,
+        num_channels_latents,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        latents=None,
+    ):
+        height = 2 * int(height) // self.vae_scale_factor
+        width = 2 * int(width) // self.vae_scale_factor
+
+        shape = (batch_size, num_channels_latents, height, width)
+
+        if latents is not None:
+            latent_image_ids = self._prepare_latent_image_ids(batch_size, height, width, device, dtype)
+            return latents.to(device=device, dtype=dtype), latent_image_ids
+
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        latents = FluxInferencePipeline._generate_rand_latents(shape, generator=generator, device=device, dtype=dtype)
+        latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width)
+
+        latent_image_ids = self._prepare_latent_image_ids(batch_size, height, width, device, dtype)
+
+        return latents.transpose(0, 1), latent_image_ids
+
+    @staticmethod
+    def _generate_rand_latents(
+        shape,
+        generator,
+        device,
+        dtype,
+    ):
+        if isinstance(generator, list):
+            # One generator per sample: draw each latent separately, then concatenate on the batch dim.
+            batch_size = shape[0]
+            shape = (1,) + shape[1:]
+            latents = [
+                torch.randn(shape, generator=generator[i], device=device, dtype=dtype)
+                for i in range(batch_size)
+            ]
+            latents = torch.cat(latents, dim=0).to(device=device)
+        else:
+            latents = torch.randn(shape, generator=generator, device=device, dtype=dtype)
+
+        return latents
+
+    @staticmethod
+    def numpy_to_pil(images):
+        """
+        Convert a numpy image or a batch of images to a PIL image.
+        """
+        if images.ndim == 3:
+            images = images[None, ...]
+ images = (images * 255).round().astype("uint8") + pil_images = [Image.fromarray(image) for image in images] + + return pil_images + + @staticmethod + def torch_to_numpy(images): + numpy_images = images.float().cpu().permute(0, 2, 3, 1).numpy() + return numpy_images + + @staticmethod + def denormalize(image): + return (image / 2 + 0.5).clamp(0, 1) + + def __call__( + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = 512, + width: Optional[int] = 512, + num_inference_steps: int = 28, + timesteps: Optional[List[int]] = None, + guidance_scale: float = 7.0, + num_images_per_prompt: Optional[int] = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + max_sequence_length: int = 512, + device: torch.device = 'cuda', + dtype: torch.dtype = torch.float32, + save_to_disk: bool = True, + offload: bool = True, + ): + assert device == 'cuda', 'Transformer blocks in Mcore must run on cuda devices' + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + prompt = [prompt] + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + elif prompt_embeds is not None and isinstance(prompt_embeds, torch.FloatTensor): + batch_size = prompt_embeds.shape[0] + else: + raise ValueError("Either prompt or prompt_embeds must be provided.") + + ## get text prompt embeddings + prompt_embeds, pooled_prompt_embeds, text_ids = self.encoder_prompt( + prompt=prompt, + prompt_embeds=prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + num_images_per_prompt=num_images_per_prompt, + max_sequence_length=max_sequence_length, + device=device, + dtype=dtype, + ) + if offload: + self.t5_encoder.to('cpu') + self.clip_encoder.to('cpu') + torch.cuda.empty_cache() + + ## prepare image latents + num_channels_latents = self.transformer.in_channels // 4 + latents, latent_image_ids = self.prepare_latents( + batch_size * num_images_per_prompt, num_channels_latents, height, width, dtype, device, generator, latents + ) + # prepare timesteps + sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) + image_seq_len = latents.shape[0] + + mu = FluxInferencePipeline._calculate_shift( + image_seq_len, + self.scheduler.base_image_seq_len, + self.scheduler.max_image_seq_len, + self.scheduler.base_shift, + self.scheduler.max_shift, + ) + + self.scheduler.set_timesteps(sigmas=sigmas, device=device, mu=mu) + timesteps = self.scheduler.timesteps + + if device == 'cuda' and device != self.device: + self.transformer.to(device) + with torch.no_grad(): + for i, t in tqdm(enumerate(timesteps)): + timestep = t.expand(latents.shape[1]).to(device=latents.device, dtype=latents.dtype) + if self.transformer.guidance_embed: + guidance = torch.tensor([guidance_scale], device=device).expand(latents.shape[1]) + else: + guidance = None + with torch.autocast(device_type='cuda', dtype=latents.dtype): + pred = self.transformer( + img=latents, + txt=prompt_embeds, + y=pooled_prompt_embeds, + timesteps=timestep / 1000, + img_ids=latent_image_ids, + txt_ids=text_ids, + guidance=guidance, + ) + latents = self.scheduler.step(pred, t, latents)[0] + if offload: + self.transformer.to('cpu') + torch.cuda.empty_cache() + + if output_type == "latent": + return latents.transpose(0, 1) + elif output_type == "pil": + latents = self._unpack_latents(latents.transpose(0, 1), 
height, width, self.vae_scale_factor) + latents = (latents / self.vae.params.scale_factor) + self.vae.params.shift_factor + if device == 'cuda' and device != self.device: + self.vae.to(device) + with torch.autocast(device_type='cuda', dtype=latents.dtype): + image = self.vae.decode(latents) + if offload: + self.vae.to('cpu') + torch.cuda.empty_cache() + image = FluxInferencePipeline.denormalize(image) + image = FluxInferencePipeline.torch_to_numpy(image) + image = FluxInferencePipeline.numpy_to_pil(image) + if save_to_disk: + print('Saving to disk') + assert len(image) == int(len(prompt) * num_images_per_prompt) + prompt = [p[:40] + f'_{idx}' for p in prompt for idx in range(num_images_per_prompt)] + for file_name, image in zip(prompt, image): + image.save(f'{file_name}.png') + + return image diff --git a/nemo/collections/diffusion/sampler/flow_matching/__init__.py b/nemo/collections/diffusion/sampler/flow_matching/__init__.py new file mode 100644 index 000000000000..d9155f923f18 --- /dev/null +++ b/nemo/collections/diffusion/sampler/flow_matching/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/collections/diffusion/sampler/flow_matching/flow_match_euler_discrete.py b/nemo/collections/diffusion/sampler/flow_matching/flow_match_euler_discrete.py new file mode 100644 index 000000000000..5bde6b0d1dc1 --- /dev/null +++ b/nemo/collections/diffusion/sampler/flow_matching/flow_match_euler_discrete.py @@ -0,0 +1,284 @@ +# Copyright 2024 Stability AI, Katherine Crowson and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from abc import ABC +from typing import List, Optional, Tuple, Union + + +import numpy as np +import torch + + +class FlowMatchEulerDiscreteScheduler(ABC): + """ + Euler scheduler. + + Args: + num_train_timesteps (`int`, defaults to 1000): + The number of diffusion steps to train the model. + timestep_spacing (`str`, defaults to `"linspace"`): + The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. + shift (`float`, defaults to 1.0): + The shift value for the timestep schedule. 
+ """ + + _compatibles = [] + order = 1 + + def __init__( + self, + num_train_timesteps: int = 1000, + shift: float = 1.0, + use_dynamic_shifting=False, + base_shift: Optional[float] = 0.5, + max_shift: Optional[float] = 1.15, + base_image_seq_len: Optional[int] = 256, + max_image_seq_len: Optional[int] = 4096, + ): + timesteps = np.linspace(1, num_train_timesteps, num_train_timesteps, dtype=np.float32)[::-1].copy() + timesteps = torch.from_numpy(timesteps).to(dtype=torch.float32) + + sigmas = timesteps / num_train_timesteps + if not use_dynamic_shifting: + # when use_dynamic_shifting is True, we apply the timestep shifting on the fly based on the image resolution + sigmas = shift * sigmas / (1 + (shift - 1) * sigmas) + + self.timesteps = sigmas * num_train_timesteps + + self._step_index = None + self._begin_index = None + + self.sigmas = sigmas.to("cpu") # to avoid too much CPU/GPU communication + self.sigma_min = self.sigmas[-1].item() + self.sigma_max = self.sigmas[0].item() + + self.base_shift = base_shift + self.max_shift = max_shift + self.base_image_seq_len = base_image_seq_len + self.max_image_seq_len = max_image_seq_len + self.use_dynamic_shifting = use_dynamic_shifting + self.num_train_timesteps = num_train_timesteps + self.shift = shift + + @property + def step_index(self): + """ + The index counter for current timestep. It will increase 1 after each scheduler step. + """ + return self._step_index + + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. + """ + return self._begin_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. + + Args: + begin_index (`int`): + The begin index for the scheduler. + """ + self._begin_index = begin_index + + def scale_noise( + self, + sample: torch.FloatTensor, + timestep: Union[float, torch.FloatTensor], + noise: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + """ + Forward process in flow-matching + + Args: + sample (`torch.FloatTensor`): + The input sample. + timestep (`int`, *optional*): + The current timestep in the diffusion chain. + + Returns: + `torch.FloatTensor`: + A scaled input sample. 
+ """ + # Make sure sigmas and timesteps have the same device and dtype as original_samples + sigmas = self.sigmas.to(device=sample.device, dtype=sample.dtype) + + if sample.device.type == "mps" and torch.is_floating_point(timestep): + # mps does not support float64 + schedule_timesteps = self.timesteps.to(sample.device, dtype=torch.float32) + timestep = timestep.to(sample.device, dtype=torch.float32) + else: + schedule_timesteps = self.timesteps.to(sample.device) + timestep = timestep.to(sample.device) + + # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index + if self.begin_index is None: + step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timestep] + elif self.step_index is not None: + # add_noise is called after first denoising step (for inpainting) + step_indices = [self.step_index] * timestep.shape[0] + else: + # add noise is called before first denoising step to create initial latent(img2img) + step_indices = [self.begin_index] * timestep.shape[0] + + sigma = sigmas[step_indices].flatten() + while len(sigma.shape) < len(sample.shape): + sigma = sigma.unsqueeze(-1) + + sample = sigma * noise + (1.0 - sigma) * sample + + return sample + + def _sigma_to_t(self, sigma): + return sigma * self.num_train_timesteps + + def time_shift(self, mu: float, sigma: float, t: torch.Tensor): + return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma) + + def set_timesteps( + self, + num_inference_steps: int = None, + device: Union[str, torch.device] = None, + sigmas: Optional[List[float]] = None, + mu: Optional[float] = None, + ): + """ + Sets the discrete timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + """ + + if self.use_dynamic_shifting and mu is None: + raise ValueError(" you have a pass a value for `mu` when `use_dynamic_shifting` is set to be `True`") + + if sigmas is None: + self.num_inference_steps = num_inference_steps + timesteps = np.linspace( + self._sigma_to_t(self.sigma_max), self._sigma_to_t(self.sigma_min), num_inference_steps + ) + + sigmas = timesteps / self.num_train_timesteps + + if self.use_dynamic_shifting: + sigmas = self.time_shift(mu, 1.0, sigmas) + else: + sigmas = self.shift * sigmas / (1 + (self.shift - 1) * sigmas) + sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32, device=device) + timesteps = sigmas * self.num_train_timesteps + + self.timesteps = timesteps.to(device=device) + self.sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)]) + + self._step_index = None + self._begin_index = None + + def index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps + + indices = (schedule_timesteps == timestep).nonzero() + + # The sigma index that is taken for the **very** first `step` + # is always the second index (or the last index if there is only 1) + # This way we can ensure we don't accidentally skip a sigma in + # case we start in the middle of the denoising schedule (e.g. 
for image-to-image) + pos = 1 if len(indices) > 1 else 0 + + return indices[pos].item() + + def _init_step_index(self, timestep): + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) + else: + self._step_index = self._begin_index + + def step( + self, + model_output: torch.FloatTensor, + timestep: Union[float, torch.FloatTensor], + sample: torch.FloatTensor, + s_churn: float = 0.0, + s_tmin: float = 0.0, + s_tmax: float = float("inf"), + s_noise: float = 1.0, + generator: Optional[torch.Generator] = None, + ) -> Tuple: + """ + Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output (`torch.FloatTensor`): + The direct output from learned diffusion model. + timestep (`float`): + The current discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + s_churn (`float`): + s_tmin (`float`): + s_tmax (`float`): + s_noise (`float`, defaults to 1.0): + Scaling factor for noise added to the sample. + generator (`torch.Generator`, *optional*): + A random number generator. + + Returns: + A tuple is returned where the first element is the sample tensor. + """ + + if ( + isinstance(timestep, int) + or isinstance(timestep, torch.IntTensor) + or isinstance(timestep, torch.LongTensor) + ): + raise ValueError( + ( + "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to" + " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass" + " one of the `scheduler.timesteps` as a timestep." + ), + ) + + if self.step_index is None: + self._init_step_index(timestep) + + # Upcast to avoid precision issues when computing prev_sample + sample = sample.to(torch.float32) + + sigma = self.sigmas[self.step_index] + sigma_next = self.sigmas[self.step_index + 1] + prev_sample = sample + (sigma_next - sigma) * model_output + + # Cast sample back to model compatible dtype + prev_sample = prev_sample.to(model_output.dtype) + + # upon completion increase step index by one + self._step_index += 1 + + return (prev_sample,) + + def __len__(self): + return self.num_train_timesteps diff --git a/nemo/collections/diffusion/utils/__init__.py b/nemo/collections/diffusion/utils/__init__.py new file mode 100644 index 000000000000..9e3250071955 --- /dev/null +++ b/nemo/collections/diffusion/utils/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/collections/diffusion/utils/flux_ckpt_converter.py b/nemo/collections/diffusion/utils/flux_ckpt_converter.py new file mode 100644 index 000000000000..444a77bfad68 --- /dev/null +++ b/nemo/collections/diffusion/utils/flux_ckpt_converter.py @@ -0,0 +1,206 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import torch +from safetensors.torch import load_file as load_safetensors + + +def _import_qkv_bias(transformer_config, qb, kb, vb): + + head_num = transformer_config.num_attention_heads + num_query_groups = transformer_config.num_query_groups + heads_per_group = head_num // num_query_groups + hidden_size = transformer_config.hidden_size + head_num = transformer_config.num_attention_heads + head_size = hidden_size // head_num + + new_q_bias_tensor_shape = (head_num, head_size) + new_kv_bias_tensor_shape = (num_query_groups, head_size) + + qb = qb.view(*new_q_bias_tensor_shape) + kb = kb.view(*new_kv_bias_tensor_shape) + vb = vb.view(*new_kv_bias_tensor_shape) + + qkv_bias_l = [] + for i in range(num_query_groups): + qkv_bias_l.append(qb[i * heads_per_group : (i + 1) * heads_per_group, :]) + qkv_bias_l.append(kb[i : i + 1, :]) + qkv_bias_l.append(vb[i : i + 1, :]) + + qkv_bias = torch.cat(qkv_bias_l) + qkv_bias = qkv_bias.reshape([head_size * (head_num + 2 * num_query_groups)]) + + return qkv_bias + + +def _import_qkv(transformer_config, q, k, v): + + head_num = transformer_config.num_attention_heads + num_query_groups = transformer_config.num_query_groups + heads_per_group = head_num // num_query_groups + hidden_size = transformer_config.hidden_size + head_num = transformer_config.num_attention_heads + head_size = hidden_size // head_num + + old_tensor_shape = q.size() + new_q_tensor_shape = (head_num, head_size) + old_tensor_shape[1:] + new_kv_tensor_shape = (num_query_groups, head_size) + old_tensor_shape[1:] + + q = q.view(*new_q_tensor_shape) + k = k.view(*new_kv_tensor_shape) + v = v.view(*new_kv_tensor_shape) + + qkv_weights_l = [] + for i in range(num_query_groups): + qkv_weights_l.append(q[i * heads_per_group : (i + 1) * heads_per_group, :, :]) + qkv_weights_l.append(k[i : i + 1, :, :]) + qkv_weights_l.append(v[i : i + 1, :, :]) + qkv_weights = torch.cat(qkv_weights_l) + assert qkv_weights.ndim == 3, qkv_weights.shape + assert qkv_weights.shape[0] == (heads_per_group + 2) * num_query_groups, qkv_weights.shape + assert qkv_weights.shape[1] == head_size, qkv_weights.shape + assert qkv_weights.shape[2] == old_tensor_shape[1], qkv_weights.shape + + qkv_weights = qkv_weights.reshape([head_size * (head_num + 2 * num_query_groups), hidden_size]) + + return qkv_weights + + +key_mapping = { + 'double_blocks': { + 'norm1.linear.weight': 'adaln.adaLN_modulation.1.weight', + 'norm1.linear.bias': 'adaln.adaLN_modulation.1.bias', + 'norm1_context.linear.weight': 'adaln_context.adaLN_modulation.1.weight', + 'norm1_context.linear.bias': 'adaln_context.adaLN_modulation.1.bias', + 'attn.norm_q.weight': 'self_attention.q_layernorm.weight', + 'attn.norm_k.weight': 'self_attention.k_layernorm.weight', + 'attn.norm_added_q.weight': 'self_attention.added_q_layernorm.weight', + 'attn.norm_added_k.weight': 'self_attention.added_k_layernorm.weight', + 'attn.to_out.0.weight': 'self_attention.linear_proj.weight', + 
'attn.to_out.0.bias': 'self_attention.linear_proj.bias', + 'attn.to_add_out.weight': 'self_attention.added_linear_proj.weight', + 'attn.to_add_out.bias': 'self_attention.added_linear_proj.bias', + 'ff.net.0.proj.weight': 'mlp.linear_fc1.weight', + 'ff.net.0.proj.bias': 'mlp.linear_fc1.bias', + 'ff.net.2.weight': 'mlp.linear_fc2.weight', + 'ff.net.2.bias': 'mlp.linear_fc2.bias', + 'ff_context.net.0.proj.weight': 'context_mlp.linear_fc1.weight', + 'ff_context.net.0.proj.bias': 'context_mlp.linear_fc1.bias', + 'ff_context.net.2.weight': 'context_mlp.linear_fc2.weight', + 'ff_context.net.2.bias': 'context_mlp.linear_fc2.bias', + }, + 'single_blocks': { + 'norm.linear.weight': 'adaln.adaLN_modulation.1.weight', + 'norm.linear.bias': 'adaln.adaLN_modulation.1.bias', + 'proj_mlp.weight': 'proj_in.weight', + 'proj_mlp.bias': 'proj_in.bias', + 'proj_out.weight': 'proj_out.weight', + 'proj_out.bias': 'proj_out.bias', + 'attn.norm_q.weight': 'self_attention.q_layernorm.weight', + 'attn.norm_k.weight': 'self_attention.k_layernorm.weight', + }, + 'norm_out.linear.bias': 'norm_out.adaLN_modulation.1.bias', + 'norm_out.linear.weight': 'norm_out.adaLN_modulation.1.weight', + 'proj_out.bias': 'proj_out.bias', + 'proj_out.weight': 'proj_out.weight', + 'time_text_embed.guidance_embedder.linear_1.bias': 'guidance_embedding.in_layer.bias', + 'time_text_embed.guidance_embedder.linear_1.weight': 'guidance_embedding.in_layer.weight', + 'time_text_embed.guidance_embedder.linear_2.bias': 'guidance_embedding.out_layer.bias', + 'time_text_embed.guidance_embedder.linear_2.weight': 'guidance_embedding.out_layer.weight', + 'x_embedder.bias': 'img_embed.bias', + 'x_embedder.weight': 'img_embed.weight', + 'time_text_embed.timestep_embedder.linear_1.bias': 'timestep_embedding.time_embedder.in_layer.bias', + 'time_text_embed.timestep_embedder.linear_1.weight': 'timestep_embedding.time_embedder.in_layer.weight', + 'time_text_embed.timestep_embedder.linear_2.bias': 'timestep_embedding.time_embedder.out_layer.bias', + 'time_text_embed.timestep_embedder.linear_2.weight': 'timestep_embedding.time_embedder.out_layer.weight', + 'context_embedder.bias': 'txt_embed.bias', + 'context_embedder.weight': 'txt_embed.weight', + 'time_text_embed.text_embedder.linear_1.bias': 'vector_embedding.in_layer.bias', + 'time_text_embed.text_embedder.linear_1.weight': 'vector_embedding.in_layer.weight', + 'time_text_embed.text_embedder.linear_2.bias': 'vector_embedding.out_layer.bias', + 'time_text_embed.text_embedder.linear_2.weight': 'vector_embedding.out_layer.weight', +} + + +def flux_transformer_converter(ckpt_path=None, transformer_config=None): + diffuser_state_dict = {} + if os.path.isdir(ckpt_path): + files = os.listdir(ckpt_path) + for file in files: + if file.endswith('.safetensors'): + loaded_dict = load_safetensors(os.path.join(ckpt_path, file)) + diffuser_state_dict.update(loaded_dict) + elif os.path.isfile(ckpt_path): + diffuser_state_dict = load_safetensors(ckpt_path) + else: + raise FileNotFoundError("Please provide a valid ckpt path.") + new_state_dict = {} + num_single_blocks = 0 + num_double_blocks = 0 + for key, value in diffuser_state_dict.items(): + if 'attn.to_q' in key or 'attn.to_k' in key or 'attn.to_v' in key: + continue + if 'attn.add_q_proj' in key or 'attn.add_k_proj' in key or 'attn.add_v_proj' in key: + continue + if key.startswith('transformer_blocks'): + temp = key.split('.') + idx, k = temp[1], '.'.join(temp[2:]) + num_double_blocks = max(int(idx), num_double_blocks) + new_key = '.'.join(['double_blocks', idx, 
key_mapping['double_blocks'][k]]) + elif key.startswith('single_transformer_blocks'): + temp = key.split('.') + idx, k = temp[1], '.'.join(temp[2:]) + num_single_blocks = max(int(idx), num_single_blocks) + new_key = '.'.join(['single_blocks', idx, key_mapping['single_blocks'][k]]) + else: + new_key = key_mapping[key] + new_state_dict[new_key] = value + + for i in range(num_double_blocks + 1): + new_key = f'double_blocks.{str(i)}.self_attention.linear_qkv.weight' + qk, kk, vk = [f'transformer_blocks.{str(i)}.attn.to_{n}.weight' for n in ('q', 'k', 'v')] + new_state_dict[new_key] = _import_qkv( + transformer_config, diffuser_state_dict[qk], diffuser_state_dict[kk], diffuser_state_dict[vk] + ) + new_key = f'double_blocks.{str(i)}.self_attention.linear_qkv.bias' + qk, kk, vk = [f'transformer_blocks.{str(i)}.attn.to_{n}.bias' for n in ('q', 'k', 'v')] + new_state_dict[new_key] = _import_qkv_bias( + transformer_config, diffuser_state_dict[qk], diffuser_state_dict[kk], diffuser_state_dict[vk] + ) + new_key = f'double_blocks.{str(i)}.self_attention.added_linear_qkv.weight' + qk, kk, vk = [f'transformer_blocks.{str(i)}.attn.add_{n}_proj.weight' for n in ('q', 'k', 'v')] + new_state_dict[new_key] = _import_qkv( + transformer_config, diffuser_state_dict[qk], diffuser_state_dict[kk], diffuser_state_dict[vk] + ) + new_key = f'double_blocks.{str(i)}.self_attention.added_linear_qkv.bias' + qk, kk, vk = [f'transformer_blocks.{str(i)}.attn.add_{n}_proj.bias' for n in ('q', 'k', 'v')] + new_state_dict[new_key] = _import_qkv_bias( + transformer_config, diffuser_state_dict[qk], diffuser_state_dict[kk], diffuser_state_dict[vk] + ) + + for i in range(num_single_blocks + 1): + new_key = f'single_blocks.{str(i)}.self_attention.linear_qkv.weight' + qk, kk, vk = [f'single_transformer_blocks.{str(i)}.attn.to_{n}.weight' for n in ('q', 'k', 'v')] + new_state_dict[new_key] = _import_qkv( + transformer_config, diffuser_state_dict[qk], diffuser_state_dict[kk], diffuser_state_dict[vk] + ) + new_key = f'single_blocks.{str(i)}.self_attention.linear_qkv.bias' + qk, kk, vk = [f'single_transformer_blocks.{str(i)}.attn.to_{n}.bias' for n in ('q', 'k', 'v')] + new_state_dict[new_key] = _import_qkv_bias( + transformer_config, diffuser_state_dict[qk], diffuser_state_dict[kk], diffuser_state_dict[vk] + ) + + return new_state_dict diff --git a/nemo/collections/diffusion/utils/flux_pipeline_utils.py b/nemo/collections/diffusion/utils/flux_pipeline_utils.py new file mode 100644 index 000000000000..77dcfa58450f --- /dev/null +++ b/nemo/collections/diffusion/utils/flux_pipeline_utils.py @@ -0,0 +1,76 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
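
For reference, the per-group interleaving performed by `_import_qkv` above can be reproduced in isolation. The sketch below uses made-up toy dimensions (4 heads, 2 query groups, hidden size 8) instead of the real values read from `transformer_config`; it only illustrates how the separate `to_q`/`to_k`/`to_v` weights end up in the fused `linear_qkv` layout of shape `[head_size * (head_num + 2 * num_query_groups), hidden_size]`.

```python
import torch

# Toy dimensions, for illustration only.
head_num, num_query_groups, hidden_size = 4, 2, 8
head_size = hidden_size // head_num
heads_per_group = head_num // num_query_groups

q = torch.randn(head_num * head_size, hidden_size)          # like to_q.weight
k = torch.randn(num_query_groups * head_size, hidden_size)  # like to_k.weight
v = torch.randn(num_query_groups * head_size, hidden_size)  # like to_v.weight

# View as (heads, head_size, hidden_size), then interleave per query group:
# [q-heads of group 0, k of group 0, v of group 0, q-heads of group 1, ...]
q = q.view(head_num, head_size, hidden_size)
k = k.view(num_query_groups, head_size, hidden_size)
v = v.view(num_query_groups, head_size, hidden_size)

chunks = []
for g in range(num_query_groups):
    chunks.append(q[g * heads_per_group : (g + 1) * heads_per_group])
    chunks.append(k[g : g + 1])
    chunks.append(v[g : g + 1])

qkv = torch.cat(chunks).reshape(head_size * (head_num + 2 * num_query_groups), hidden_size)
print(qkv.shape)  # torch.Size([16, 8]) -- the fused linear_qkv.weight layout
```
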
+ +from dataclasses import dataclass + +import torch +from megatron.core.transformer.utils import openai_gelu + +from nemo.collections.diffusion.models.flux.model import FluxParams +from nemo.collections.diffusion.vae.autoencoder import AutoEncoderParams + + +@dataclass +class FluxModelParams: + flux_params: FluxParams + vae_params: AutoEncoderParams + clip_params: dict | None + t5_params: dict | None + scheduler_params: dict | None + device: str | torch.device + + +configs = { + "dev": FluxModelParams( + flux_params=FluxParams( + num_joint_layers=19, + num_single_layers=38, + hidden_size=3072, + num_attention_heads=24, + activation_func=openai_gelu, + add_qkv_bias=True, + ffn_hidden_size=16384, + in_channels=64, + context_dim=4096, + model_channels=256, + patch_size=1, + guidance_embed=True, + vec_in_dim=768, + ), + vae_params=AutoEncoderParams( + ch_mult=[1, 2, 4, 4], + attn_resolutions=[], + resolution=256, + in_channels=3, + ch=128, + out_ch=3, + num_res_blocks=2, + z_channels=16, + scale_factor=0.3611, + shift_factor=0.1159, + ckpt=None, + ), + clip_params={ + 'max_length': 77, + 'always_return_pooled': True, + }, + t5_params={ + 'max_length': 512, + }, + scheduler_params={ + 'num_train_timesteps': 1000, + }, + device='cpu', + ) +} diff --git a/nemo/collections/diffusion/utils/mcore_parallel_utils.py b/nemo/collections/diffusion/utils/mcore_parallel_utils.py new file mode 100644 index 000000000000..0b9bdec97464 --- /dev/null +++ b/nemo/collections/diffusion/utils/mcore_parallel_utils.py @@ -0,0 +1,80 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
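
As a rough usage sketch (not part of this PR), the `configs['dev']` preset defined above could be copied and tweaked before building a pipeline; the override values and checkpoint path below are hypothetical.

```python
from copy import deepcopy

from nemo.collections.diffusion.utils.flux_pipeline_utils import configs

params = deepcopy(configs["dev"])          # FluxModelParams for the Flux "dev" variant
params.device = 'cuda'                     # move the encoder/VAE stages to GPU
params.t5_params['max_length'] = 256       # hypothetical shorter text context
params.vae_params.ckpt = '/path/to/ae.safetensors'  # hypothetical local VAE checkpoint

# The resulting params object is what the Flux inference pipeline is expected to
# consume; its exact constructor signature is not shown in this diff.
```
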
+ +""" +Megatron Model Parallel Initialization +""" + +import os + +import megatron.core.parallel_state as ps +import torch + + +class Utils: + world_size = torch.cuda.device_count() + # rank = int(os.environ["LOCAL_RANK"]) + rank = 0 + + @staticmethod + def initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1, context_parallel_size=1): + ps.destroy_model_parallel() + + # Torch setup for distributed training + rank = int(os.environ['LOCAL_RANK']) + world_size = 1 # torch.cuda.device_count() + torch.cuda.set_device(rank) + torch.distributed.init_process_group(world_size=world_size, rank=rank) + + # Megatron core distributed training initialization + ps.initialize_model_parallel( + tensor_model_parallel_size, pipeline_model_parallel_size, context_parallel_size=context_parallel_size + ) + + @staticmethod + def set_world_size(world_size=None, rank=None): + Utils.world_size = torch.cuda.device_count() if world_size is None else world_size + if torch.distributed.is_initialized() and Utils.world_size != torch.distributed.get_world_size(): + torch.distributed.destroy_process_group() + + if rank is None: + # Utils.rank = int(os.environ["LOCAL_RANK"]) + Utils.rank = 0 + if Utils.rank >= Utils.world_size: + Utils.rank = -1 + else: + Utils.rank = rank + + @staticmethod + def destroy_model_parallel(): + ps.destroy_model_parallel() + torch.distributed.barrier() + + @staticmethod + def initialize_model_parallel( + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + virtual_pipeline_model_parallel_size=None, + pipeline_model_parallel_split_rank=None, + **kwargs, + ): + ps.destroy_model_parallel() + Utils.initialize_distributed() + ps.initialize_model_parallel( + tensor_model_parallel_size, + pipeline_model_parallel_size, + virtual_pipeline_model_parallel_size, + pipeline_model_parallel_split_rank, + **kwargs, + ) diff --git a/nemo/collections/diffusion/vae/autoencoder.py b/nemo/collections/diffusion/vae/autoencoder.py new file mode 100644 index 000000000000..b356d74baac1 --- /dev/null +++ b/nemo/collections/diffusion/vae/autoencoder.py @@ -0,0 +1,334 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
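
A minimal sketch of how the `Utils` helper above might be driven for a single-process run; the environment variables are normally provided by `torchrun` and are filled in manually here only so the snippet is self-contained (a CUDA device is still required).

```python
import os

from nemo.collections.diffusion.utils.mcore_parallel_utils import Utils

# torchrun normally sets these; hard-coded here for a one-process sketch.
os.environ.setdefault("MASTER_ADDR", "localhost")
os.environ.setdefault("MASTER_PORT", "29500")
os.environ.setdefault("LOCAL_RANK", "0")

# TP=1, PP=1, CP=1: a single rank holds the whole model.
Utils.initialize_distributed(
    tensor_model_parallel_size=1,
    pipeline_model_parallel_size=1,
    context_parallel_size=1,
)

# ... build and run the Megatron-core model here ...

Utils.destroy_model_parallel()
```
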
+ +from dataclasses import dataclass + +import numpy as np +import torch +from torch import Tensor, nn + +from nemo.collections.diffusion.vae.blocks import AttnBlock, Downsample, Normalize, ResnetBlock, Upsample, make_attn + + +@dataclass +class AutoEncoderParams: + ch_mult: list[int] + attn_resolutions: list[int] + resolution: int = 256 + in_channels: int = 3 + ch: int = 128 + out_ch: int = 3 + num_res_blocks: int = 2 + z_channels: int = 16 + scale_factor: float = 0.3611 + shift_factor: float = 0.1159 + attn_type: str = 'vanilla' + double_z: bool = True + dropout: float = 0.0 + ckpt: str = None + + +def nonlinearity(x): + # swish + return torch.nn.functional.silu(x) + + +class Encoder(nn.Module): + def __init__( + self, + *, + ch: int, + out_ch: int, + ch_mult: list[int], + num_res_blocks: int, + attn_resolutions: list[int], + in_channels: int, + resolution: int, + z_channels: int, + dropout=0.0, + resamp_with_conv=True, + double_z=True, + use_linear_attn=False, + attn_type="vanilla", + ): + super().__init__() + if use_linear_attn: + attn_type = "linear" + self.ch = ch + self.temb_ch = 0 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.resolution = resolution + self.in_channels = in_channels + + # downsampling + self.conv_in = torch.nn.Conv2d(in_channels, self.ch, kernel_size=3, stride=1, padding=1) + + curr_res = resolution + in_ch_mult = (1,) + tuple(ch_mult) + self.in_ch_mult = in_ch_mult + self.down = nn.ModuleList() + for i_level in range(self.num_resolutions): + block = nn.ModuleList() + attn = nn.ModuleList() + block_in = ch * in_ch_mult[i_level] + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks): + block.append( + ResnetBlock( + in_channels=block_in, out_channels=block_out, temb_channels=self.temb_ch, dropout=dropout + ) + ) + block_in = block_out + if curr_res in attn_resolutions: + attn.append(make_attn(block_in, attn_type=attn_type)) + down = nn.Module() + down.block = block + down.attn = attn + if i_level != self.num_resolutions - 1: + down.downsample = Downsample(block_in, resamp_with_conv) + curr_res = curr_res // 2 + self.down.append(down) + + # middle + self.mid = nn.Module() + self.mid.block_1 = ResnetBlock( + in_channels=block_in, out_channels=block_in, temb_channels=self.temb_ch, dropout=dropout + ) + self.mid.attn_1 = make_attn(block_in, attn_type=attn_type) + self.mid.block_2 = ResnetBlock( + in_channels=block_in, out_channels=block_in, temb_channels=self.temb_ch, dropout=dropout + ) + + # end + self.norm_out = Normalize(block_in) + self.conv_out = torch.nn.Conv2d( + block_in, 2 * z_channels if double_z else z_channels, kernel_size=3, stride=1, padding=1 + ) + + def forward(self, x): + # timestep embedding + temb = None + + # downsampling + hs = [self.conv_in(x)] + for i_level in range(self.num_resolutions): + for i_block in range(self.num_res_blocks): + h = self.down[i_level].block[i_block](hs[-1], temb) + if len(self.down[i_level].attn) > 0: + h = self.down[i_level].attn[i_block](h) + hs.append(h) + if i_level != self.num_resolutions - 1: + hs.append(self.down[i_level].downsample(hs[-1])) + + # middle + h = hs[-1] + h = self.mid.block_1(h, temb) + h = self.mid.attn_1(h) + h = self.mid.block_2(h, temb) + + # end + h = self.norm_out(h) + h = nonlinearity(h) + h = self.conv_out(h) + return h + + +class Decoder(nn.Module): + def __init__( + self, + *, + ch: int, + out_ch: int, + ch_mult: list[int], + num_res_blocks: int, + attn_resolutions: list[int], + in_channels: int, + resolution: int, + z_channels: 
int, + dropout=0.0, + resamp_with_conv=True, + give_pre_end=False, + tanh_out=False, + use_linear_attn=False, + attn_type="vanilla", + **ignorekwargs, + ): + super().__init__() + if use_linear_attn: + attn_type = "linear" + self.ch = ch + self.temb_ch = 0 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.resolution = resolution + self.in_channels = in_channels + self.give_pre_end = give_pre_end + self.tanh_out = tanh_out + + # compute in_ch_mult, block_in and curr_res at lowest res + in_ch_mult = (1,) + tuple(ch_mult) + block_in = ch * ch_mult[self.num_resolutions - 1] + curr_res = resolution // 2 ** (self.num_resolutions - 1) + self.z_shape = (1, z_channels, curr_res, curr_res) + print("Working with z of shape {} = {} dimensions.".format(self.z_shape, np.prod(self.z_shape))) + + # z to block_in + self.conv_in = torch.nn.Conv2d(z_channels, block_in, kernel_size=3, stride=1, padding=1) + + # middle + self.mid = nn.Module() + self.mid.block_1 = ResnetBlock( + in_channels=block_in, out_channels=block_in, temb_channels=self.temb_ch, dropout=dropout + ) + self.mid.attn_1 = make_attn(block_in, attn_type=attn_type) + self.mid.block_2 = ResnetBlock( + in_channels=block_in, out_channels=block_in, temb_channels=self.temb_ch, dropout=dropout + ) + + # upsampling + self.up = nn.ModuleList() + for i_level in reversed(range(self.num_resolutions)): + block = nn.ModuleList() + attn = nn.ModuleList() + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks + 1): + block.append( + ResnetBlock( + in_channels=block_in, out_channels=block_out, temb_channels=self.temb_ch, dropout=dropout + ) + ) + block_in = block_out + if curr_res in attn_resolutions: + attn.append(make_attn(block_in, attn_type=attn_type)) + up = nn.Module() + up.block = block + up.attn = attn + if i_level != 0: + up.upsample = Upsample(block_in, resamp_with_conv) + curr_res = curr_res * 2 + self.up.insert(0, up) # prepend to get consistent order + + # end + self.norm_out = Normalize(block_in) + self.conv_out = torch.nn.Conv2d(block_in, out_ch, kernel_size=3, stride=1, padding=1) + + def forward(self, z): + # assert z.shape[1:] == self.z_shape[1:] + self.last_z_shape = z.shape + + # timestep embedding + temb = None + + # z to block_in + h = self.conv_in(z) + + # middle + h = self.mid.block_1(h, temb) + h = self.mid.attn_1(h) + h = self.mid.block_2(h, temb) + + # upsampling + for i_level in reversed(range(self.num_resolutions)): + for i_block in range(self.num_res_blocks + 1): + h = self.up[i_level].block[i_block](h, temb) + if len(self.up[i_level].attn) > 0: + h = self.up[i_level].attn[i_block](h) + if i_level != 0: + h = self.up[i_level].upsample(h) + + # end + if self.give_pre_end: + return h + + h = self.norm_out(h) + h = nonlinearity(h) + h = self.conv_out(h) + if self.tanh_out: + h = torch.tanh(h) + return h + + +class DiagonalGaussian(nn.Module): + def __init__(self, sample: bool = True, chunk_dim: int = 1): + super().__init__() + self.sample = sample + self.chunk_dim = chunk_dim + + def forward(self, z: Tensor) -> Tensor: + mean, logvar = torch.chunk(z, 2, dim=self.chunk_dim) + if self.sample: + std = torch.exp(0.5 * logvar) + return mean + std * torch.randn_like(mean) + else: + return mean + + +class AutoEncoder(nn.Module): + def __init__(self, params: AutoEncoderParams): + super().__init__() + self.encoder = Encoder( + resolution=params.resolution, + in_channels=params.in_channels, + ch=params.ch, + ch_mult=params.ch_mult, + num_res_blocks=params.num_res_blocks, + 
z_channels=params.z_channels, + double_z=params.double_z, + attn_type=params.attn_type, + dropout=params.dropout, + out_ch=params.out_ch, + attn_resolutions=params.attn_resolutions, + ) + self.decoder = Decoder( + resolution=params.resolution, + in_channels=params.in_channels, + ch=params.ch, + out_ch=params.out_ch, + ch_mult=params.ch_mult, + num_res_blocks=params.num_res_blocks, + z_channels=params.z_channels, + double_z=params.double_z, + attn_type=params.attn_type, + dropout=params.dropout, + attn_resolutions=params.attn_resolutions, + ) + self.reg = DiagonalGaussian() + + self.scale_factor = params.scale_factor + self.shift_factor = params.shift_factor + self.params = params + + if params.ckpt is not None: + self.load_from_checkpoint(params.ckpt) + + def encode(self, x: Tensor) -> Tensor: + z = self.reg(self.encoder(x)) + z = self.scale_factor * (z - self.shift_factor) + return z + + def decode(self, z: Tensor) -> Tensor: + z = z / self.scale_factor + self.shift_factor + return self.decoder(z) + + def forward(self, x: Tensor) -> Tensor: + return self.decode(self.encode(x)) + + def load_from_checkpoint(self, ckpt_path): + from safetensors.torch import load_file as load_sft + + state_dict = load_sft(ckpt_path) + missing, unexpected = self.load_state_dict(state_dict) + if len(missing) > 0: + logger.warning(f"Following keys are missing from checkpoint loaded: {missing}") diff --git a/nemo/collections/diffusion/vae/blocks.py b/nemo/collections/diffusion/vae/blocks.py new file mode 100644 index 000000000000..ad38a7a463cf --- /dev/null +++ b/nemo/collections/diffusion/vae/blocks.py @@ -0,0 +1,180 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
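
The affine latent normalization applied by `AutoEncoder.encode` and inverted by `decode` above reduces to a single scale-and-shift; a small self-contained check using the default constants from `AutoEncoderParams` (the latent shape is a toy value):

```python
import torch

scale_factor, shift_factor = 0.3611, 0.1159   # defaults in AutoEncoderParams

z = torch.randn(1, 16, 32, 32)                 # a raw latent, toy shape
z_norm = scale_factor * (z - shift_factor)     # what encode() returns
z_back = z_norm / scale_factor + shift_factor  # what decode() undoes first

assert torch.allclose(z, z_back, atol=1e-6)
```
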
+ +import torch +from einops import rearrange +from torch import Tensor, nn + +try: + from apex.contrib.group_norm import GroupNorm + + OPT_GROUP_NORM = True +except Exception: + print('Fused optimized group norm has not been installed.') + OPT_GROUP_NORM = False + + +def Normalize(in_channels, num_groups=32, act=""): + return GroupNorm(num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True, act=act) + + +class ResnetBlock(nn.Module): + def __init__(self, in_channels, out_channels=None, conv_shortcut=False, dropout=0.0, temb_channels=0): + super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.use_conv_shortcut = conv_shortcut + + self.norm1 = Normalize(in_channels, act="silu") + self.conv1 = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + if temb_channels > 0: + self.temb_proj = torch.nn.Linear(temb_channels, out_channels) + self.norm2 = Normalize(out_channels, act="silu") + self.dropout = torch.nn.Dropout(dropout) + self.conv2 = torch.nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1) + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + self.conv_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + else: + self.nin_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0) + + def forward(self, x, temb): + h = x + h = self.norm1(h) + h = self.conv1(h) + + if temb is not None: + h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None] + + h = self.norm2(h) + h = self.dropout(h) + h = self.conv2(h) + + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + x = self.conv_shortcut(x) + else: + x = self.nin_shortcut(x) + + return x + h + + +class Upsample(nn.Module): + def __init__(self, in_channels, with_conv): + super().__init__() + self.with_conv = with_conv + if self.with_conv: + self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1) + + def forward(self, x): + # Cast to float32 to as 'upsample_nearest2d_out_frame' op does not support bfloat16 + # TODO(yuya): Remove this cast once the issue is fixed in PyTorch + # https://github.com/pytorch/pytorch/issues/86679 + dtype = x.dtype + if dtype == torch.bfloat16: + x = x.to(torch.float32) + x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest") + if dtype == torch.bfloat16: + x = x.to(dtype) + if self.with_conv: + x = self.conv(x) + return x + + +class Downsample(nn.Module): + def __init__(self, in_channels, with_conv): + super().__init__() + self.with_conv = with_conv + if self.with_conv: + # no asymmetric padding in torch conv, must do it ourselves + self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0) + + def forward(self, x): + if self.with_conv: + pad = (0, 1, 0, 1) + x = torch.nn.functional.pad(x, pad, mode="constant", value=0) + x = self.conv(x) + else: + x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2) + return x + + +class AttnBlock(nn.Module): + def __init__(self, in_channels: int): + super().__init__() + self.in_channels = in_channels + + self.norm = Normalize(in_channels, act="silu") + + self.q = nn.Conv2d(in_channels, in_channels, kernel_size=1) + self.k = nn.Conv2d(in_channels, in_channels, kernel_size=1) + self.v = nn.Conv2d(in_channels, in_channels, kernel_size=1) + self.proj_out = nn.Conv2d(in_channels, in_channels, kernel_size=1) + + 
def attention(self, h_: Tensor) -> Tensor: + h_ = self.norm(h_) + q = self.q(h_) + k = self.k(h_) + v = self.v(h_) + + b, c, h, w = q.shape + q = rearrange(q, "b c h w -> b 1 (h w) c").contiguous() + k = rearrange(k, "b c h w -> b 1 (h w) c").contiguous() + v = rearrange(v, "b c h w -> b 1 (h w) c").contiguous() + h_ = nn.functional.scaled_dot_product_attention(q, k, v) + + return rearrange(h_, "b 1 (h w) c -> b c h w", h=h, w=w, c=c, b=b) + + def forward(self, x: Tensor) -> Tensor: + return x + self.proj_out(self.attention(x)) + + +class LinearAttention(nn.Module): + def __init__(self, dim, heads=4, dim_head=32): + super().__init__() + self.heads = heads + hidden_dim = dim_head * heads + self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias=False) + self.to_out = nn.Conv2d(hidden_dim, dim, 1) + + def forward(self, x): + b, c, h, w = x.shape + qkv = self.to_qkv(x) + q, k, v = rearrange(qkv, 'b (qkv heads c) h w -> qkv b heads c (h w)', heads=self.heads, qkv=3) + k = k.softmax(dim=-1) + context = torch.einsum('bhdn,bhen->bhde', k, v) + out = torch.einsum('bhde,bhdn->bhen', context, q) + out = rearrange(out, 'b heads c (h w) -> b (heads c) h w', heads=self.heads, h=h, w=w) + return self.to_out(out) + + +class LinAttnBlock(LinearAttention): + """ + to match AttnBlock usage + """ + + def __init__(self, in_channels): + super().__init__(dim=in_channels, heads=1, dim_head=in_channels) + + +def make_attn(in_channels, attn_type="vanilla"): + assert attn_type in ["vanilla", "linear", "none"], f'attn_type {attn_type} unknown' + print(f"making attention of type '{attn_type}' with {in_channels} in_channels") + if attn_type == "vanilla": + return AttnBlock(in_channels) + elif attn_type == "none": + return nn.Identity(in_channels) + else: + return LinAttnBlock(in_channels) diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py index 5ddbcf5913ad..6dde88079567 100644 --- a/nemo/collections/llm/__init__.py +++ b/nemo/collections/llm/__init__.py @@ -21,6 +21,7 @@ from nemo.collections.llm.gpt.data import ( DollyDataModule, FineTuningDataModule, + HfDatasetDataModule, MockDataModule, PreTrainingDataModule, SquadDataModule, @@ -57,6 +58,7 @@ GPTConfig126M, GPTConfig175B, GPTModel, + HfAutoModelForCausalLM, Llama2Config7B, Llama2Config13B, Llama2Config70B, @@ -70,6 +72,7 @@ MaskedTokenLossReduction, MistralConfig7B, MistralModel, + MistralNeMoConfig12B, MixtralConfig8x3B, MixtralConfig8x7B, MixtralConfig8x22B, @@ -115,6 +118,7 @@ "t5_forward_step", "MaskedTokenLossReduction", "MistralConfig7B", + "MistralNeMoConfig12B", "MistralModel", "MixtralConfig8x3B", "MixtralConfig8x7B", @@ -180,6 +184,7 @@ "squad", "dolly", "peft", + "HfAutoModelForCausalLM", ] diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index 71e006472db9..a9b3d4361f5b 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -436,7 +436,7 @@ def export_ckpt( def generate( path: Union[Path, str], prompts: list[str], - trainer: Optional[nl.Trainer] = None, + trainer: nl.Trainer, params_dtype: torch.dtype = torch.bfloat16, max_batch_size: int = 4, random_seed: Optional[int] = None, diff --git a/nemo/collections/llm/gpt/data/__init__.py b/nemo/collections/llm/gpt/data/__init__.py index 45ca0788874f..f4e97d91e5cd 100644 --- a/nemo/collections/llm/gpt/data/__init__.py +++ b/nemo/collections/llm/gpt/data/__init__.py @@ -14,8 +14,16 @@ from nemo.collections.llm.gpt.data.dolly import DollyDataModule from nemo.collections.llm.gpt.data.fine_tuning import FineTuningDataModule +from 
nemo.collections.llm.gpt.data.hf_dataset import HfDatasetDataModule from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.gpt.data.pre_training import PreTrainingDataModule from nemo.collections.llm.gpt.data.squad import SquadDataModule -__all__ = ["FineTuningDataModule", "SquadDataModule", "DollyDataModule", "MockDataModule", "PreTrainingDataModule"] +__all__ = [ + "FineTuningDataModule", + "SquadDataModule", + "DollyDataModule", + "MockDataModule", + "PreTrainingDataModule", + "HfDatasetDataModule", +] diff --git a/nemo/collections/llm/gpt/data/dolly.py b/nemo/collections/llm/gpt/data/dolly.py index 78751d60cdb0..fb8cf9fd5da0 100644 --- a/nemo/collections/llm/gpt/data/dolly.py +++ b/nemo/collections/llm/gpt/data/dolly.py @@ -26,6 +26,7 @@ if TYPE_CHECKING: from nemo.collections.common.tokenizers import TokenizerSpec + from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs class DollyDataModule(FineTuningDataModule, IOMixin): @@ -56,7 +57,7 @@ def __init__( pin_memory: bool = True, persistent_workers: bool = False, pad_to_max_length: bool = False, - packed_sequence_size: int = -1, + packed_sequence_specs: Optional["PackedSequenceSpecs"] = None, ): self.force_redownload = force_redownload self.delete_raw = delete_raw @@ -74,7 +75,7 @@ def __init__( pin_memory=pin_memory, persistent_workers=persistent_workers, pad_to_max_length=pad_to_max_length, - packed_sequence_size=packed_sequence_size, + packed_sequence_specs=packed_sequence_specs, ) def prepare_data(self) -> None: diff --git a/nemo/collections/llm/gpt/data/fine_tuning.py b/nemo/collections/llm/gpt/data/fine_tuning.py index 3e4dba7ec89c..2545bbc93f1d 100644 --- a/nemo/collections/llm/gpt/data/fine_tuning.py +++ b/nemo/collections/llm/gpt/data/fine_tuning.py @@ -20,12 +20,14 @@ import pytorch_lightning as pl from torch.utils.data import DataLoader +from nemo.collections.common.tokenizers import AutoTokenizer from nemo.collections.llm.gpt.data.core import create_sft_dataset from nemo.lightning.pytorch.plugins import MegatronDataSampler from nemo.utils import logging if TYPE_CHECKING: from nemo.collections.common.tokenizers import TokenizerSpec + from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs class FineTuningDataModule(pl.LightningDataModule): @@ -50,10 +52,7 @@ class FineTuningDataModule(pl.LightningDataModule): persistent_workers (bool, optional): Whether to keep data loading workers persistent across epochs. Defaults to False. max_train_steps (int, optional): Maximum number of steps to train. Used to calculate samples mapping for the mmap dataset pad_to_max_length (bool, optional): Whether to pad the input to the max sequence length. If False, will pad to the max length of the current batch. - packed_sequence_size (int, optional): If a positive integer, this arg enables training with sequence packing and specifies the pack size - If less than or equal to 0, sequence packing is disabled. Defaults to -1. - Note: This arg is distinct from `seq_length` because `seq_length` specifies the maximum length of the original sequence - (i.e. the length to truncate long sequences in the input data). 
+ packed_sequence_specs (PackedSequenceSpecs, optional): See PackedSequenceSpecs for details """ def __init__( @@ -70,7 +69,8 @@ def __init__( pin_memory: bool = True, persistent_workers: bool = False, pad_to_max_length: bool = False, - packed_sequence_size: int = -1, + packed_sequence_specs: Optional["PackedSequenceSpecs"] = None, + sanity_check_dist_workers: bool = True, ): super().__init__() self.seq_length = seq_length @@ -87,22 +87,22 @@ def __init__( self.data_sampler = None self.max_train_samples = None self.pad_to_max_length = pad_to_max_length - self.packed_sequence_size = packed_sequence_size - self._adjust_batch_sizes_for_packed_sequence() + self.packed_sequence_specs = packed_sequence_specs + self.packed_sequence_size = -1 if not packed_sequence_specs else packed_sequence_specs.packed_sequence_size + self.validate_batch_size_for_packed_sequence() + self._sanity_check_dist_workers = sanity_check_dist_workers - def _adjust_batch_sizes_for_packed_sequence(self): + def validate_batch_size_for_packed_sequence(self): if self.packed_sequence_size > 0 and self.micro_batch_size > 1: - logging.warning( + raise ValueError( "Micro batch size should be 1 when training with packed sequence, but your micro batch size " - f"is {self.micro_batch_size}. Your config will be automatically updated to the following: " - f"MBS will be set to 1 (from {self.micro_batch_size}), " - f"GBS will be set to {self.global_batch_size // self.micro_batch_size} (from {self.global_batch_size}), " - f"packed sequence length will be set to {self.packed_sequence_size*self.micro_batch_size} (from {self.packed_sequence_size}). " + f"is {self.micro_batch_size}. \nThe following config is equivalent to your current setting for " + f"a packed dataset. Please update your config to the following: \n" + f"Set micro batch size to 1 (currently {self.micro_batch_size})\n" + f"Set global batch size to {self.global_batch_size // self.micro_batch_size} (currently {self.global_batch_size}) \n" + f"Set packed sequence length to {self.packed_sequence_size*self.micro_batch_size} (currently {self.packed_sequence_size}) \n" f"For details please visit https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/features/optimizations/sequence_packing.html" ) - self.global_batch_size //= self.micro_batch_size - self.packed_sequence_size *= self.micro_batch_size - self.micro_batch_size = 1 def prepare_data(self) -> None: if self.packed_sequence_size > 0 and not self.train_path_packed.is_file(): @@ -136,6 +136,7 @@ def train_dataloader(self) -> DataLoader: self.train_path if self.packed_sequence_size <= 0 else self.train_path_packed, max_num_samples=self.max_train_samples, pad_to_max_length=self.pad_to_max_length, + sanity_check_dist_workers=self._sanity_check_dist_workers, ) ) @@ -145,6 +146,7 @@ def val_dataloader(self) -> DataLoader: self.validation_path, is_test=True, pad_to_max_length=self.pad_to_max_length, + sanity_check_dist_workers=self._sanity_check_dist_workers, ), ) @@ -155,6 +157,7 @@ def test_dataloader(self) -> DataLoader: tokens_to_generate=32, is_test=True, pad_to_max_length=self.pad_to_max_length, + sanity_check_dist_workers=self._sanity_check_dist_workers, ) ) @@ -187,7 +190,12 @@ def train_path(self) -> Path: @property def train_path_packed(self) -> Path: if self.packed_sequence_size > 0: - return self.dataset_root / f"training_packed{self.packed_sequence_size}.npy" + if self.packed_sequence_specs.packed_data_path is not None: + return self.packed_sequence_specs.packed_data_path + tokenizer_model_name = 
self._extract_tokenizer_model_name() + folder_name = self.dataset_root / "packed" / tokenizer_model_name + folder_name.mkdir(parents=True, exist_ok=True) + return folder_name / f"training_{self.packed_sequence_size}.npy" else: raise ValueError("`train_path_packed` invalid since packed sequence size is not specified.") @@ -198,3 +206,18 @@ def validation_path(self) -> Path: @property def test_path(self) -> Path: return self.dataset_root / "test.jsonl" + + def _extract_tokenizer_model_name(self) -> str: + if self.packed_sequence_specs.tokenizer_model_name is not None: + tokenizer_model_name = self.packed_sequence_specs.tokenizer_model_name + elif isinstance(self.tokenizer, AutoTokenizer): + name = self.tokenizer.tokenizer.name_or_path + if name.endswith("nemo_tokenizer"): + # NEMO_HOME/hf_org/hf_model/nemo_tokenizer => hf_org--hf_model + tokenizer_model_name = '--'.join(name.split("/")[-3:-1]) + else: + # hf_org/hf_model => hf_org--hf_model + tokenizer_model_name = name.replace("/", "--") + else: + tokenizer_model_name = f"unknown_tokenizer_{hash(self.tokenizer)}" + return tokenizer_model_name diff --git a/nemo/collections/llm/gpt/data/hf_dataset.py b/nemo/collections/llm/gpt/data/hf_dataset.py new file mode 100644 index 000000000000..7e70a970913e --- /dev/null +++ b/nemo/collections/llm/gpt/data/hf_dataset.py @@ -0,0 +1,103 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
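
The packed-dataset location built by `train_path_packed` and `_extract_tokenizer_model_name` above follows a simple naming scheme; a stand-alone sketch with hypothetical paths and model name:

```python
from pathlib import Path


def tokenizer_dir_name(name_or_path: str) -> str:
    # Mirrors _extract_tokenizer_model_name for the AutoTokenizer case.
    if name_or_path.endswith("nemo_tokenizer"):
        # NEMO_HOME/hf_org/hf_model/nemo_tokenizer -> hf_org--hf_model
        return "--".join(name_or_path.split("/")[-3:-1])
    # hf_org/hf_model -> hf_org--hf_model
    return name_or_path.replace("/", "--")


dataset_root = Path("/data/squad")     # hypothetical dataset root
packed_sequence_size = 4096            # hypothetical pack size
folder = dataset_root / "packed" / tokenizer_dir_name("meta-llama/Meta-Llama-3-8B")
print(folder / f"training_{packed_sequence_size}.npy")
# /data/squad/packed/meta-llama--Meta-Llama-3-8B/training_4096.npy
```
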
+ +import pytorch_lightning as pl +import torch +from torch.utils.data import DataLoader + + +class HfDatasetDataModule(pl.LightningDataModule): + def __init__( + self, + dataset, + num_workers=2, + pin_memory=True, + persistent_workers=True, + micro_batch_size=2, + global_batch_size=2, + pad_token_id=0, + use_mcore_sampler=False, + mcore_dataloader_type='cyclic', + ) -> None: + super().__init__() + assert pad_token_id is not None + + self.dataset = dataset + self.num_workers = num_workers + self.pin_memory = pin_memory + self.persistent_workers = persistent_workers + self.micro_batch_size = micro_batch_size + self.global_batch_size = global_batch_size + self.pad_token_id = pad_token_id + + self.use_mcore_sampler = use_mcore_sampler + self.mcore_dataloader_type = mcore_dataloader_type + + @staticmethod + def collate_fn(batch, pad_token_id=0): + def batchify(tensor): + if tensor.ndim == 1: + return tensor.unsqueeze_(0) + return tensor + + def extract_key_from_dicts(batch, key): + return list(map(lambda x: x[key], batch)) + + def pad_within_micro(batch, pad_token_id): + max_len = max(map(len, batch)) + return [item + [pad_token_id] * (max_len - len(item)) for item in batch] + + return { + key: batchify( + torch.LongTensor( + pad_within_micro( + extract_key_from_dicts(batch, key), + pad_token_id, + ) + ) + ) + for key in ['tokens', 'labels'] + } + + def train_dataloader(self, collate_fn=None): + from nemo.lightning.data import add_megatron_sampler + + if collate_fn is None: + collate_fn = lambda x: HfDatasetDataModule.collate_fn(x, pad_token_id=self.pad_token_id) + + dataloader = DataLoader( + self.dataset, + num_workers=self.num_workers, + pin_memory=self.pin_memory, + persistent_workers=self.persistent_workers, + collate_fn=collate_fn, + batch_size=self.micro_batch_size, + ) + if not self.use_mcore_sampler: + return dataloader + + rank = 0 + world_size = 1 + if torch.distributed.is_initialized(): + rank = torch.distributed.get_rank() + world_size = torch.distributed.get_world_size() + + return add_megatron_sampler( + dataloader, + self.micro_batch_size, + self.global_batch_size, + dataloader_type=self.mcore_dataloader_type, + rank=rank, + world_size=world_size, + ) diff --git a/nemo/collections/llm/gpt/data/mock.py b/nemo/collections/llm/gpt/data/mock.py index 1c5e01c89bbd..5678597eda0b 100644 --- a/nemo/collections/llm/gpt/data/mock.py +++ b/nemo/collections/llm/gpt/data/mock.py @@ -56,9 +56,13 @@ def __init__( self.persistent_workers = persistent_workers self.create_attention_mask = create_attention_mask or not HAVE_TE - from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer + if tokenizer is None: + from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer + + self.tokenizer = get_nmt_tokenizer("megatron", "GPT2BPETokenizer") + else: + self.tokenizer = tokenizer - self.tokenizer = tokenizer or get_nmt_tokenizer("megatron", "GPT2BPETokenizer") self.data_sampler = MegatronDataSampler( seq_len=self.seq_length, micro_batch_size=micro_batch_size, diff --git a/nemo/collections/llm/gpt/data/packed_sequence.py b/nemo/collections/llm/gpt/data/packed_sequence.py index 4675b3fbb398..372e851da7cd 100644 --- a/nemo/collections/llm/gpt/data/packed_sequence.py +++ b/nemo/collections/llm/gpt/data/packed_sequence.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
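
`HfDatasetDataModule.collate_fn` above pads only within the micro-batch and stacks `tokens` and `labels` into `LongTensor`s; a toy call (assuming the new module is importable) shows the resulting shapes:

```python
from nemo.collections.llm.gpt.data.hf_dataset import HfDatasetDataModule

batch = [
    {"tokens": [5, 6, 7], "labels": [6, 7, 8]},
    {"tokens": [9, 10], "labels": [10, 11]},
]
out = HfDatasetDataModule.collate_fn(batch, pad_token_id=0)
print(out["tokens"])   # tensor([[ 5,  6,  7], [ 9, 10,  0]])
print(out["labels"])   # tensor([[ 6,  7,  8], [10, 11,  0]])
```
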
- +from dataclasses import dataclass from pathlib import Path from typing import Optional @@ -83,3 +83,32 @@ def prepare_packed_sequence_data( # save output data np.save(output_path, output_data) logging.info(f"Packed sequence is prepared and saved to {output_path}") + + +@dataclass +class PackedSequenceSpecs: + packed_sequence_size: int = -1 + """ + If a positive integer, this arg enables training with sequence packing and specifies the pack size + If less than or equal to 0, sequence packing is disabled. Defaults to -1. + Note: This arg is distinct from `seq_length` because `seq_length` specifies the maximum length of the original sequence + (i.e. the length to truncate long sequences in the input data). + """ + + tokenizer_model_name: str = None + """ + Keep track of tokenizer model name, since each tokenizer produces a different packed sequence dataset file. + This field is set by llm.finetune api. + """ + + packed_data_path: Path = None + """ + If specified, use the packed dataset from this file instead of the default path. + """ + + def __post_init__(self): + if self.packed_data_path is not None: + assert ( + self.packed_data_path.suffix == ".npy" + ), f"packed data file must be a .npy file: {self.packed_data_path}" + assert self.packed_data_path.exists(), f"packed data file does not exist: {self.packed_data_path}" diff --git a/nemo/collections/llm/gpt/data/squad.py b/nemo/collections/llm/gpt/data/squad.py index ec0fc1aad02c..cabbd444c0cf 100644 --- a/nemo/collections/llm/gpt/data/squad.py +++ b/nemo/collections/llm/gpt/data/squad.py @@ -24,6 +24,7 @@ if TYPE_CHECKING: from nemo.collections.common.tokenizers import TokenizerSpec + from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs class SquadDataModule(FineTuningDataModule, IOMixin): @@ -54,7 +55,8 @@ def __init__( pin_memory: bool = True, persistent_workers: bool = False, pad_to_max_length: bool = False, - packed_sequence_size: int = -1, + packed_sequence_specs: Optional["PackedSequenceSpecs"] = None, + sanity_check_dist_workers: bool = True, ): self.force_redownload = force_redownload self.delete_raw = delete_raw @@ -72,7 +74,8 @@ def __init__( pin_memory=pin_memory, persistent_workers=persistent_workers, pad_to_max_length=pad_to_max_length, - packed_sequence_size=packed_sequence_size, + packed_sequence_specs=packed_sequence_specs, + sanity_check_dist_workers=sanity_check_dist_workers, ) def prepare_data(self) -> None: diff --git a/nemo/collections/llm/gpt/model/__init__.py b/nemo/collections/llm/gpt/model/__init__.py index aa3615b3ddfd..26b8d67cb53d 100644 --- a/nemo/collections/llm/gpt/model/__init__.py +++ b/nemo/collections/llm/gpt/model/__init__.py @@ -37,6 +37,7 @@ GemmaConfig7B, GemmaModel, ) +from nemo.collections.llm.gpt.model.hf_auto_model_for_causal_lm import HfAutoModelForCausalLM from nemo.collections.llm.gpt.model.llama import ( CodeLlamaConfig7B, CodeLlamaConfig13B, @@ -53,7 +54,7 @@ LlamaConfig, LlamaModel, ) -from nemo.collections.llm.gpt.model.mistral import MistralConfig7B, MistralModel +from nemo.collections.llm.gpt.model.mistral import MistralConfig7B, MistralModel, MistralNeMoConfig12B from nemo.collections.llm.gpt.model.mixtral import ( MixtralConfig8x3B, MixtralConfig8x7B, @@ -166,4 +167,5 @@ "gpt_forward_step", "transformer_engine_layer_spec", "local_layer_spec", + "HfAutoModelForCausalLM", ] diff --git a/nemo/collections/llm/gpt/model/baichuan.py b/nemo/collections/llm/gpt/model/baichuan.py index 56231978061f..c283b802a118 100644 --- a/nemo/collections/llm/gpt/model/baichuan.py 
+++ b/nemo/collections/llm/gpt/model/baichuan.py @@ -215,7 +215,7 @@ def _import_qkv(ctx: io.TransformCTX, qkv_weights): q = qkv_weights[0].squeeze().view(*new_q_tensor_shape) k = qkv_weights[1].squeeze().view(*new_kv_tensor_shape) v = qkv_weights[2].squeeze().view(*new_kv_tensor_shape) - qkv_weights = torch.empty((0, head_size) + old_tensor_shape[1:]) + qkv_weights = torch.empty((0, head_size) + old_tensor_shape[1:]).type_as(qkv_weights) for i in range(num_query_groups): qkv_weights = torch.cat((qkv_weights, q[i * heads_per_group : (i + 1) * heads_per_group, :, :])) qkv_weights = torch.cat((qkv_weights, k[i : i + 1, :, :])) diff --git a/nemo/collections/llm/gpt/model/base.py b/nemo/collections/llm/gpt/model/base.py index f48f4a15d327..c7a6e01c673e 100644 --- a/nemo/collections/llm/gpt/model/base.py +++ b/nemo/collections/llm/gpt/model/base.py @@ -204,6 +204,9 @@ class GPTConfig5B(GPTConfig): ffn_hidden_size: int = 16384 num_attention_heads: int = 32 + bias_activation_fusion: bool = True + bias_dropout_add_fusion: bool = True + @dataclass class GPTConfig7B(GPTConfig): @@ -222,6 +225,9 @@ class GPTConfig20B(GPTConfig): ffn_hidden_size: int = 24576 num_attention_heads: int = 48 + bias_activation_fusion: bool = True + bias_dropout_add_fusion: bool = True + @dataclass class GPTConfig40B(GPTConfig): @@ -240,6 +246,9 @@ class GPTConfig175B(GPTConfig): ffn_hidden_size: int = 49152 num_attention_heads: int = 96 + bias_activation_fusion: bool = True + bias_dropout_add_fusion: bool = True + class GPTModel(L.LightningModule, io.IOMixin, io.ConnectorMixin, fn.FNMixin): def __init__( diff --git a/nemo/collections/llm/gpt/model/chatglm.py b/nemo/collections/llm/gpt/model/chatglm.py index 5bd1319102e2..e7450a8db28d 100644 --- a/nemo/collections/llm/gpt/model/chatglm.py +++ b/nemo/collections/llm/gpt/model/chatglm.py @@ -221,7 +221,7 @@ def _import_qkv_weight(ctx: io.TransformCTX, hf_qkv_weights): k = k.view(*new_kv_tensor_shape) v = v.view(*new_kv_tensor_shape) - qkv_weights = torch.empty((0, head_size, old_tensor_shape[1])) + qkv_weights = torch.empty((0, head_size, old_tensor_shape[1])).type_as(hf_qkv_weights) for i in range(num_query_groups): qkv_weights = torch.cat((qkv_weights, q[i * heads_per_group : (i + 1) * heads_per_group, :, :])) qkv_weights = torch.cat((qkv_weights, k[i : i + 1, :, :])) @@ -251,7 +251,7 @@ def _import_qkv_bias(ctx: io.TransformCTX, hf_qkv_bias): q = q.view(*new_q_tensor_shape) k = k.view(*new_kv_tensor_shape) v = v.view(*new_kv_tensor_shape) - qkv_bias = torch.empty((0, head_size)) + qkv_bias = torch.empty((0, head_size)).type_as(hf_qkv_bias) for i in range(num_query_groups): qkv_bias = torch.cat((qkv_bias, q[i * heads_per_group : (i + 1) * heads_per_group, :])) qkv_bias = torch.cat((qkv_bias, k[i : i + 1, :])) diff --git a/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py b/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py new file mode 100644 index 000000000000..f29756dc05a7 --- /dev/null +++ b/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py @@ -0,0 +1,109 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytorch_lightning as pl
+import torch
+import torch.nn.functional as F
+from transformers import AutoModelForCausalLM
+
+from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
+from nemo.collections.llm import fn
+from nemo.lightning import io
+
+
+def _extract_non_bias_params(model):
+    return list(map(lambda x: x[1], filter(lambda x: 'bias' not in x[0], model.named_parameters())))
+
+
+def masked_cross_entropy(logits, targets, mask=None):
+    if mask is not None:
+        loss = F.cross_entropy(logits, targets, reduction='none')
+        return torch.mean(loss[mask == 1])
+    else:
+        return F.cross_entropy(logits, targets)
+
+
+class HfAutoModelForCausalLM(pl.LightningModule, io.IOMixin, fn.FNMixin):
+    def __init__(self, model_name='gpt2', load_pretrained_weights=True, tokenizer=None, loss_fn=masked_cross_entropy):
+        super().__init__()
+        self.save_hyperparameters()
+        self.model_name = model_name
+        self._tokenizer = None
+        self.model = None
+        self.loss_fn = loss_fn
+        self.load_pretrained_weights = load_pretrained_weights
+        self.is_hf_model = True
+
+    @property
+    def tokenizer(self):
+        if self._tokenizer is None:
+            self._tokenizer = HfAutoModelForCausalLM.configure_tokenizer(self.model_name)
+        return self._tokenizer
+
+    @tokenizer.setter
+    def tokenizer(self, value):
+        assert self._tokenizer is None
+        self._tokenizer = value
+
+    @staticmethod
+    def configure_tokenizer(model_name):
+        return AutoTokenizer(model_name)
+
+    def configure_model(self):
+        # create all your layers here
+        if self.load_pretrained_weights:
+            self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype='auto')
+        else:
+            from transformers import AutoConfig
+
+            config = AutoConfig.from_pretrained(self.model_name)
+            self.model = AutoModelForCausalLM.from_config(config)
+        self.model.train()
+
+    def forward(self, input_ids, attention_mask=None, labels=None, loss_mask=None):
+        outputs = self.model(
+            input_ids=input_ids.to(self.model.device),
+            attention_mask=attention_mask,
+        )
+        labels = labels.to(self.model.device)
+        if loss_mask is not None:
+            loss_mask = loss_mask.to(self.model.device).view(-1)
+        n_cls = outputs.logits.shape[-1]
+        outputs.loss = self.loss_fn(outputs.logits.view(-1, n_cls), labels.view(-1), loss_mask)
+        return outputs
+
+    def training_step(self, batch):
+        tokens = batch['tokens']
+        labels = batch['labels']
+        loss_mask = batch.get('loss_mask', None)
+        output = self.forward(
+            input_ids=tokens,
+            labels=labels,
+            loss_mask=loss_mask,
+        )
+
+        loss = output.loss
+        self.log('train_log', loss, on_step=True, on_epoch=True, prog_bar=True)
+        return loss
+
+    def validation_step(self, batch, batch_idx):
+        tokens = batch['tokens']
+        labels = batch['labels']
+        output = self.forward(
+            input_ids=tokens,
+            labels=labels,
+        )
+
+        loss = output.loss
+        self.log('val_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
diff --git a/nemo/collections/llm/gpt/model/llama.py b/nemo/collections/llm/gpt/model/llama.py
index 0ec13a3d91e8..b48f99e061c9 100644
--- a/nemo/collections/llm/gpt/model/llama.py
+++ b/nemo/collections/llm/gpt/model/llama.py
@@ -50,6 +50,13 @@ class
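HfAutoModelForCausalLM is a plain LightningModule, so it can be exercised with a vanilla PyTorch Lightning trainer once a dataloader yields tokens/labels (and optionally loss_mask) batches. A minimal sketch with a toy random-token dataset; all names and sizes below are illustrative:

import pytorch_lightning as pl
import torch
from torch.utils.data import DataLoader, Dataset

from nemo.collections.llm.gpt.model.hf_auto_model_for_causal_lm import HfAutoModelForCausalLM


class ToyCausalLMDataset(Dataset):
    # Hypothetical dataset that produces the batch keys the wrapper expects.
    def __init__(self, vocab_size=50257, seq_len=32, n=64):
        self.data = torch.randint(0, vocab_size, (n, seq_len))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        ids = self.data[idx]
        return {"tokens": ids, "labels": ids.clone(), "loss_mask": torch.ones_like(ids)}


model = HfAutoModelForCausalLM(model_name="gpt2")
trainer = pl.Trainer(max_steps=5, accelerator="auto", devices=1)
trainer.fit(model, DataLoader(ToyCausalLMDataset(), batch_size=2))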
LlamaConfig(GPTConfig): attention_dropout: float = 0.0 hidden_dropout: float = 0.0 share_embeddings_and_output_weights: bool = False + # Fusions + bias_activation_fusion: bool = True + masked_softmax_fusion: bool = True + persist_layer_norm: bool = True + bias_dropout_fusion: bool = True + apply_rope_fusion: bool = True + cross_entropy_loss_fusion: bool = False @dataclass diff --git a/nemo/collections/llm/gpt/model/mistral.py b/nemo/collections/llm/gpt/model/mistral.py index f353362c9cbd..b9f4b6fb8f65 100644 --- a/nemo/collections/llm/gpt/model/mistral.py +++ b/nemo/collections/llm/gpt/model/mistral.py @@ -59,7 +59,7 @@ class MistralConfig7B(GPTConfig): @dataclass -class MistralNeMo2407Config12B(MistralConfig7B): +class MistralNeMoConfig12B(MistralConfig7B): """ https://mistral.ai/news/mistral-nemo/ """ @@ -75,7 +75,7 @@ class MistralNeMo2407Config12B(MistralConfig7B): @dataclass -class MistralNeMo2407Config123B(MistralConfig7B): +class MistralNeMoConfig123B(MistralConfig7B): """ https://mistral.ai/news/mistral-large-2407/ """ diff --git a/nemo/collections/llm/gpt/model/ssm.py b/nemo/collections/llm/gpt/model/ssm.py index 954fa8bfe9f7..d38a690cb4ad 100644 --- a/nemo/collections/llm/gpt/model/ssm.py +++ b/nemo/collections/llm/gpt/model/ssm.py @@ -53,6 +53,9 @@ class SSMConfig(TransformerConfig, io.IOMixin): fp16_lm_cross_entropy: bool = False parallel_output: bool = True share_embeddings_and_output_weights: bool = False + params_dtype: torch.dtype = torch.bfloat16 + fp16: bool = False + bf16: bool = True num_layers: int = 2 mamba_ssm_ngroups: int = 8 num_attention_heads: int = 1 @@ -81,6 +84,7 @@ class SSMConfig(TransformerConfig, io.IOMixin): forward_step_fn: Callable = ssm_forward_step data_step_fn: Callable = gpt_data_step + tokenizer_model_path: str = None def configure_model(self, tokenizer) -> "MCoreMambaModel": @@ -127,9 +131,17 @@ def __init__(self, state_dict): def state_dict(self): return self._state_dict + def to(self, dtype): + for k, v in self._state_dict.items(): + if v.dtype != dtype: + logging.warning(f"Converting {k} from {v.dtype} (source model) to {dtype} (target model)") + self._state_dict[k] = v.to(dtype) + source = ModelState(source) target = self.init() trainer = self.nemo_setup(target) + source.to(self.config.params_dtype) + target.to(self.config.params_dtype) self.convert_state(source, target) self.nemo_save(output_path, trainer) diff --git a/nemo/collections/llm/inference/base.py b/nemo/collections/llm/inference/base.py index 95da536fde06..0171f1c2dd5c 100644 --- a/nemo/collections/llm/inference/base.py +++ b/nemo/collections/llm/inference/base.py @@ -16,6 +16,7 @@ import nemo.lightning as nl from nemo.lightning import io +from nemo.lightning.ckpt_utils import ckpt_to_context_subdir from nemo.lightning.pytorch.strategies.megatron_strategy import MegatronStrategy from nemo.lightning.pytorch.strategies.utils import RestoreConfig @@ -44,6 +45,7 @@ def _setup_trainer_and_restore_model(path: Path, trainer: nl.Trainer, model: pl. load_optim_state=False, ) trainer.strategy.restore_config = restore_config + trainer.strategy._setup_optimizers = False trainer.ckpt_path = None trainer.strategy.connect(model) if trainer.strategy.launcher is not None: @@ -61,16 +63,22 @@ def _setup_trainer_and_restore_model(path: Path, trainer: nl.Trainer, model: pl. 
def setup_model_and_tokenizer( path: Path, - trainer: Optional[nl.Trainer] = None, + trainer: nl.Trainer, params_dtype: torch.dtype = torch.bfloat16, inference_batch_times_seqlen_threshold: int = 1000, ) -> tuple[MCoreGPTModel, MCoreTokenizerWrappper]: - model: io.TrainerContext = io.load_context(path=path, subpath="model") - trainer = trainer or io.load_context(path=path, subpath="trainer") + model: io.TrainerContext = io.load_context(path=ckpt_to_context_subdir(path), subpath="model") _setup_trainer_and_restore_model(path=path, trainer=trainer, model=model) # This is to get the MCore model required in GPTInferenceWrapper. - mcore_model = model.module.module.module + mcore_model = model + while mcore_model: + if type(mcore_model) is MCoreGPTModel: + break + mcore_model = getattr(mcore_model, "module", None) + if mcore_model is None or type(mcore_model) is not MCoreGPTModel: + raise ValueError("Exact McoreGPTModel instance not found in the model structure.") + inference_wrapped_model = GPTInferenceWrapper( mcore_model, InferenceWrapperConfig( diff --git a/nemo/collections/llm/peft/lora.py b/nemo/collections/llm/peft/lora.py index db4861e9e987..ecebf696a42c 100644 --- a/nemo/collections/llm/peft/lora.py +++ b/nemo/collections/llm/peft/lora.py @@ -12,10 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +import math +import re from dataclasses import dataclass, field from typing import List, Literal +import torch from megatron.core import parallel_state +from megatron.core.tensor_parallel import ColumnParallelLinear, RowParallelLinear from torch import nn from nemo.lightning.pytorch.callbacks.peft import PEFT, AdapterWrapper @@ -23,15 +27,16 @@ from nemo.utils.import_utils import safe_import_from TEColumnParallelLinear, HAVE_TE_COL_LINEAR = safe_import_from( - "megatron.core.transformer.custom_layers.transformer_engine", "TEColumnParallelLinear" + "megatron.core.extensions.transformer_engine", "TEColumnParallelLinear" ) -TELayerNormColumnParallelLinear, HAVE_TE_COL_LINEAR = safe_import_from( - "megatron.core.transformer.custom_layers.transformer_engine", +TELayerNormColumnParallelLinear, HAVE_TE_LN_COL_LINEAR = safe_import_from( + "megatron.core.extensions.transformer_engine", "TELayerNormColumnParallelLinear", ) TERowParallelLinear, HAVE_TE_ROW_LINEAR = safe_import_from( - "megatron.core.transformer.custom_layers.transformer_engine", "TERowParallelLinear" + "megatron.core.extensions.transformer_engine", "TERowParallelLinear" ) +HAVE_TE = all((HAVE_TE_COL_LINEAR, HAVE_TE_LN_COL_LINEAR, HAVE_TE_ROW_LINEAR)) class AdapterParallelAdd(AdapterWrapper): @@ -66,6 +71,49 @@ def forward(self, x): return linear_output + adapter_output, bias +class LinearAdapter(nn.Module): + def __init__( + self, orig_linear, dim=8, alpha=32, dropout=0.1, dropout_position='post', lora_A_init_method='xavier' + ): + super(LinearAdapter, self).__init__() + assert isinstance(orig_linear, nn.Linear) + + self.orig_linear = orig_linear + self.dim = dim + self.scale = alpha / dim + + # Freezer + device = self.orig_linear.weight.device + self.orig_linear.weight.requires_grad = False + if self.orig_linear.bias is not None: + self.orig_linear.bias.requires_grad = False + + in_features = self.orig_linear.in_features + out_features = self.orig_linear.out_features + dtype = self.orig_linear.weight.dtype + self.lora_a = nn.Parameter(torch.zeros((in_features, dim), dtype=dtype, device=device)) + self.lora_b = nn.Parameter(torch.zeros((dim, out_features), dtype=dtype, 
device=device)) + if lora_A_init_method == 'xavier': + torch.nn.init.uniform_(self.lora_a) + else: + nn.init.kaiming_uniform_(self.lora_a, a=math.sqrt(5)) + + self.dropout = nn.Dropout(p=dropout) + assert dropout_position in ['pre', 'post'], dropout_position + self.dropout_position = dropout_position + + def forward(self, x): + res = self.orig_linear(x) + if self.dropout_position == 'pre': + x = self.dropout(x) + lora_res = x @ self.lora_a + lora_res = lora_res @ self.lora_b + lora_res = lora_res * self.scale + if self.dropout_position == 'post': + lora_res = self.dropout(lora_res) + return res + lora_res + + @dataclass class LoRA(PEFT): """ @@ -82,6 +130,9 @@ class LoRA(PEFT): - 'linear_proj': Apply LoRA to the linear layer used for projecting the output of self-attention modules. - 'linear_fc1': Apply LoRA to the first fully-connected layer in MLP. - 'linear_fc2': Apply LoRA to the second fully-connected layer in MLP. + Target modules can also contain wildcards. For example, you can specify + target_modules=['*.layers.0.*.linear_qkv', '*.layers.1.*.linear_qkv'] to add LoRA to only linear_qkv + on the first two layers. dim (int): Dimension of the low-rank projection space. Defaults to 32. alpha (int): Weighting factor for the low-rank projection. Defaults to 32. dropout (float): Dropout rate for the low-rank projection. Defaults to 0.0. @@ -129,37 +180,48 @@ def transform(self, m: nn.Module, name=None, prefix=None): """ from nemo.collections.nlp.modules.common.megatron.adapters.parallel_adapters import ParallelLinearAdapter - tp_size = parallel_state.get_tensor_model_parallel_world_size() - if name in self.target_modules: - if name in ['linear_qkv', 'linear_fc1']: - # Column Parallel Linear + def wildcard_match(pattern, key): + if key is None: + return None + regex_pattern = re.compile("^" + pattern.replace("*", "(.*)") + "$") + match = regex_pattern.match(key) + return match is not None + + full_name = f"{prefix}.{name}" if prefix else name + if name in self.target_modules or any(wildcard_match(pattern, full_name) for pattern in self.target_modules): + if HAVE_TE and isinstance(m, TEColumnParallelLinear) or isinstance(m, TELayerNormColumnParallelLinear): input_is_parallel = False - if HAVE_TE_COL_LINEAR and ( - isinstance(m, TEColumnParallelLinear) or isinstance(m, TELayerNormColumnParallelLinear) - ): - # m.in_features and m.out_features are divided by tp_size already, - # but in_features and out_features passed to ParallelLinearAdapter are not. - in_features = m.in_features - out_features = m.out_features * tp_size - else: - in_features = m.input_size - out_features = m.output_size + # m.in_features and m.out_features are divided by tp_size already, + # but in_features and out_features passed to ParallelLinearAdapter are not. 
+ tp_size = parallel_state.get_tensor_model_parallel_world_size() + in_features = m.in_features + out_features = m.out_features * tp_size # LoRA is applied after layernorm, so layernorm output must be returned m.return_layernorm_output = True # perf optimization for LoRA + SP if m.config.sequence_parallel and not m.ub_overlap_ag: m.return_layernorm_output_gathered = True - else: # name in ['linear_proj', 'linear_fc2'] - # Row Parallel Linear + elif HAVE_TE and isinstance(m, TERowParallelLinear): + input_is_parallel = True + tp_size = parallel_state.get_tensor_model_parallel_world_size() + in_features = m.in_features * tp_size + out_features = m.out_features + elif isinstance(m, ColumnParallelLinear): + input_is_parallel = False + in_features = m.input_size + out_features = m.output_size + elif isinstance(m, RowParallelLinear): input_is_parallel = True - if HAVE_TE_ROW_LINEAR and isinstance(m, TERowParallelLinear): - in_features = m.in_features * tp_size - out_features = m.out_features - else: - in_features = m.input_size - out_features = m.output_size - - logging.info(f"Adding lora to: {prefix}.{name}") + in_features = m.input_size + out_features = m.output_size + elif isinstance(m, nn.Linear): + return LinearAdapter( + m, dim=self.dim, alpha=self.alpha, dropout=self.dropout, lora_A_init_method=self.lora_A_init_method + ) + else: + raise NotImplementedError(f"Layer type is unrecognized for LoRA: {type(m)}") + + logging.info(f"Adding lora to: {full_name}") adapter = ParallelLinearAdapter( in_features, out_features, diff --git a/nemo/collections/llm/recipes/__init__.py b/nemo/collections/llm/recipes/__init__.py index 6bee8c882ffd..21994b75f60d 100644 --- a/nemo/collections/llm/recipes/__init__.py +++ b/nemo/collections/llm/recipes/__init__.py @@ -14,6 +14,12 @@ from nemo.collections.llm.recipes import ( + baichuan2_7b, + chatglm3_6b, + gemma_2b, + gemma_7b, + gpt3_175b, + hf_auto_model_for_causal_lm, llama3_8b, llama3_8b_16k, llama3_8b_64k, @@ -21,7 +27,15 @@ llama3_70b_16k, llama3_70b_64k, llama31_405b, - mistral, + mamba2_1_3b, + mamba2_2_7b, + mamba2_8b, + mamba2_130m, + mamba2_370m, + mamba2_780m, + mamba2_hybrid_8b, + mistral_7b, + mistral_nemo_12b, mixtral_8x7b, mixtral_8x7b_16k, mixtral_8x7b_64k, @@ -41,6 +55,10 @@ from nemo.collections.llm.recipes.optim import adam __all__ = [ + "baichuan2_7b", + "chatglm3_6b", + "gemma_2b", + "gemma_7b", "llama3_8b", "llama3_8b_16k", "llama3_8b_64k", @@ -48,7 +66,16 @@ "llama3_70b_16k", "llama3_70b_64k", "llama31_405b", - "mistral", + "mamba2_130m", + "mamba2_370m", + "mamba2_780m", + "mamba2_1_3b", + "mamba2_2_7b", + "mamba2_8b", + "mamba2_hybrid_8b", + "mistral_7b", + "mistral_nemo_12b", + "hf_auto_model_for_causal_lm", "mixtral_8x7b", "mixtral_8x7b_16k", "mixtral_8x7b_64k", @@ -63,6 +90,7 @@ "nemotron4_22b_16k", "nemotron4_22b_64k", "nemotron4_340b", + "gpt3_175b", "adam", "default_log", "default_resume", diff --git a/nemo/collections/llm/recipes/baichuan2_7b.py b/nemo/collections/llm/recipes/baichuan2_7b.py new file mode 100644 index 000000000000..3ebb643af779 --- /dev/null +++ b/nemo/collections/llm/recipes/baichuan2_7b.py @@ -0,0 +1,285 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
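With the wildcard matching and the nn.Linear fallback above, the same LoRA object can be scoped to specific layers of a Megatron model, while plain Hugging Face linears get wrapped in LinearAdapter. A short sketch of both, with illustrative layer names and sizes:

import torch.nn as nn

from nemo.collections.llm.peft.lora import LinearAdapter, LoRA

# Restrict LoRA to the QKV projections of the first two transformer layers only.
lora = LoRA(target_modules=['*.layers.0.*.linear_qkv', '*.layers.1.*.linear_qkv'], dim=16, alpha=32)

# Non-Megatron modules: wrap a plain nn.Linear directly.
adapter = LinearAdapter(nn.Linear(128, 256), dim=8, alpha=32, dropout=0.0)
trainable = sum(p.numel() for p in adapter.parameters() if p.requires_grad)
print(trainable)  # only the low-rank A/B matrices remain trainable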
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Callable, Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections.llm import Baichuan2Config7B, Baichuan2Model +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback +from nemo.utils.exp_manager import TimingCallback + +NAME = "baichuan2_7b" + + +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Baichuan2 7B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Baichuan2 7B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=baichuan2_7b ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(Baichuan2Model, config=run.Config(Baichuan2Config7B)) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 2, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Baichuan2 7B model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=baichuan2_7b ... 
+ + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=True, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain +) -> run.Partial: + """ + Create a pre-training recipe for Baichuan2 7B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory baichuan2_7b + $ nemo llm pretrain --factory "baichuan2_7b(num_nodes=2, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="baichuan2_7b_pretrain", num_nodes=2) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=pretrain, name=NAME + "_optimized") +def pretrain_recipe_performance( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn: Callable = pretrain, +) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for Baichuan2 7B model. + + This recipe enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. 
+ num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Examples: + $ nemo llm pretrain --factory baichuan2_7b_optimized + + Python API usage: + >>> recipe = pretrain_recipe_performance(name="baichuan2_7b_perf", num_nodes=4) + >>> print(recipe) + + Note: + Use this recipe with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) + + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=False, + ) + ) + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + peft_scheme: Optional[str] = 'lora', +) -> run.Partial: + """ + Create a fine-tuning recipe for Baichuan2 7B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory baichuan2_7b + + Python API usage: + >>> recipe = finetune_recipe(name="baichuan2_7b_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + """ + recipe = default_finetune_recipe( + model(), "baichuan-inc/Baichuan2-7B-Base", dir, name, num_nodes, num_gpus_per_node + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 2 + recipe.optim.config.lr = 5e-6 + elif peft_scheme.lower() == 'lora': + recipe.peft = run.Config(LoRA) + recipe.optim.config.lr = 1e-4 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/chatglm3_6b.py b/nemo/collections/llm/recipes/chatglm3_6b.py new file mode 100644 index 000000000000..f5d580a9c6ea --- /dev/null +++ b/nemo/collections/llm/recipes/chatglm3_6b.py @@ -0,0 +1,283 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
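The new recipe modules (baichuan2_7b above, chatglm3_6b below, plus the gemma, gpt3, mamba, and mistral additions registered in recipes/__init__.py) all expose the same factory trio: model(), trainer(), and pretrain/finetune recipes. A brief sketch of consuming one from Python; the executor choice and overrides are illustrative:

import nemo_run as run

from nemo.collections.llm.recipes import baichuan2_7b

# Build the default LoRA fine-tuning recipe, then tweak it before launching.
recipe = baichuan2_7b.finetune_recipe(name="baichuan2_7b_lora", num_nodes=1, num_gpus_per_node=8)
recipe.trainer.max_steps = 200

run.run(recipe, executor=run.LocalExecutor())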
+ + +from typing import Callable, Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections.llm import ChatGLM3Config6B, ChatGLMModel +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback +from nemo.utils.exp_manager import TimingCallback + +NAME = "chatglm3_6b" + + +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a ChatGLM3 6B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the ChatGLM3 6B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=chatglm3_6b ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(ChatGLMModel, config=run.Config(ChatGLM3Config6B)) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 2, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for ChatGLM3 6B model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=chatglm3_6b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. 
+ """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=True, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain +) -> run.Partial: + """ + Create a pre-training recipe for ChatGLM3 6B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory chatglm3_6b + $ nemo llm pretrain --factory "chatglm3_6b(num_nodes=2, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="chatglm3_6b_pretrain", num_nodes=2) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=pretrain, name=NAME + "_optimized") +def pretrain_recipe_performance( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn: Callable = pretrain, +) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for ChatGLM3 6B model. + + This recipe enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. 
+ + Examples: + $ nemo llm pretrain --factory chatglm3_6b_optimized + + Python API usage: + >>> recipe = pretrain_recipe_performance(name="chatglm3_6b_perf", num_nodes=4) + >>> print(recipe) + + Note: + Use this recipe with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) + + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=False, + ) + ) + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + peft_scheme: Optional[str] = 'lora', +) -> run.Partial: + """ + Create a fine-tuning recipe for ChatGLM3 6B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory chatglm3_6b + + Python API usage: + >>> recipe = finetune_recipe(name="chatglm3_6b_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. 
+ """ + recipe = default_finetune_recipe(model(), "THUDM/chatglm3-6b", dir, name, num_nodes, num_gpus_per_node) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 2 + recipe.optim.config.lr = 5e-6 + elif peft_scheme.lower() == 'lora': + recipe.peft = run.Config(LoRA) + recipe.optim.config.lr = 1e-4 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/finetune_default.py b/nemo/collections/llm/recipes/finetune_default.py index 89c982613126..255763abbf50 100644 --- a/nemo/collections/llm/recipes/finetune_default.py +++ b/nemo/collections/llm/recipes/finetune_default.py @@ -60,7 +60,7 @@ def default_finetune_recipe( ), data=run.Config(llm.SquadDataModule, seq_length=2048, global_batch_size=128, micro_batch_size=1), log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), - optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=0, warmup_steps=50), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=0, warmup_steps=50, adam_beta2=0.98), resume=nemo_resume(resume_path), ) @@ -77,9 +77,9 @@ def default_finetune_trainer( num_nodes=1, num_gpus_per_node=8, max_steps=1000, - limit_test_batches=None, - limit_val_batches=None, - val_check_interval=5, + limit_test_batches=1, + limit_val_batches=1, + val_check_interval=30, ): strategy = run.Config( nl.MegatronStrategy, diff --git a/nemo/collections/llm/recipes/gemma_2b.py b/nemo/collections/llm/recipes/gemma_2b.py new file mode 100644 index 000000000000..cbcd340c1e92 --- /dev/null +++ b/nemo/collections/llm/recipes/gemma_2b.py @@ -0,0 +1,285 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +from typing import Callable, Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections.llm import GemmaConfig2B, GemmaModel +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback +from nemo.utils.exp_manager import TimingCallback + +NAME = "gemma_2b" + + +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Gemma 2B model configuration. 
+ + Returns: + run.Config[pl.LightningModule]: Configuration for the Gemma 2B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=gemma_2b ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(GemmaModel, config=run.Config(GemmaConfig2B)) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 2, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Gemma 2B model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=gemma_2b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=True, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain +) -> run.Partial: + """ + Create a pre-training recipe for Gemma 2B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. 
+ num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory gemma_2b + $ nemo llm pretrain --factory "gemma_2b(num_nodes=2, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="gemma_2b_pretrain", num_nodes=2) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=pretrain, name=NAME + "_optimized") +def pretrain_recipe_performance( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn: Callable = pretrain, +) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for Gemma 2B model. + + This recipe enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Examples: + $ nemo llm pretrain --factory gemma_2b_optimized + + Python API usage: + >>> recipe = pretrain_recipe_performance(name="gemma_2b_perf", num_nodes=4) + >>> print(recipe) + + Note: + Use this recipe with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) + + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=False, + ) + ) + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + peft_scheme: Optional[str] = 'lora', +) -> run.Partial: + """ + Create a fine-tuning recipe for Gemma 2B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. + + Returns: + run.Partial: Partial configuration for fine-tuning. 
+ + Examples: + CLI usage: + $ nemo llm finetune --factory gemma_2b + + Python API usage: + >>> recipe = finetune_recipe(name="gemma_2b_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + """ + # Disable cuDNN attention since TE 1.8 does not support head dim > 128 + os.environ['NVTE_FUSED_ATTN'] = "0" + + recipe = default_finetune_recipe(model(), "google/gemma-2b", dir, name, num_nodes, num_gpus_per_node) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 2 + recipe.optim.config.lr = 5e-6 + elif peft_scheme.lower() == 'lora': + recipe.peft = run.Config(LoRA) + recipe.optim.config.lr = 1e-4 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/gemma_7b.py b/nemo/collections/llm/recipes/gemma_7b.py new file mode 100644 index 000000000000..3b0e206d9ce7 --- /dev/null +++ b/nemo/collections/llm/recipes/gemma_7b.py @@ -0,0 +1,285 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +from typing import Callable, Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections.llm import GemmaConfig7B, GemmaModel +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback +from nemo.utils.exp_manager import TimingCallback + +NAME = "gemma_7b" + + +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Gemma 7B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Gemma 7B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=gemma_7b ... 
+ + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(GemmaModel, config=run.Config(GemmaConfig7B)) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 2, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Gemma 7B model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=gemma_7b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=True, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain +) -> run.Partial: + """ + Create a pre-training recipe for Gemma 7B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. 
+ + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory gemma_7b + $ nemo llm pretrain --factory "gemma_7b(num_nodes=2, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="gemma_7b_pretrain", num_nodes=2) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=pretrain, name=NAME + "_optimized") +def pretrain_recipe_performance( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn: Callable = pretrain, +) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for Gemma 7B model. + + This recipe enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Examples: + $ nemo llm pretrain --factory gemma_7b_optimized + + Python API usage: + >>> recipe = pretrain_recipe_performance(name="gemma_7b_perf", num_nodes=4) + >>> print(recipe) + + Note: + Use this recipe with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) + + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=False, + ) + ) + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + peft_scheme: Optional[str] = 'lora', +) -> run.Partial: + """ + Create a fine-tuning recipe for Gemma 7B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. + + Returns: + run.Partial: Partial configuration for fine-tuning. 
+ + Examples: + CLI usage: + $ nemo llm finetune --factory gemma_7b + + Python API usage: + >>> recipe = finetune_recipe(name="gemma_7b_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + """ + # Disable cuDNN attention since TE 1.8 does not support head dim > 128 + os.environ['NVTE_FUSED_ATTN'] = "0" + + recipe = default_finetune_recipe(model(), "google/gemma-7b", dir, name, num_nodes, num_gpus_per_node) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 2 + recipe.optim.config.lr = 5e-6 + elif peft_scheme.lower() == 'lora': + recipe.peft = run.Config(LoRA) + recipe.optim.config.lr = 1e-4 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/gpt3_175b.py b/nemo/collections/llm/recipes/gpt3_175b.py new file mode 100644 index 000000000000..1abe8a218e82 --- /dev/null +++ b/nemo/collections/llm/recipes/gpt3_175b.py @@ -0,0 +1,237 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Callable, Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections.llm.api import pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.model import GPTConfig175B, GPTModel +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import ( + userbuffers_bf16_h100_h12288_tp4_mbs1_seqlen2048, +) +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback +from nemo.utils.exp_manager import TimingCallback + +NAME = "gpt3_175b" + + +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a GPT3 175B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the GPT3 175B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=gpt3_175b ... 
+ + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(GPTModel, config=run.Config(GPTConfig175B)) + + +def trainer( + tensor_parallelism: int = 4, + pipeline_parallelism: int = 8, + pipeline_parallelism_type: Optional[torch.dtype] = torch.bfloat16, + virtual_pipeline_parallelism: Optional[int] = 6, + context_parallelism: int = 1, + sequence_parallelism: bool = True, + num_nodes: int = 64, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for GPT3 175B model. + + This function sets up the distributed training strategy optimized for the large 175B model. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=gpt3_175b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=64, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + This configuration uses extensive parallelism to handle the large model size efficiently. + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=True, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + average_in_collective=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + performance_mode: bool = False, + fn: Callable = pretrain, +) -> run.Partial: + """ + Create a pre-training recipe for GPT3 175B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. 
+ performance_mode (bool): If true, enables optimizations for maximum performance. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory gpt3_175b + $ nemo llm pretrain --factory "gpt3_175b(num_nodes=64, name='my_175b_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="gpt3_175b_pretrain", num_nodes=64) + >>> print(recipe) + + Note: + This recipe is optimized for the large 175B model and requires significant computational resources. + """ + recipe = run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config(MockDataModule, seq_length=2048, global_batch_size=2048, micro_batch_size=2), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=0.9e-4), + resume=default_resume(), + ) + + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) + + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for GPT3 175B model. + + This method enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Note: + Use this method with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. + """ + + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. + + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=True, + tp_comm_overlap_cfg=userbuffers_bf16_h100_h12288_tp4_mbs1_seqlen2048, + defer_embedding_wgrad_compute=True, + wgrad_deferral_limit=50, + overlap_param_gather_with_optimizer_step=True, + align_param_gather=True, + ) + ) + + return recipe diff --git a/nemo/collections/llm/recipes/hf_auto_model_for_causal_lm.py b/nemo/collections/llm/recipes/hf_auto_model_for_causal_lm.py new file mode 100644 index 000000000000..f3ac1d6975bc --- /dev/null +++ b/nemo/collections/llm/recipes/hf_auto_model_for_causal_lm.py @@ -0,0 +1,230 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
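Before the Hugging Face auto-model recipe that follows, a minimal usage sketch for the gpt3_175b factories above, assuming this patch is installed; the output directory is a placeholder.

    from nemo.collections.llm.recipes import gpt3_175b

    recipe = gpt3_175b.pretrain_recipe(
        name="gpt3_175b_pretrain",
        dir="/results",          # placeholder output directory
        num_nodes=64,
        num_gpus_per_node=8,
        performance_mode=True,   # appends the MegatronCommOverlapCallback configured above
    )

    # The result is a run.Partial, so individual fields can still be overridden before launch.
    recipe.trainer.val_check_interval = 1000
    print(recipe)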
+ + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.model.hf_auto_model_for_causal_lm import HfAutoModelForCausalLM +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import pytorch_adam_with_cosine_annealing +from nemo.utils.exp_manager import TimingCallback + +NAME = "hf_auto_model_for_causal_lm" + + +@run.cli.factory(name=NAME) +def model(model_name) -> run.Config[pl.LightningModule]: + """ + Factory function to create HfAutoModelForCausalLM model configurations. + + Args: + model_name (str): Model id on HF. + + Returns: + run.Config[pl.LightningModule]: Configuration for the HfAutoModelForCausalLM. + + Examples: + CLI usage: + $ nemo llm pretrain --factory 'HfAutoModelForCausalLM(model_name="mistralai/Mistral-Nemo-Instruct-2407")' + + Python API usage: + >>> model_config = model(model_name="mistralai/Mistral-Nemo-Instruct-2407") + >>> print(model_config) + """ + return run.Config(HfAutoModelForCausalLM, model_name=model_name) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 2, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 100, + callbacks: Optional[list[run.Config[Callback]]] = None, + strategy: Optional[str] = 'ddp', + gradient_clip_val: float = 1.0, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for HfAutoModelForCausalLM. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + strategy: Optional[str] = 'ddp': Parallelism strategy. + gradient_clip_val: float = 1.0: gradient-clip value. + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=HfAutoModelForCausalLM ... 
+ + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) + """ + strategy = str(strategy).lower() + assert strategy in ['', 'ddp', 'fsdp'], strategy + if strategy == 'fsdp': + # See: https://github.com/Lightning-AI/pytorch-lightning/blob/8ad3e29816a63d8ce5c00ac104b14729a4176f4f/src/lightning/pytorch/plugins/precision/fsdp.py#L81 + gradient_clip_val = None + + trainer = run.Config( + nl.Trainer, + devices=num_gpus_per_node, + max_steps=max_steps, + accelerator='gpu', + strategy=strategy, + log_every_n_steps=1, + limit_val_batches=0.0, + num_sanity_val_steps=0, + accumulate_grad_batches=10, + callbacks=callbacks, + gradient_clip_val=gradient_clip_val, + use_distributed_sampler=False, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn=pretrain, + model_name: str = '', +) -> run.Partial: + """ + Create a pre-training recipe for a HfAutoModelForCausalLM model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory 'HfAutoModelForCausalLM(model_name="mistralai/Mistral-Nemo-Instruct-2407")' + + Python API usage: + >>> recipe = pretrain_recipe(name="auto_pretrain", num_nodes=2, model_name="mistralai/Mistral-Nemo-Instruct-2407") + >>> print(recipe) + """ + return run.Partial( + fn, + model=model(model_name, load_pretrained_weights=False), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config(MockDataModule, seq_length=4096, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=pytorch_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + peft_scheme: Optional[str] = 'lora', + model_name: str = '', +) -> run.Partial: + """ + Create a fine-tuning recipe for a HfAutoModelForCausalLM model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. + + Returns: + run.Partial: Partial configuration for fine-tuning. 
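A small sketch of the strategy handling in trainer() above: under 'fsdp' the gradient_clip_val is dropped, since Lightning's FSDP precision plugin does not support gradient clipping by value (module path as in this patch).

    from nemo.collections.llm.recipes import hf_auto_model_for_causal_lm as hf_llm

    # Default DDP trainer keeps gradient clipping at 1.0.
    ddp_trainer = hf_llm.trainer(strategy="ddp")

    # FSDP trainer: gradient_clip_val is set to None inside trainer().
    fsdp_trainer = hf_llm.trainer(strategy="fsdp")

    print(ddp_trainer.gradient_clip_val, fsdp_trainer.gradient_clip_val)  # 1.0 None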
+ + Examples: + CLI usage: + $ nemo llm finetune --factory hf_auto_model_for_causal_lm + + Python API usage: + >>> recipe = finetune_recipe(name="llama3_8b_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + """ + recipe = run.Partial( + finetune, + model=model(model_name, load_pretrained_weights=True), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config(MockDataModule, seq_length=4096, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=pytorch_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.optim.config.lr = 5e-6 + elif peft_scheme.lower() == 'lora': + recipe.peft = run.Config(LoRA) + recipe.optim.config.lr = 1e-4 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/llama31_405b.py b/nemo/collections/llm/recipes/llama31_405b.py index f36773551ea0..055e9a06fcba 100644 --- a/nemo/collections/llm/recipes/llama31_405b.py +++ b/nemo/collections/llm/recipes/llama31_405b.py @@ -13,11 +13,12 @@ # limitations under the License. -from typing import Optional +from typing import Callable, Optional import nemo_run as run import pytorch_lightning as pl import torch +from megatron.core.distributed import DistributedDataParallelConfig from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl @@ -27,6 +28,10 @@ from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import ( + userbuffers_bf16_h100_h16384_tp8_cp2_mbs1_seqlen8192, +) +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback from nemo.utils.exp_manager import TimingCallback NAME = "llama31_405b" @@ -107,6 +112,14 @@ def trainer( gradient_as_bucket_view=True, ckpt_async_save=True, ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + average_in_collective=True, + ), ) trainer = run.Config( @@ -131,7 +144,12 @@ def trainer( @run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + performance_mode: bool = False, + fn: Callable = pretrain, ) -> run.Partial: """ Create a pre-training recipe for Llama3.1 405B model. @@ -144,6 +162,7 @@ def pretrain_recipe( name (str): Name of the pre-training run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. 
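As a sketch of how the DistributedDataParallelConfig block added in this hunk stays reachable on a built recipe (paths follow the configs above; the directory and flag values are illustrative only):

    from nemo.collections.llm.recipes import llama31_405b

    recipe = llama31_405b.pretrain_recipe(
        name="llama31_405b_pretrain",
        dir="/results",           # placeholder output directory
        performance_mode=True,
    )

    # Individual DDP flags remain adjustable per run, for example:
    recipe.trainer.strategy.ddp.grad_reduce_in_fp32 = False
    recipe.trainer.strategy.ddp.check_for_nan_in_grad = False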
Returns: @@ -161,7 +180,7 @@ def pretrain_recipe( Note: This recipe is optimized for the large 405B model and requires significant computational resources. """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=trainer( @@ -174,3 +193,47 @@ def pretrain_recipe( optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), resume=default_resume(), ) + + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) + + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for Llama3.1 405B model. + + This method enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Note: + Use this method with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. + """ + + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. + + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=True, + tp_comm_overlap_cfg=userbuffers_bf16_h100_h16384_tp8_cp2_mbs1_seqlen8192, + defer_embedding_wgrad_compute=True, + wgrad_deferral_limit=50, + overlap_param_gather_with_optimizer_step=True, + align_param_gather=True, + ) + ) + + return recipe diff --git a/nemo/collections/llm/recipes/llama3_70b.py b/nemo/collections/llm/recipes/llama3_70b.py index 9cfc198038f2..b283c68b222b 100644 --- a/nemo/collections/llm/recipes/llama3_70b.py +++ b/nemo/collections/llm/recipes/llama3_70b.py @@ -13,7 +13,7 @@ # limitations under the License. 
-from typing import Optional +from typing import Callable, Optional import nemo_run as run import pytorch_lightning as pl @@ -24,7 +24,6 @@ from nemo import lightning as nl from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.gpt.model.llama import Llama3Config70B, LlamaModel from nemo.collections.llm.peft.lora import LoRA from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe @@ -64,7 +63,7 @@ def trainer( virtual_pipeline_parallelism: Optional[int] = 5, context_parallelism: int = 2, sequence_parallelism: bool = True, - num_nodes: int = 1, + num_nodes: int = 4, num_gpus_per_node: int = 8, max_steps: int = 1168251, callbacks: Optional[list[run.Config[Callback]]] = None, @@ -117,6 +116,7 @@ def trainer( grad_reduce_in_fp32=True, overlap_grad_reduce=True, overlap_param_gather=True, + average_in_collective=True, ), ) @@ -142,7 +142,12 @@ def trainer( @run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + performance_mode: bool = False, + fn: Callable = pretrain, ) -> run.Partial: """ Create a pre-training recipe for Llama3 70B model. @@ -155,6 +160,7 @@ def pretrain_recipe( name (str): Name of the pre-training run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -172,7 +178,8 @@ def pretrain_recipe( Note: This recipe is optimized for the large 70B model and requires significant computational resources. """ - return run.Partial( + + recipe = run.Partial( fn, model=model(), trainer=trainer( @@ -186,40 +193,35 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Llama3 70B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. 
- Examples: - CLI usage: - $ nemo llm pretrain --factory "llama3_70b.pretrain_recipe_performance(num_nodes=4, name='perf_pretrain')" - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="llama3_70b_perf", num_nodes=4) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) + + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. recipe.trainer.callbacks.append( run.Config( @@ -228,6 +230,8 @@ def pretrain_recipe_performance( tp_comm_overlap_cfg=userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192, defer_embedding_wgrad_compute=True, wgrad_deferral_limit=22, + overlap_param_gather_with_optimizer_step=True, + align_param_gather=True, ) ) diff --git a/nemo/collections/llm/recipes/llama3_70b_16k.py b/nemo/collections/llm/recipes/llama3_70b_16k.py index c8c1957d7bdc..928f961f7cf3 100644 --- a/nemo/collections/llm/recipes/llama3_70b_16k.py +++ b/nemo/collections/llm/recipes/llama3_70b_16k.py @@ -49,7 +49,7 @@ def model() -> run.Config[pl.LightningModule]: def trainer( - num_nodes: int = 2, + num_nodes: int = 4, num_gpus_per_node: int = 8, ) -> run.Config: """ @@ -58,8 +58,8 @@ def trainer( This function sets up the distributed training strategy optimized for the large 70B model with longer sequences. Args: - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 4. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Config: Configuration for the NeMo Lightning Trainer. @@ -76,10 +76,10 @@ def trainer( This configuration uses extensive parallelism to handle the large model size and longer sequence length efficiently. """ return llama3_70b.trainer( - tensor_parallelism=2, - pipeline_parallelism=4, + tensor_parallelism=8, + pipeline_parallelism=2, pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=5, + virtual_pipeline_parallelism=None, context_parallelism=2, sequence_parallelism=True, num_nodes=num_nodes, @@ -91,7 +91,7 @@ def trainer( def pretrain_recipe( dir: Optional[str] = None, name: str = "default", - num_nodes: int = 2, + num_nodes: int = 4, num_gpus_per_node: int = 8, ) -> run.Partial: """ @@ -103,8 +103,8 @@ def pretrain_recipe( Args: dir (Optional[str]): Directory for saving logs and checkpoints. name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 4. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Partial: Partial configuration for pre-training. 
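A quick sanity check of the new 16k-sequence defaults (plain arithmetic, not part of the patched module): tensor, pipeline, and context parallelism now multiply out to exactly the default world size, so the recipe runs with a data-parallel size of 1 unless more nodes are requested.

    # llama3_70b_16k defaults after this change: TP=8, PP=2, CP=2 on 4 nodes x 8 GPUs.
    tensor_parallel, pipeline_parallel, context_parallel = 8, 2, 2
    num_nodes, gpus_per_node = 4, 8

    gpus_per_replica = tensor_parallel * pipeline_parallel * context_parallel  # 32
    world_size = num_nodes * gpus_per_node                                     # 32
    assert world_size % gpus_per_replica == 0
    print("data-parallel size:", world_size // gpus_per_replica)               # 1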
diff --git a/nemo/collections/llm/recipes/llama3_70b_64k.py b/nemo/collections/llm/recipes/llama3_70b_64k.py index 5d9845d9aaa7..ffadf5ca8084 100644 --- a/nemo/collections/llm/recipes/llama3_70b_64k.py +++ b/nemo/collections/llm/recipes/llama3_70b_64k.py @@ -21,7 +21,6 @@ from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.recipes import llama3_70b from nemo.utils.exp_manager import TimingCallback @@ -59,8 +58,8 @@ def trainer( This function sets up the distributed training strategy optimized for the large 70B model with long sequences. Args: - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 32. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Config: Configuration for the NeMo Lightning Trainer. @@ -81,7 +80,7 @@ def trainer( tensor_parallelism=8, pipeline_parallelism=4, pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=5, + virtual_pipeline_parallelism=None, context_parallelism=8, sequence_parallelism=True, num_nodes=num_nodes, @@ -106,8 +105,8 @@ def pretrain_recipe( Args: dir (Optional[str]): Directory for saving logs and checkpoints. name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 32. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Partial: Partial configuration for pre-training. diff --git a/nemo/collections/llm/recipes/llama3_8b.py b/nemo/collections/llm/recipes/llama3_8b.py index 4b2934739529..269eb7865dcf 100644 --- a/nemo/collections/llm/recipes/llama3_8b.py +++ b/nemo/collections/llm/recipes/llama3_8b.py @@ -117,6 +117,7 @@ def trainer( grad_reduce_in_fp32=True, overlap_grad_reduce=True, overlap_param_gather=True, + average_in_collective=True, ), ) @@ -142,7 +143,12 @@ def trainer( @run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + performance_mode: bool = False, + fn: Callable = pretrain, ) -> run.Partial: """ Create a pre-training recipe for Llama3 8B model. @@ -155,6 +161,7 @@ def pretrain_recipe( name (str): Name of the pre-training run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -173,7 +180,7 @@ def pretrain_recipe( For more details on pre-training LLMs with NeMo, see the pre-training guide in the `examples/llm/pretrain/` directory. 
""" - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=trainer( @@ -187,44 +194,29 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_optimized") -def pretrain_recipe_performance( - dir: Optional[str] = None, - name: str = "default", - num_nodes: int = 1, - num_gpus_per_node: int = 8, - fn: Callable = pretrain, -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Llama3 8B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. - Examples: - $ nemo llm pretrain --factory llama3_8b_optimized - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="llama3_8b_perf", num_nodes=4) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) - recipe.trainer.callbacks.append( run.Config( MegatronCommOverlapCallback, diff --git a/nemo/collections/llm/recipes/llama3_8b_16k.py b/nemo/collections/llm/recipes/llama3_8b_16k.py index 0b42b392827a..d6c1677a3b4b 100644 --- a/nemo/collections/llm/recipes/llama3_8b_16k.py +++ b/nemo/collections/llm/recipes/llama3_8b_16k.py @@ -49,7 +49,7 @@ def model() -> run.Config[pl.LightningModule]: def trainer( - num_nodes: int = 1, + num_nodes: int = 2, num_gpus_per_node: int = 8, ) -> run.Config: """ @@ -58,8 +58,8 @@ def trainer( This function sets up the distributed training strategy optimized for longer sequences. Args: - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 2. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Config: Configuration for the NeMo Lightning Trainer. @@ -76,10 +76,10 @@ def trainer( This configuration uses increased parallelism to handle the longer sequence length efficiently. 
""" return llama3_8b.trainer( - tensor_parallelism=2, - pipeline_parallelism=4, + tensor_parallelism=4, + pipeline_parallelism=2, pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=5, + virtual_pipeline_parallelism=None, context_parallelism=2, sequence_parallelism=True, num_nodes=num_nodes, @@ -91,7 +91,7 @@ def trainer( def pretrain_recipe( dir: Optional[str] = None, name: str = "default", - num_nodes: int = 1, + num_nodes: int = 2, num_gpus_per_node: int = 8, ) -> run.Partial: """ @@ -103,8 +103,8 @@ def pretrain_recipe( Args: dir (Optional[str]): Directory for saving logs and checkpoints. name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 2. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Partial: Partial configuration for pre-training. diff --git a/nemo/collections/llm/recipes/llama3_8b_64k.py b/nemo/collections/llm/recipes/llama3_8b_64k.py index 38f787113bf5..692347ea8dd0 100644 --- a/nemo/collections/llm/recipes/llama3_8b_64k.py +++ b/nemo/collections/llm/recipes/llama3_8b_64k.py @@ -49,7 +49,7 @@ def model() -> run.Config[pl.LightningModule]: def trainer( - num_nodes: int = 1, + num_nodes: int = 4, num_gpus_per_node: int = 8, ) -> run.Config: """ @@ -58,8 +58,8 @@ def trainer( This function sets up the distributed training strategy optimized for long sequences. Args: - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 4. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Config: Configuration for the NeMo Lightning Trainer. @@ -69,17 +69,17 @@ def trainer( $ nemo llm pretrain trainer=llama3_8b_64k ... Python API usage: - >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> trainer_config = trainer(num_nodes=4, num_gpus_per_node=8) >>> print(trainer_config) Note: This configuration uses significantly increased parallelism to handle the long sequence length efficiently. """ return llama3_8b.trainer( - tensor_parallelism=2, - pipeline_parallelism=4, + tensor_parallelism=4, + pipeline_parallelism=2, pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=5, + virtual_pipeline_parallelism=None, context_parallelism=4, sequence_parallelism=True, num_nodes=num_nodes, @@ -91,7 +91,7 @@ def trainer( def pretrain_recipe( dir: Optional[str] = None, name: str = "default", - num_nodes: int = 1, + num_nodes: int = 4, num_gpus_per_node: int = 8, ) -> run.Partial: """ @@ -103,8 +103,8 @@ def pretrain_recipe( Args: dir (Optional[str]): Directory for saving logs and checkpoints. name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 4. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Partial: Partial configuration for pre-training. 
@@ -112,10 +112,10 @@ def pretrain_recipe( Examples: CLI usage: $ nemo llm pretrain --factory llama3_8b_64k - $ nemo llm pretrain --factory "llama3_8b_64k(num_nodes=2, name='my_64k_pretrain')" + $ nemo llm pretrain --factory "llama3_8b_64k(num_nodes=4, name='my_64k_pretrain')" Python API usage: - >>> recipe = pretrain_recipe(name="llama3_8b_64k_pretrain", num_nodes=2) + >>> recipe = pretrain_recipe(name="llama3_8b_64k_pretrain", num_nodes=4) >>> print(recipe) Note: diff --git a/nemo/collections/llm/recipes/log/default.py b/nemo/collections/llm/recipes/log/default.py index 93bd9f9470fa..d83580a1a543 100644 --- a/nemo/collections/llm/recipes/log/default.py +++ b/nemo/collections/llm/recipes/log/default.py @@ -13,6 +13,7 @@ # limitations under the License. +from datetime import timedelta from typing import Optional from nemo_run import Config, cli @@ -50,7 +51,7 @@ def default_log( nl.ModelCheckpoint, save_last=True, save_top_k=10, - every_n_train_steps=200, + train_time_interval=Config(timedelta, minutes=15), filename="{model_name}--{val_loss:.2f}-{step}-{consumed_samples}", ) diff --git a/nemo/collections/llm/recipes/mamba2_130m.py b/nemo/collections/llm/recipes/mamba2_130m.py new file mode 100644 index 000000000000..08640604a112 --- /dev/null +++ b/nemo/collections/llm/recipes/mamba2_130m.py @@ -0,0 +1,321 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections import llm +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.utils.exp_manager import TimingCallback + +NAME = "mamba2_130m" + + +@run.cli.factory(name=NAME) +def tokenizer(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + + return run.Config( + get_nmt_tokenizer, + library='huggingface', + model_name="EleutherAI/gpt-neox-20b", + tokenizer_model=tokenizer_model, + use_fast=True, + ) + + +@run.cli.factory(name=NAME) +def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mamba2 130M model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mamba2 130M model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mamba2_130m ... 
+ + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config( + llm.GPTModel, config=run.Config(llm.BaseMambaConfig130M), tokenizer=tokenizer(tokenizer_model=tokenizer_model) + ) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 1, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mamba2 130M model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mamba2_130m ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=1) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=False, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn=pretrain, +) -> run.Partial: + """ + Create a pre-training recipe for Mamba2 130M model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. 
+ num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mamba2_130M + $ nemo llm pretrain --factory "mamba2_130M(num_nodes=1, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mamba2_130M_pretrain", num_nodes=1) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config( + MockDataModule, + seq_length=4096, + global_batch_size=8, + micro_batch_size=1, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + resume_path: str = None, + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + gbs: int = 8, + mbs: int = 1, + peft_scheme: Optional[str] = 'none', +) -> run.Partial: + """ + Create a fine-tuning recipe for Mamba2 130M model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + resume_path (str): Path to the NeMo checkpoint (refer to notes below + on how to convert a pytorch checkpoint to NeMo) + tokenizer_model (str): Path to tokenizer model (defaults to None) + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mamba2_130m + + Python API usage: + >>> recipe = finetune_recipe(name="mamba2_130m_finetune", num_nodes=1) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. 
+ For converting an SSM pytorch checkpoint, use the following line of python code: + + llm.GPTModel(llm.BaseMambaConfig130M(), tokenizer=tokenizer()).import_ckpt( + path="pytorch://ABSOLUTE_PATH_TO_CKPT/your_pytorch_state_dict_file", + model_config=llm.BaseMambaConfig130M()) + This line will cache the nemo checkpoint to following directory: + /root/.cache/nemo/models/your_pytorch_state_dict_file + + """ + nemo_resume = run.Config( + nl.AutoResume, + restore_config=run.Config(nl.RestoreConfig, path=resume_path), + ) + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + gradient_as_bucket_view=True, + ckpt_load_optimizer=False, + ckpt_save_optimizer=False, + ckpt_async_save=False, + ) + checkpoint_callback = run.Config( + nl.ModelCheckpoint, + every_n_train_steps=10, + dirpath=dir, + ) + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + devices=num_gpus_per_node, + limit_test_batches=10, + limit_val_batches=10, + log_every_n_steps=20, + max_steps=100, + num_nodes=num_nodes, + plugins=run.Config( + nl.MegatronMixedPrecision, + precision="bf16-mixed", + params_dtype=torch.bfloat16, + ), + callbacks=[checkpoint_callback], + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=20, + ) + recipe = run.Partial( + llm.finetune, + model=model(tokenizer_model=tokenizer_model), + trainer=trainer, + data=run.Config( + llm.SquadDataModule, + seq_length=2048, + global_batch_size=gbs, + micro_batch_size=mbs, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=0, warmup_steps=50), + resume=nemo_resume, + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 1 + recipe.optim.config.lr = 5e-6 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/mamba2_1_3b.py b/nemo/collections/llm/recipes/mamba2_1_3b.py new file mode 100644 index 000000000000..58eaf049b059 --- /dev/null +++ b/nemo/collections/llm/recipes/mamba2_1_3b.py @@ -0,0 +1,321 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections import llm +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.utils.exp_manager import TimingCallback + +NAME = "mamba2_1_3b" + + +@run.cli.factory(name=NAME) +def tokenizer(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + + return run.Config( + get_nmt_tokenizer, + library='huggingface', + model_name="EleutherAI/gpt-neox-20b", + tokenizer_model=tokenizer_model, + use_fast=True, + ) + + +@run.cli.factory(name=NAME) +def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mamba2 1.3B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mamba2 1.3B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mamba2_1_3B ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config( + llm.GPTModel, config=run.Config(llm.BaseMambaConfig1_3B), tokenizer=tokenizer(tokenizer_model=tokenizer_model) + ) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 1, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mamba2 1.3B model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mamba2_1_3b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=1) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. 
+ """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=False, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn=pretrain, +) -> run.Partial: + """ + Create a pre-training recipe for Mamba2 1.3B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mamba2_1_3b + $ nemo llm pretrain --factory "mamba2_1_3b(num_nodes=1, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mamba2_1_3b_pretrain", num_nodes=1) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config( + MockDataModule, + seq_length=4096, + global_batch_size=8, + micro_batch_size=1, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + resume_path: str = None, + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + gbs: int = 8, + mbs: int = 1, + peft_scheme: Optional[str] = 'none', +) -> run.Partial: + """ + Create a fine-tuning recipe for Mamba2 1.3B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. 
+ resume_path (str): Path to the NeMo checkpoint (refer to notes below + on how to convert a pytorch checkpoint to NeMo) + tokenizer_model (str): Path to tokenizer model (defaults to None) + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mamba2_1_3b + + Python API usage: + >>> recipe = finetune_recipe(name="mamba2_1_3b_finetune", num_nodes=1) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + For converting an SSM pytorch checkpoint, use the following line of python code: + + llm.GPTModel(llm.BaseMambaConfig1_3B(), tokenizer=tokenizer()).import_ckpt( + path="pytorch://ABSOLUTE_PATH_TO_CKPT/your_pytorch_state_dict_file", + model_config=llm.BaseMambaConfig1_3B()) + This line will cache the nemo checkpoint to following directory: + /root/.cache/nemo/models/your_pytorch_state_dict_file + + """ + nemo_resume = run.Config( + nl.AutoResume, + restore_config=run.Config(nl.RestoreConfig, path=resume_path), + ) + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + gradient_as_bucket_view=True, + ckpt_load_optimizer=False, + ckpt_save_optimizer=False, + ckpt_async_save=False, + ) + checkpoint_callback = run.Config( + nl.ModelCheckpoint, + every_n_train_steps=10, + dirpath=dir, + ) + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + devices=num_gpus_per_node, + limit_test_batches=10, + limit_val_batches=10, + log_every_n_steps=20, + max_steps=100, + num_nodes=num_nodes, + plugins=run.Config( + nl.MegatronMixedPrecision, + precision="bf16-mixed", + params_dtype=torch.bfloat16, + ), + callbacks=[checkpoint_callback], + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=20, + ) + recipe = run.Partial( + llm.finetune, + model=model(tokenizer_model=tokenizer_model), + trainer=trainer, + data=run.Config( + llm.SquadDataModule, + seq_length=2048, + global_batch_size=gbs, + micro_batch_size=mbs, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=0, warmup_steps=50), + resume=nemo_resume, + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 1 + recipe.optim.config.lr = 5e-6 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/mamba2_2_7b.py b/nemo/collections/llm/recipes/mamba2_2_7b.py new file mode 100644 index 000000000000..5cb37c6a02a5 --- /dev/null +++ b/nemo/collections/llm/recipes/mamba2_2_7b.py @@ -0,0 +1,321 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections import llm +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.utils.exp_manager import TimingCallback + +NAME = "mamba2_2_7b" + + +@run.cli.factory(name=NAME) +def tokenizer(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + + return run.Config( + get_nmt_tokenizer, + library='huggingface', + model_name="EleutherAI/gpt-neox-20b", + tokenizer_model=tokenizer_model, + use_fast=True, + ) + + +@run.cli.factory(name=NAME) +def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mamba2 2.7B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mamba2 2.7B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mamba2_2_7B ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config( + llm.GPTModel, config=run.Config(llm.BaseMambaConfig2_7B), tokenizer=tokenizer(tokenizer_model=tokenizer_model) + ) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 1, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mamba2 2.7B model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mamba2_2_7b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=1) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. 
+ """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=False, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn=pretrain, +) -> run.Partial: + """ + Create a pre-training recipe for Mamba2 2.7B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mamba2_2_7b + $ nemo llm pretrain --factory "mamba2_2_7b(num_nodes=1, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mamba2_2_7b_pretrain", num_nodes=1) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config( + MockDataModule, + seq_length=4096, + global_batch_size=8, + micro_batch_size=1, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + resume_path: str = None, + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + gbs: int = 8, + mbs: int = 1, + peft_scheme: Optional[str] = 'none', +) -> run.Partial: + """ + Create a fine-tuning recipe for Mamba2 2.7B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. 
+ resume_path (str): Path to the NeMo checkpoint (refer to notes below + on how to convert a pytorch checkpoint to NeMo) + tokenizer_model (str): Path to tokenizer model (defaults to None) + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mamba2_2_7b + + Python API usage: + >>> recipe = finetune_recipe(name="mamba2_2_7b_finetune", num_nodes=1) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + For converting an SSM pytorch checkpoint, use the following line of python code: + + llm.GPTModel(llm.BaseMambaConfig2_7B(), tokenizer=tokenizer()).import_ckpt( + path="pytorch://ABSOLUTE_PATH_TO_CKPT/your_pytorch_state_dict_file", + model_config=llm.BaseMambaConfig2_7B()) + This line will cache the nemo checkpoint to following directory: + /root/.cache/nemo/models/your_pytorch_state_dict_file + + """ + nemo_resume = run.Config( + nl.AutoResume, + restore_config=run.Config(nl.RestoreConfig, path=resume_path), + ) + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + gradient_as_bucket_view=True, + ckpt_load_optimizer=False, + ckpt_save_optimizer=False, + ckpt_async_save=False, + ) + checkpoint_callback = run.Config( + nl.ModelCheckpoint, + every_n_train_steps=10, + dirpath=dir, + ) + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + devices=num_gpus_per_node, + limit_test_batches=10, + limit_val_batches=10, + log_every_n_steps=20, + max_steps=100, + num_nodes=num_nodes, + plugins=run.Config( + nl.MegatronMixedPrecision, + precision="bf16-mixed", + params_dtype=torch.bfloat16, + ), + callbacks=[checkpoint_callback], + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=20, + ) + recipe = run.Partial( + llm.finetune, + model=model(tokenizer_model=tokenizer_model), + trainer=trainer, + data=run.Config( + llm.SquadDataModule, + seq_length=2048, + global_batch_size=gbs, + micro_batch_size=mbs, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=0, warmup_steps=50), + resume=nemo_resume, + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 1 + recipe.optim.config.lr = 5e-6 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/mamba2_370m.py b/nemo/collections/llm/recipes/mamba2_370m.py new file mode 100644 index 000000000000..bb8bddc4045a --- /dev/null +++ b/nemo/collections/llm/recipes/mamba2_370m.py @@ -0,0 +1,321 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections import llm +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.utils.exp_manager import TimingCallback + +NAME = "mamba2_370m" + + +@run.cli.factory(name=NAME) +def tokenizer(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + + return run.Config( + get_nmt_tokenizer, + library='huggingface', + model_name="EleutherAI/gpt-neox-20b", + tokenizer_model=tokenizer_model, + use_fast=True, + ) + + +@run.cli.factory(name=NAME) +def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mamba2 370M model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mamba2 370M model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mamba2_370m ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config( + llm.GPTModel, config=run.Config(llm.BaseMambaConfig370M), tokenizer=tokenizer(tokenizer_model=tokenizer_model) + ) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 1, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mamba2 370M model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mamba2_370m ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=1) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. 
+ """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=False, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn=pretrain, +) -> run.Partial: + """ + Create a pre-training recipe for Mamba2 370M model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mamba2_370M + $ nemo llm pretrain --factory "mamba2_370M(num_nodes=1, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mamba2_370M_pretrain", num_nodes=1) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config( + MockDataModule, + seq_length=4096, + global_batch_size=8, + micro_batch_size=1, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + resume_path: str = None, + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + gbs: int = 8, + mbs: int = 1, + peft_scheme: Optional[str] = 'none', +) -> run.Partial: + """ + Create a fine-tuning recipe for Mamba2 370M model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. 
+ resume_path (str): Path to the NeMo checkpoint (refer to notes below + on how to convert a pytorch checkpoint to NeMo) + tokenizer_model (str): Path to tokenizer model (defaults to None) + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mamba2_370m + + Python API usage: + >>> recipe = finetune_recipe(name="mamba2_370m_finetune", num_nodes=1) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + For converting an SSM pytorch checkpoint, use the following line of python code: + + llm.GPTModel(llm.BaseMambaConfig370M(), tokenizer=tokenizer()).import_ckpt( + path="pytorch://ABSOLUTE_PATH_TO_CKPT/your_pytorch_state_dict_file", + model_config=llm.BaseMambaConfig370M()) + This line will cache the nemo checkpoint to following directory: + /root/.cache/nemo/models/your_pytorch_state_dict_file + + """ + nemo_resume = run.Config( + nl.AutoResume, + restore_config=run.Config(nl.RestoreConfig, path=resume_path), + ) + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + gradient_as_bucket_view=True, + ckpt_load_optimizer=False, + ckpt_save_optimizer=False, + ckpt_async_save=False, + ) + checkpoint_callback = run.Config( + nl.ModelCheckpoint, + every_n_train_steps=10, + dirpath=dir, + ) + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + devices=num_gpus_per_node, + limit_test_batches=10, + limit_val_batches=10, + log_every_n_steps=20, + max_steps=100, + num_nodes=num_nodes, + plugins=run.Config( + nl.MegatronMixedPrecision, + precision="bf16-mixed", + params_dtype=torch.bfloat16, + ), + callbacks=[checkpoint_callback], + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=20, + ) + recipe = run.Partial( + llm.finetune, + model=model(tokenizer_model=tokenizer_model), + trainer=trainer, + data=run.Config( + llm.SquadDataModule, + seq_length=2048, + global_batch_size=gbs, + micro_batch_size=mbs, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=0, warmup_steps=50), + resume=nemo_resume, + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 1 + recipe.optim.config.lr = 5e-6 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/mamba2_780m.py b/nemo/collections/llm/recipes/mamba2_780m.py new file mode 100644 index 000000000000..2f6ab6717ae1 --- /dev/null +++ b/nemo/collections/llm/recipes/mamba2_780m.py @@ -0,0 +1,321 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections import llm +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.utils.exp_manager import TimingCallback + +NAME = "mamba2_780m" + + +@run.cli.factory(name=NAME) +def tokenizer(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + + return run.Config( + get_nmt_tokenizer, + library='huggingface', + model_name="EleutherAI/gpt-neox-20b", + tokenizer_model=tokenizer_model, + use_fast=True, + ) + + +@run.cli.factory(name=NAME) +def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mamba2 780M model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mamba2 780M model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mamba2_780m ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config( + llm.GPTModel, config=run.Config(llm.BaseMambaConfig780M), tokenizer=tokenizer(tokenizer_model=tokenizer_model) + ) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 1, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mamba2 780M model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mamba2_780m ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=1) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. 
+ """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=False, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn=pretrain, +) -> run.Partial: + """ + Create a pre-training recipe for Mamba2 780M model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mamba2_780M + $ nemo llm pretrain --factory "mamba2_780M(num_nodes=1, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mamba2_780M_pretrain", num_nodes=1) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config( + MockDataModule, + seq_length=4096, + global_batch_size=8, + micro_batch_size=1, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + resume_path: str = None, + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + gbs: int = 8, + mbs: int = 1, + peft_scheme: Optional[str] = 'none', +) -> run.Partial: + """ + Create a fine-tuning recipe for Mamba2 780M model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. 
+ resume_path (str): Path to the NeMo checkpoint (refer to notes below + on how to convert a pytorch checkpoint to NeMo) + tokenizer_model (str): Path to tokenizer model (defaults to None) + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mamba2_780m + + Python API usage: + >>> recipe = finetune_recipe(name="mamba2_780m_finetune", num_nodes=1) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + For converting an SSM pytorch checkpoint, use the following line of python code: + + llm.GPTModel(llm.BaseMambaConfig780M(), tokenizer=tokenizer()).import_ckpt( + path="pytorch://ABSOLUTE_PATH_TO_CKPT/your_pytorch_state_dict_file", + model_config=llm.BaseMambaConfig780M()) + This line will cache the nemo checkpoint to following directory: + /root/.cache/nemo/models/your_pytorch_state_dict_file + + """ + nemo_resume = run.Config( + nl.AutoResume, + restore_config=run.Config(nl.RestoreConfig, path=resume_path), + ) + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + gradient_as_bucket_view=True, + ckpt_load_optimizer=False, + ckpt_save_optimizer=False, + ckpt_async_save=False, + ) + checkpoint_callback = run.Config( + nl.ModelCheckpoint, + every_n_train_steps=10, + dirpath=dir, + ) + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + devices=num_gpus_per_node, + limit_test_batches=10, + limit_val_batches=10, + log_every_n_steps=20, + max_steps=100, + num_nodes=num_nodes, + plugins=run.Config( + nl.MegatronMixedPrecision, + precision="bf16-mixed", + params_dtype=torch.bfloat16, + ), + callbacks=[checkpoint_callback], + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=20, + ) + recipe = run.Partial( + llm.finetune, + model=model(tokenizer_model=tokenizer_model), + trainer=trainer, + data=run.Config( + llm.SquadDataModule, + seq_length=2048, + global_batch_size=gbs, + micro_batch_size=mbs, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=0, warmup_steps=50), + resume=nemo_resume, + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 1 + recipe.optim.config.lr = 5e-6 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/mamba2_8b.py b/nemo/collections/llm/recipes/mamba2_8b.py new file mode 100644 index 000000000000..58883deba732 --- /dev/null +++ b/nemo/collections/llm/recipes/mamba2_8b.py @@ -0,0 +1,321 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections import llm +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.utils.exp_manager import TimingCallback + +NAME = "mamba2_8b" + + +@run.cli.factory(name=NAME) +def tokenizer(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + + return run.Config( + get_nmt_tokenizer, + library='megatron', + model_name="GPTSentencePieceTokenizer", + tokenizer_model=tokenizer_model, + use_fast=True, + ) + + +@run.cli.factory(name=NAME) +def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mamba2 8B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mamba2 Hybrid 8B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mamba2_8b ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config( + llm.GPTModel, config=run.Config(llm.NVIDIAMambaConfig8B), tokenizer=tokenizer(tokenizer_model=tokenizer_model) + ) + + +def trainer( + tensor_parallelism: int = 8, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 1, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mamba2 Hybrid 8B model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mamba2_8b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=1) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. 
+ """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=False, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn=pretrain, +) -> run.Partial: + """ + Create a pre-training recipe for Mamba2 Hybrid 8B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mamba2_8b + $ nemo llm pretrain --factory "mamba2_8b(num_nodes=1, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mamba2_8b_pretrain", num_nodes=1) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config( + MockDataModule, + seq_length=4096, + global_batch_size=8, + micro_batch_size=1, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + resume_path, + tokenizer_model, + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + gbs: int = 8, + mbs: int = 1, + peft_scheme: Optional[str] = 'none', +) -> run.Partial: + """ + Create a fine-tuning recipe for Mamba2 8B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. 
+ resume_path (str): Path to the NeMo checkpoint (refer to notes below + on how to convert a pytorch checkpoint to NeMo) + tokenizer_model (str): Path to tokenizer model (defaults to None) + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mamba2_8b + + Python API usage: + >>> recipe = finetune_recipe(name="mamba2_8b_finetune", num_nodes=1) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + For converting an SSM pytorch checkpoint, use the following line of python code: + + llm.GPTModel(llm.NVIDIAMambaConfig8B(), tokenizer=tokenizer(tokenizer_model=tokenizer_model)).import_ckpt( + path="pytorch://ABSOLUTE_PATH_TO_CKPT/your_pytorch_state_dict_file", + model_config=llm.NVIDIAMambaConfig8B()) + This line will cache the nemo checkpoint to following directory: + /root/.cache/nemo/models/your_pytorch_state_dict_file + + """ + nemo_resume = run.Config( + nl.AutoResume, + restore_config=run.Config(nl.RestoreConfig, path=resume_path), + ) + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=8, + pipeline_model_parallel_size=1, + gradient_as_bucket_view=True, + ckpt_load_optimizer=False, + ckpt_save_optimizer=False, + ckpt_async_save=False, + ) + checkpoint_callback = run.Config( + nl.ModelCheckpoint, + every_n_train_steps=10, + dirpath=dir, + ) + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + devices=num_gpus_per_node, + limit_test_batches=10, + limit_val_batches=10, + log_every_n_steps=20, + max_steps=100, + num_nodes=num_nodes, + plugins=run.Config( + nl.MegatronMixedPrecision, + precision="bf16-mixed", + params_dtype=torch.bfloat16, + ), + callbacks=[checkpoint_callback], + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=20, + ) + recipe = run.Partial( + llm.finetune, + model=model(tokenizer_model=tokenizer_model), + trainer=trainer, + data=run.Config( + llm.SquadDataModule, + seq_length=2048, + global_batch_size=gbs, + micro_batch_size=mbs, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=0, warmup_steps=50), + resume=nemo_resume, + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 8 + recipe.optim.config.lr = 5e-6 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/mamba2_hybrid_8b.py b/nemo/collections/llm/recipes/mamba2_hybrid_8b.py new file mode 100644 index 000000000000..eff37da46fca --- /dev/null +++ b/nemo/collections/llm/recipes/mamba2_hybrid_8b.py @@ -0,0 +1,323 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections import llm +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.utils.exp_manager import TimingCallback + +NAME = "mamba2_hybrid_8b" + + +@run.cli.factory(name=NAME) +def tokenizer(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + + return run.Config( + get_nmt_tokenizer, + library='megatronNVIDIAMambaConfig8B', + model_name="GPTSentencePieceTokenizer", + tokenizer_model=tokenizer_model, + use_fast=True, + ) + + +@run.cli.factory(name=NAME) +def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mamba2 Hybrid 8B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mamba2 Hybrid 8B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mamba2_hybrid_8b ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config( + llm.GPTModel, + config=run.Config(llm.NVIDIAMambaHybridConfig8B), + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ) + + +def trainer( + tensor_parallelism: int = 8, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 1, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mamba2 Hybrid 8B model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. 
+ + Examples: + CLI usage: + $ nemo llm pretrain trainer=mamba2_hybrid_8b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=1) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=False, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn=pretrain, +) -> run.Partial: + """ + Create a pre-training recipe for Mamba2 Hybrid 8B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mamba2_hybrid_8b + $ nemo llm pretrain --factory "mamba2_hybrid_8b(num_nodes=1, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mamba2_hybrid_8b_pretrain", num_nodes=1) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config( + MockDataModule, + seq_length=4096, + global_batch_size=8, + micro_batch_size=1, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + resume_path, + tokenizer_model, + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + gbs: int = 8, + mbs: int = 1, + peft_scheme: Optional[str] = 'none', +) -> run.Partial: + """ + Create a fine-tuning recipe for Mamba2 Hybrid 8B model. 
+ + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + resume_path (str): Path to the NeMo checkpoint (refer to notes below + on how to convert a pytorch checkpoint to NeMo) + tokenizer_model (str): Path to tokenizer model (defaults to None) + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mamba2_hybrid_8b + + Python API usage: + >>> recipe = finetune_recipe(name="mamba2_hybrid_8b_finetune", num_nodes=1) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + For converting an SSM pytorch checkpoint, use the following line of python code: + + llm.GPTModel(llm.NVIDIAMambaHybridConfig8B(), tokenizer=tokenizer(tokenizer_model=tokenizer_model)).import_ckpt( + path="pytorch://ABSOLUTE_PATH_TO_CKPT/your_pytorch_state_dict_file", + model_config=llm.NVIDIAMambaHybridConfig8B()) + This line will cache the nemo checkpoint to following directory: + /root/.cache/nemo/models/your_pytorch_state_dict_file + + """ + nemo_resume = run.Config( + nl.AutoResume, + restore_config=run.Config(nl.RestoreConfig, path=resume_path), + ) + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=8, + pipeline_model_parallel_size=1, + gradient_as_bucket_view=True, + ckpt_load_optimizer=False, + ckpt_save_optimizer=False, + ckpt_async_save=False, + ) + checkpoint_callback = run.Config( + nl.ModelCheckpoint, + every_n_train_steps=10, + dirpath=dir, + ) + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + devices=num_gpus_per_node, + limit_test_batches=10, + limit_val_batches=10, + log_every_n_steps=20, + max_steps=100, + num_nodes=num_nodes, + plugins=run.Config( + nl.MegatronMixedPrecision, + precision="bf16-mixed", + params_dtype=torch.bfloat16, + ), + callbacks=[checkpoint_callback], + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=20, + ) + recipe = run.Partial( + llm.finetune, + model=model(tokenizer_model=tokenizer_model), + trainer=trainer, + data=run.Config( + llm.SquadDataModule, + seq_length=2048, + global_batch_size=gbs, + micro_batch_size=mbs, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=0, warmup_steps=50), + resume=nemo_resume, + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 8 + recipe.optim.config.lr = 5e-6 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/mistral.py b/nemo/collections/llm/recipes/mistral_7b.py similarity index 99% rename from nemo/collections/llm/recipes/mistral.py rename to nemo/collections/llm/recipes/mistral_7b.py index 2b8c42e54ee7..6e82df598140 100644 --- a/nemo/collections/llm/recipes/mistral.py +++ b/nemo/collections/llm/recipes/mistral_7b.py @@ -33,7 +33,7 @@ from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed 
from nemo.utils.exp_manager import TimingCallback -NAME = "mistral" +NAME = "mistral_7b" @run.cli.factory(name=NAME) diff --git a/nemo/collections/llm/recipes/mistral_nemo_12b.py b/nemo/collections/llm/recipes/mistral_nemo_12b.py new file mode 100644 index 000000000000..e74fa5435b62 --- /dev/null +++ b/nemo/collections/llm/recipes/mistral_nemo_12b.py @@ -0,0 +1,285 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Callable, Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.data.squad import SquadDataModule +from nemo.collections.llm.gpt.model.mistral import MistralModel, MistralNeMoConfig12B +from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback +from nemo.utils.exp_manager import TimingCallback + +NAME = "mistral_nemo_base_12b" + + +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mistral-Nemo-Base-12B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mistral-Nemo-Base-12B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mistral_nemo_base_12b ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(MistralModel, config=run.Config(MistralNeMoConfig12B)) + + +def trainer( + tensor_parallelism: int = 2, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 2, + sequence_parallelism: bool = True, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mistral-Nemo-Base-12B model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. 
+ context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mistral_nemo_base_12b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=True, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain +) -> run.Partial: + """ + Create a pre-training recipe for Mistral-Nemo-Base-12B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mistral_nemo_base_12b + $ nemo llm pretrain --factory "mistral_nemo_base_12b(num_nodes=2, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mistral_nemo_base_12b", num_nodes=2) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. 
+ """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=pretrain, name=NAME + "_optimized") +def pretrain_recipe_performance( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn: Callable = pretrain, +) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for Mistral-Nemo-Base-12B model. + + This recipe enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Examples: + $ nemo llm pretrain --factory mistral_nemo_base_12b_optimized + + Python API usage: + >>> recipe = pretrain_recipe_performance(name="mistral_nemo_base_12b_perf", num_nodes=4) + >>> print(recipe) + + Note: + Use this recipe with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) + + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=True, + ) + ) + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + peft_scheme: Optional[str] = 'lora', +) -> run.Partial: + """ + Create a fine-tuning recipe for Mistral-Nemo-Base-12B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mistral_nemo_base_12b + + Python API usage: + >>> recipe = finetune_recipe(name="mistral_nemo_base_12b_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. 
+ """ + recipe = default_finetune_recipe( + model(), "mistralai/Mistral-Nemo-Base-2407", dir, name, num_nodes, num_gpus_per_node + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.optim.config.lr = 5e-6 + elif peft_scheme.lower() == 'lora': + recipe.peft = run.Config(LoRA, target_modules=['linear_qkv', 'linear_proj'], dim=32) + recipe.optim.config.lr = 1e-4 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/mixtral_8x22b.py b/nemo/collections/llm/recipes/mixtral_8x22b.py index 222a37d7a0c5..b9db14324c97 100644 --- a/nemo/collections/llm/recipes/mixtral_8x22b.py +++ b/nemo/collections/llm/recipes/mixtral_8x22b.py @@ -13,7 +13,7 @@ # limitations under the License. -from typing import Optional +from typing import Callable, Optional import nemo_run as run import pytorch_lightning as pl @@ -117,6 +117,9 @@ def trainer( DistributedDataParallelConfig, check_for_nan_in_grad=True, grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + average_in_collective=True, ), ) @@ -142,7 +145,12 @@ def trainer( @run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 16, num_gpus_per_node: int = 8, fn=pretrain + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 16, + num_gpus_per_node: int = 8, + performance_mode: bool = False, + fn: Callable = pretrain, ) -> run.Partial: """ Create a pre-training recipe for Mixtral 8x22B model. @@ -155,6 +163,7 @@ def pretrain_recipe( name (str): Name of the pre-training run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -169,7 +178,7 @@ def pretrain_recipe( >>> recipe = pretrain_recipe(name="mixtral_pretrain", num_nodes=16) >>> print(recipe) """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=trainer( @@ -181,47 +190,49 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 8, num_gpus_per_node: int = 8, fn=pretrain -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Mixtral 8x22B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. 
- Examples: - CLI usage: - $ nemo llm pretrain --factory "mixtral_8x22b.pretrain_recipe_performance(num_nodes=8, name='perf_pretrain')" - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="mixtral_8x22b_perf", num_nodes=8) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) + + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. + recipe.trainer.callbacks.extend( [ - run.Config(MegatronTokenDropCallback), - run.Config(MegatronCommOverlapCallback), + run.Config( + MegatronTokenDropCallback, + ), + run.Config( + MegatronCommOverlapCallback, overlap_param_gather_with_optimizer_step=True, align_param_gather=True + ), ] ) - + recipe.trainer.strategy.expert_model_parallel_size = 1 + recipe.trainer.strategy.tensor_model_parallel_size = 8 + recipe.trainer.strategy.sequence_parallel = True return recipe diff --git a/nemo/collections/llm/recipes/mixtral_8x7b.py b/nemo/collections/llm/recipes/mixtral_8x7b.py index d0609761feea..2a23512c647b 100644 --- a/nemo/collections/llm/recipes/mixtral_8x7b.py +++ b/nemo/collections/llm/recipes/mixtral_8x7b.py @@ -13,7 +13,7 @@ # limitations under the License. -from typing import Optional +from typing import Callable, Optional import nemo_run as run import pytorch_lightning as pl @@ -116,6 +116,7 @@ def trainer( grad_reduce_in_fp32=True, overlap_grad_reduce=True, overlap_param_gather=True, + average_in_collective=True, ), ) @@ -141,7 +142,12 @@ def trainer( @run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 8, num_gpus_per_node: int = 8, fn=pretrain + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 8, + num_gpus_per_node: int = 8, + performance_mode: bool = False, + fn: Callable = pretrain, ) -> run.Partial: """ Create a pre-training recipe for Mixtral 8x7B model. @@ -154,6 +160,7 @@ def pretrain_recipe( name (str): Name of the pre-training run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. 
Returns: @@ -168,7 +175,7 @@ def pretrain_recipe( >>> recipe = pretrain_recipe(name="mixtral_8x7b_pretrain", num_nodes=8) >>> print(recipe) """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=trainer( @@ -180,47 +187,49 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 8, num_gpus_per_node: int = 8, fn=pretrain -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Mixtral 8x7B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. - Examples: - CLI usage: - $ nemo llm pretrain --factory "mixtral_8x3b.pretrain_recipe_performance(num_nodes=8, name='perf_pretrain')" - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="mixtral_8x7b_perf", num_nodes=8) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) + + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. + recipe.trainer.callbacks.extend( [ run.Config(MegatronTokenDropCallback), - run.Config(MegatronCommOverlapCallback), + run.Config( + MegatronCommOverlapCallback, + overlap_param_gather_with_optimizer_step=True, + align_param_gather=True, + ), ] ) - + recipe.trainer.strategy.expert_model_parallel_size = 1 + recipe.trainer.strategy.tensor_model_parallel_size = 8 + recipe.trainer.strategy.sequence_parallel = True return recipe diff --git a/nemo/collections/llm/recipes/mixtral_8x7b_16k.py b/nemo/collections/llm/recipes/mixtral_8x7b_16k.py index 8b26a8c7c3e3..7cbfaf723544 100644 --- a/nemo/collections/llm/recipes/mixtral_8x7b_16k.py +++ b/nemo/collections/llm/recipes/mixtral_8x7b_16k.py @@ -51,7 +51,7 @@ def model() -> run.Config[pl.LightningModule]: def trainer( - num_nodes: int = 2, + num_nodes: int = 4, num_gpus_per_node: int = 8, ) -> run.Config: """ @@ -60,8 +60,8 @@ def trainer( This function sets up the distributed training strategy optimized for longer sequences. 
Args: - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 4. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Config: Configuration for the NeMo Lightning Trainer. @@ -71,17 +71,17 @@ def trainer( $ nemo llm pretrain trainer=mixtral_8x7b_16k ... Python API usage: - >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> trainer_config = trainer(num_nodes=4, num_gpus_per_node=8) >>> print(trainer_config) Note: This configuration uses increased parallelism to handle the longer sequence length efficiently. """ return mixtral_8x7b.trainer( - tensor_parallelism=2, - pipeline_parallelism=4, + tensor_parallelism=4, + pipeline_parallelism=2, pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=8, + virtual_pipeline_parallelism=None, context_parallelism=4, sequence_parallelism=True, expert_parallelism=1, @@ -95,7 +95,7 @@ def trainer( def pretrain_recipe( dir: Optional[str] = None, name: str = "default", - num_nodes: int = 2, + num_nodes: int = 4, num_gpus_per_node: int = 8, ) -> run.Partial: """ @@ -107,8 +107,8 @@ def pretrain_recipe( Args: dir (Optional[str]): Directory for saving logs and checkpoints. name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 4. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Partial: Partial configuration for pre-training. @@ -116,10 +116,10 @@ def pretrain_recipe( Examples: CLI usage: $ nemo llm pretrain --factory mixtral_8x7b_16k - $ nemo llm pretrain --factory "mixtral_8x7b_16k(num_nodes=2, name='my_16k_pretrain')" + $ nemo llm pretrain --factory "mixtral_8x7b_16k(num_nodes=4, name='my_16k_pretrain')" Python API usage: - >>> recipe = pretrain_recipe(name="mixtral_8x7b_16k_pretrain", num_nodes=2) + >>> recipe = pretrain_recipe(name="mixtral_8x7b_16k_pretrain", num_nodes=4) >>> print(recipe) """ recipe = mixtral_8x7b.pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) diff --git a/nemo/collections/llm/recipes/mixtral_8x7b_64k.py b/nemo/collections/llm/recipes/mixtral_8x7b_64k.py index 6c8f7077fba3..3606be5ec12b 100644 --- a/nemo/collections/llm/recipes/mixtral_8x7b_64k.py +++ b/nemo/collections/llm/recipes/mixtral_8x7b_64k.py @@ -21,7 +21,6 @@ from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.recipes import mixtral_8x7b from nemo.utils.exp_manager import TimingCallback @@ -59,8 +58,8 @@ def trainer( This function sets up the distributed training strategy optimized for very long sequences. Args: - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 8. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Config: Configuration for the NeMo Lightning Trainer. @@ -78,11 +77,11 @@ def trainer( It requires a substantial amount of computational resources. 
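A quick way to sanity-check these long-context settings is to confirm that the model-parallel product still fits the requested world size. For the 16k recipe above, TP=4 x PP=2 x CP=4 lands exactly on 4 nodes x 8 GPUs, leaving a data-parallel size of 1. A small standalone check, using the usual Megatron decomposition world = TP x CP x PP x DP (expert parallelism is 1 in these recipes and folds into DP):

```python
def data_parallel_size(tp: int, pp: int, cp: int, num_nodes: int, gpus_per_node: int) -> int:
    """Data-parallel replicas left after tensor/pipeline/context parallelism are carved out."""
    world = num_nodes * gpus_per_node
    model_parallel = tp * pp * cp
    if world % model_parallel:
        raise ValueError(f"TP*PP*CP = {model_parallel} does not divide world size {world}")
    return world // model_parallel

# mixtral_8x7b_16k after this change: TP=4, PP=2, CP=4 on 4 nodes x 8 GPUs -> DP = 1
assert data_parallel_size(4, 2, 4, num_nodes=4, gpus_per_node=8) == 1
```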
""" return mixtral_8x7b.trainer( - tensor_parallelism=4, + tensor_parallelism=8, pipeline_parallelism=4, pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=4, - context_parallelism=8, + virtual_pipeline_parallelism=None, + context_parallelism=4, sequence_parallelism=True, expert_parallelism=1, num_nodes=num_nodes, @@ -107,8 +106,8 @@ def pretrain_recipe( Args: dir (Optional[str]): Directory for saving logs and checkpoints. name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 16. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Partial: Partial configuration for pre-training. diff --git a/nemo/collections/llm/recipes/nemotron.py b/nemo/collections/llm/recipes/nemotron.py index 1dd1ef2f83bc..aedf3fcf2954 100644 --- a/nemo/collections/llm/recipes/nemotron.py +++ b/nemo/collections/llm/recipes/nemotron.py @@ -17,6 +17,7 @@ import nemo_run as run import pytorch_lightning as pl import torch +from megatron.core.distributed import DistributedDataParallelConfig from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl @@ -124,6 +125,14 @@ def nemotron_trainer( ckpt_include_optimizer=True, ckpt_async_save=True, ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + average_in_collective=True, + ), ) precision_plugin = None diff --git a/nemo/collections/llm/recipes/nemotron3_8b.py b/nemo/collections/llm/recipes/nemotron3_8b.py index 3cdb647b5f84..7dcebe17f872 100644 --- a/nemo/collections/llm/recipes/nemotron3_8b.py +++ b/nemo/collections/llm/recipes/nemotron3_8b.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional +from typing import Callable, Optional import nemo_run as run import pytorch_lightning as pl @@ -26,6 +26,7 @@ from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.nemotron import nemotron_model, nemotron_trainer from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback from nemo.utils.exp_manager import TimingCallback NAME = "nemotron3_8b" @@ -82,6 +83,7 @@ def pretrain_recipe( constant_steps=0, min_lr=3.0e-5, max_lr=3e-4, + performance_mode: bool = False, # Training function fn=pretrain, ) -> run.Partial: @@ -117,6 +119,7 @@ def pretrain_recipe( constant_steps (int): Number of constant steps. min_lr (float): Minimum learning rate. max_lr (float): Maximum learning rate. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -134,7 +137,7 @@ def pretrain_recipe( Note: This recipe uses a mock dataset, look for the finetune examples to see how to change the dataset. 
""" - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=nemotron_trainer( @@ -173,6 +176,38 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) + + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for Nemotron3 8B model. + + This method enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Note: + Use this method with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. + """ + + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=True, + ) + ) + return recipe + @run.cli.factory(name=NAME + "_nemo") def nemo_resume() -> run.Config[nl.AutoResume]: diff --git a/nemo/collections/llm/recipes/nemotron4_15b.py b/nemo/collections/llm/recipes/nemotron4_15b.py index c0acae6b13f0..16ae7b2b1e79 100644 --- a/nemo/collections/llm/recipes/nemotron4_15b.py +++ b/nemo/collections/llm/recipes/nemotron4_15b.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional +from typing import Callable, Optional import nemo_run as run import pytorch_lightning as pl @@ -23,6 +23,7 @@ from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.nemotron import nemotron_model, nemotron_trainer from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback from nemo.utils.exp_manager import TimingCallback NAME = "nemotron4_15b" @@ -79,6 +80,7 @@ def pretrain_recipe( constant_steps=0, min_lr=4.5e-5, max_lr=4.5e-5, + performance_mode: bool = False, # Training function fn=pretrain, ) -> run.Partial: @@ -114,6 +116,7 @@ def pretrain_recipe( constant_steps (int): Number of constant steps. min_lr (float): Minimum learning rate. max_lr (float): Maximum learning rate. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -131,7 +134,7 @@ def pretrain_recipe( Note: This recipe uses a mock dataset, look for the finetune examples to see how to change the dataset. """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=nemotron_trainer( @@ -169,3 +172,34 @@ def pretrain_recipe( ), resume=default_resume(), ) + + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) + + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for Nemotron4 15B model. + + This method enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. 
+ + Args: + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Note: + Use this method with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. + """ + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=True, + ) + ) + return recipe diff --git a/nemo/collections/llm/recipes/nemotron4_15b_16k.py b/nemo/collections/llm/recipes/nemotron4_15b_16k.py index d0e9d939d8e7..75eced72761f 100644 --- a/nemo/collections/llm/recipes/nemotron4_15b_16k.py +++ b/nemo/collections/llm/recipes/nemotron4_15b_16k.py @@ -56,7 +56,7 @@ def pretrain_recipe( # Trainer tensor_parallelism: int = 2, pipeline_parallelism: int = 2, - pipeline_parallelism_type: Optional[torch.dtype] = None, + pipeline_parallelism_type: Optional[torch.dtype] = torch.bfloat16, virtual_pipeline_parallelism: Optional[int] = None, context_parallelism: int = 2, sequence_parallelism: bool = True, diff --git a/nemo/collections/llm/recipes/nemotron4_15b_64k.py b/nemo/collections/llm/recipes/nemotron4_15b_64k.py index c3f4575a1fd6..8286778aa7ba 100644 --- a/nemo/collections/llm/recipes/nemotron4_15b_64k.py +++ b/nemo/collections/llm/recipes/nemotron4_15b_64k.py @@ -56,7 +56,7 @@ def pretrain_recipe( # Trainer tensor_parallelism: int = 4, pipeline_parallelism: int = 2, - pipeline_parallelism_type: Optional[torch.dtype] = None, + pipeline_parallelism_type: Optional[torch.dtype] = torch.bfloat16, virtual_pipeline_parallelism: Optional[int] = None, context_parallelism: int = 4, sequence_parallelism: bool = True, diff --git a/nemo/collections/llm/recipes/nemotron4_22b.py b/nemo/collections/llm/recipes/nemotron4_22b.py index ba07bae241d8..a20afedfea56 100644 --- a/nemo/collections/llm/recipes/nemotron4_22b.py +++ b/nemo/collections/llm/recipes/nemotron4_22b.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional +from typing import Callable, Optional import nemo_run as run import pytorch_lightning as pl @@ -23,6 +23,7 @@ from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.nemotron import nemotron_model, nemotron_trainer from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback from nemo.utils.exp_manager import TimingCallback NAME = "nemotron4_22b" @@ -56,7 +57,7 @@ def pretrain_recipe( # Trainer tensor_parallelism: int = 2, pipeline_parallelism: int = 4, - pipeline_parallelism_type: Optional[torch.dtype] = None, + pipeline_parallelism_type: Optional[torch.dtype] = torch.bfloat16, virtual_pipeline_parallelism: Optional[int] = 10, context_parallelism: int = 1, sequence_parallelism: bool = False, @@ -79,6 +80,7 @@ def pretrain_recipe( constant_steps=0, min_lr=1e-5, max_lr=1e-4, + performance_mode: bool = False, # Training function fn=pretrain, ) -> run.Partial: @@ -114,6 +116,7 @@ def pretrain_recipe( constant_steps (int): Number of constant steps. min_lr (float): Minimum learning rate. max_lr (float): Maximum learning rate. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. 
Returns: @@ -131,7 +134,7 @@ def pretrain_recipe( Note: This recipe uses a mock dataset, look for the finetune examples to see how to change the dataset. """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=nemotron_trainer( @@ -169,3 +172,45 @@ def pretrain_recipe( ), resume=default_resume(), ) + + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) + + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for Nemotron4 22B model. + + This method enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Note: + Use this method with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. + """ + + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. + + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=True, + defer_embedding_wgrad_compute=True, + wgrad_deferral_limit=22, + overlap_param_gather_with_optimizer_step=True, + align_param_gather=True, + ) + ) + return recipe diff --git a/nemo/collections/llm/recipes/nemotron4_22b_16k.py b/nemo/collections/llm/recipes/nemotron4_22b_16k.py index 614004d12aa3..42f258c6057d 100644 --- a/nemo/collections/llm/recipes/nemotron4_22b_16k.py +++ b/nemo/collections/llm/recipes/nemotron4_22b_16k.py @@ -57,7 +57,7 @@ def pretrain_recipe( tensor_parallelism: int = 4, pipeline_parallelism: int = 1, pipeline_parallelism_type: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = 10, + virtual_pipeline_parallelism: Optional[int] = None, context_parallelism: int = 2, sequence_parallelism: bool = True, num_nodes: int = 1, diff --git a/nemo/collections/llm/recipes/nemotron4_22b_64k.py b/nemo/collections/llm/recipes/nemotron4_22b_64k.py index 57211e5dddc1..67d60a6e1c90 100644 --- a/nemo/collections/llm/recipes/nemotron4_22b_64k.py +++ b/nemo/collections/llm/recipes/nemotron4_22b_64k.py @@ -56,9 +56,9 @@ def pretrain_recipe( # Trainer tensor_parallelism: int = 4, pipeline_parallelism: int = 2, - pipeline_parallelism_type: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = 10, - context_parallelism: int = 2, + pipeline_parallelism_type: Optional[torch.dtype] = torch.bfloat16, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 4, sequence_parallelism: bool = True, num_nodes: int = 4, num_gpus_per_node: int = 8, @@ -122,10 +122,10 @@ def pretrain_recipe( Examples: CLI usage: $ nemo llm pretrain --factory nemotron4_22b_64k - $ nemo llm pretrain --factory "nemotron4_22b_64k(num_nodes=1, name='my_nemotron_pretrain')" + $ nemo llm pretrain --factory "nemotron4_22b_64k(num_nodes=2, name='my_nemotron_pretrain')" Python API usage: - >>> recipe = 
pretrain_recipe(name="nemotron_pretrain", num_nodes=1) + >>> recipe = pretrain_recipe(name="nemotron_pretrain", num_nodes=2) >>> print(recipe) Note: diff --git a/nemo/collections/llm/recipes/nemotron4_340b.py b/nemo/collections/llm/recipes/nemotron4_340b.py index 238acb0dac3c..8268b2a87791 100644 --- a/nemo/collections/llm/recipes/nemotron4_340b.py +++ b/nemo/collections/llm/recipes/nemotron4_340b.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional +from typing import Callable, Optional import nemo_run as run import pytorch_lightning as pl @@ -26,6 +26,7 @@ from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.nemotron import nemotron_model, nemotron_trainer from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback from nemo.utils.exp_manager import TimingCallback NAME = "nemotron4_340b" @@ -41,7 +42,7 @@ def model() -> run.Config[pl.LightningModule]: Examples: CLI usage: - $ nemo llm pretrain model=nemotron4_340 ... + $ nemo llm pretrain model=nemotron4_340b ... Python API usage: >>> model_config = model() @@ -59,7 +60,7 @@ def pretrain_recipe( # Trainer tensor_parallelism: int = 8, pipeline_parallelism: int = 12, - pipeline_parallelism_type: Optional[torch.dtype] = None, + pipeline_parallelism_type: Optional[torch.dtype] = torch.bfloat16, virtual_pipeline_parallelism: Optional[int] = 8, context_parallelism: int = 1, sequence_parallelism: bool = False, @@ -82,6 +83,7 @@ def pretrain_recipe( constant_steps=0, min_lr=1.0e-5, max_lr=1.0e-4, + performance_mode: bool = False, # Training function fn=pretrain, ) -> run.Partial: @@ -117,6 +119,7 @@ def pretrain_recipe( constant_steps (int): Number of constant steps. min_lr (float): Minimum learning rate. max_lr (float): Maximum learning rate. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -124,8 +127,8 @@ def pretrain_recipe( Examples: CLI usage: - $ nemo llm pretrain --factory nemotron4_340 - $ nemo llm pretrain --factory "nemotron4_340(num_nodes=1, name='my_nemotron_pretrain')" + $ nemo llm pretrain --factory nemotron4_340b + $ nemo llm pretrain --factory "nemotron4_340b(num_nodes=1, name='my_nemotron_pretrain')" Python API usage: >>> recipe = pretrain_recipe(name="nemotron_pretrain", num_nodes=1) @@ -134,7 +137,7 @@ def pretrain_recipe( Note: This recipe uses a mock dataset, look for the finetune examples to see how to change the dataset. """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=nemotron_trainer( @@ -173,6 +176,48 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) + + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for Nemotron4 340B model. + + This method enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. 
+ + Note: + Use this method with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. + """ + + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. + + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=True, + defer_embedding_wgrad_compute=True, + wgrad_deferral_limit=22, + overlap_param_gather_with_optimizer_step=True, + align_param_gather=True, + ) + ) + return recipe + @run.cli.factory(name=NAME + "_nemo") def nemo_resume() -> run.Config[nl.AutoResume]: @@ -207,7 +252,7 @@ def finetune_recipe( # Trainer tensor_parallelism: int = 8, pipeline_parallelism: int = 12, - pipeline_parallelism_type: Optional[torch.dtype] = None, + pipeline_parallelism_type: Optional[torch.dtype] = torch.bfloat16, virtual_pipeline_parallelism: Optional[int] = 8, context_parallelism: int = 1, sequence_parallelism: bool = False, @@ -272,8 +317,8 @@ def finetune_recipe( Examples: CLI usage: - $ nemo llm finetune --factory nemotron4_340 - $ nemo llm finetune --factory "nemotron4_340(name='my_nemotron4_340_finetune', num_nodes=4)" + $ nemo llm finetune --factory nemotron4_340b + $ nemo llm finetune --factory "nemotron4_340b(name='my_nemotron4_340_finetune', num_nodes=4)" Python API usage: >>> recipe = finetune_recipe(name="my_nemotron4_340_finetune", num_nodes=4) diff --git a/nemo/collections/llm/recipes/optim/adam.py b/nemo/collections/llm/recipes/optim/adam.py index 5be87ac71e9d..4148d19c6635 100644 --- a/nemo/collections/llm/recipes/optim/adam.py +++ b/nemo/collections/llm/recipes/optim/adam.py @@ -17,7 +17,12 @@ import nemo_run as run from megatron.core.optimizer import OptimizerConfig -from nemo.lightning.pytorch.optim import CosineAnnealingScheduler, MegatronOptimizerModule, OptimizerModule +from nemo.lightning.pytorch.optim import ( + CosineAnnealingScheduler, + MegatronOptimizerModule, + OptimizerModule, + PytorchOptimizerModule, +) @run.cli.factory @@ -25,6 +30,8 @@ def distributed_fused_adam_with_cosine_annealing( precision: str = "bf16-mixed", # or "16-mixed" warmup_steps: int = 2000, constant_steps: int = 0, + adam_beta1: float = 0.9, + adam_beta2: float = 0.95, max_lr: float = 1e-4, min_lr: Optional[float] = None, clip_grad: float = 1.0, @@ -37,14 +44,14 @@ def distributed_fused_adam_with_cosine_annealing( weight_decay=0.1, bf16=precision == "bf16-mixed", fp16=precision == "16-mixed", - adam_beta1=0.9, - adam_beta2=0.95, + adam_beta1=adam_beta1, + adam_beta2=adam_beta2, adam_eps=1e-5, use_distributed_optimizer=True, clip_grad=clip_grad, ) - min_lr = min_lr or (0.1 * max_lr) + min_lr = min_lr if min_lr is not None else (0.1 * max_lr) sched = run.Config( CosineAnnealingScheduler, warmup_steps=warmup_steps, @@ -57,3 +64,55 @@ def distributed_fused_adam_with_cosine_annealing( config=opt_cfg, lr_scheduler=sched, ) + + +@run.cli.factory +def pytorch_adam_with_cosine_annealing( + precision: str = "bf16-mixed", # or "16-mixed" + warmup_steps: int = 2000, + constant_steps: int = 0, + max_lr: float = 1e-5, + min_lr: Optional[float] = None, + clip_grad: float = 1.0, +) -> run.Config[OptimizerModule]: + from torch.optim import Adam 
+ + return run.Config( + PytorchOptimizerModule, + optim_cls=Adam, + config=dict( + lr=max_lr, + weight_decay=0.1, + betas=(0.9, 0.95), + eps=1e-8, + ), + lr_scheduler=run.Config( + CosineAnnealingScheduler, + warmup_steps=warmup_steps, + constant_steps=constant_steps, + min_lr=min_lr or (0.1 * max_lr), + ), + ) + + +@run.cli.factory +def pytorch_adam_with_flat_lr( + precision: str = "bf16-mixed", # or "16-mixed" + warmup_steps: int = 2000, + constant_steps: int = 0, + max_lr: float = 1e-5, + min_lr: Optional[float] = None, + clip_grad: float = 1.0, +) -> run.Config[OptimizerModule]: + from torch.optim import Adam + + return run.Config( + PytorchOptimizerModule, + optim_cls=Adam, + config=dict( + lr=max_lr, + weight_decay=0.1, + betas=(0.9, 0.95), + eps=1e-8, + ), + ) diff --git a/nemo/collections/llm/t5/data/fine_tuning.py b/nemo/collections/llm/t5/data/fine_tuning.py index b1315f7a708a..9326dabe7b84 100644 --- a/nemo/collections/llm/t5/data/fine_tuning.py +++ b/nemo/collections/llm/t5/data/fine_tuning.py @@ -61,8 +61,6 @@ def __init__( from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer self.tokenizer = tokenizer or get_nmt_tokenizer("megatron", "BertWordPieceCase") - additional_tokens = {'additional_special_tokens': [f'' for i in range(100)]} - self.tokenizer.add_special_tokens(additional_tokens) self.memmap_workers = memmap_workers self.num_workers = num_workers diff --git a/nemo/collections/llm/t5/data/pre_training.py b/nemo/collections/llm/t5/data/pre_training.py index 2c73e0b78b11..e6f619972284 100644 --- a/nemo/collections/llm/t5/data/pre_training.py +++ b/nemo/collections/llm/t5/data/pre_training.py @@ -130,10 +130,6 @@ def __init__( # add additional tokens for T5 tokenizer from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer - self.tokenizer = tokenizer or get_nmt_tokenizer("megatron", "BertWordPieceCase") - additional_tokens = {'additional_special_tokens': [f'' for i in range(100)]} - self.tokenizer.add_special_tokens(additional_tokens) - self.data_sampler = MegatronDataSampler( seq_len=self.seq_length, micro_batch_size=micro_batch_size, diff --git a/nemo/collections/multimodal/data/energon/base.py b/nemo/collections/multimodal/data/energon/base.py index 34752c878b1d..64d509ee02bf 100644 --- a/nemo/collections/multimodal/data/energon/base.py +++ b/nemo/collections/multimodal/data/energon/base.py @@ -11,17 +11,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
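On the min_lr change in nemo/collections/llm/recipes/optim/adam.py above: `min_lr or (0.1 * max_lr)` silently discards an explicit min_lr=0.0 because 0.0 is falsy, which is exactly what the switch to an explicit None check fixes. A self-contained illustration (values arbitrary); note that the new pytorch_adam_with_cosine_annealing factory added in the same file still appears to use the `or` form and would hit the same edge case.

```python
max_lr = 1e-4

def old_default(min_lr):
    # Truthiness test: 0.0 is treated the same as None.
    return min_lr or (0.1 * max_lr)

def new_default(min_lr):
    # Explicit None check: only a missing value falls back to 10% of max_lr.
    return min_lr if min_lr is not None else (0.1 * max_lr)

assert old_default(0.0) == 0.1 * max_lr   # caller's 0.0 is silently replaced by the fallback
assert new_default(0.0) == 0.0            # caller's 0.0 is honored
assert old_default(None) == new_default(None) == 0.1 * max_lr
```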
+from copy import deepcopy from typing import TYPE_CHECKING, Any, Dict, Literal, Optional +import fiddle as fdl import pytorch_lightning as pl from megatron.core import parallel_state from megatron.energon import WorkerConfig, get_savable_loader, get_train_dataset from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS from torch.utils.data import DataLoader +from typing_extensions import Self from nemo.collections.multimodal.data.energon.config import MultiModalSampleConfig from nemo.collections.multimodal.data.energon.task_encoder import MultiModalTaskEncoder -from nemo.lightning.io.mixin import IOMixin +from nemo.lightning.io.mixin import IOMixin, serialization, track_io from nemo.lightning.pytorch.plugins import MegatronDataSampler from nemo.utils import logging @@ -104,6 +107,14 @@ def __init__( self.train_dataloader_object = None self.val_dataloader_object = None + def io_init(self, **kwargs) -> fdl.Config[Self]: + cfg_kwargs = {k: deepcopy(v) for k, v in kwargs.items() if k not in ['image_processor', 'task_encoder']} + for val in cfg_kwargs.values(): + if not serialization.find_node_traverser(type(val)): + track_io(type(val)) + cfg = fdl.Config(type(self), **cfg_kwargs) + return cfg + def datasets_provider(self, worker_config, split: Literal['train', 'val'] = 'val'): """ Provide the dataset for training or validation. diff --git a/nemo/collections/nlp/data/language_modeling/megatron/dataset_utils.py b/nemo/collections/nlp/data/language_modeling/megatron/dataset_utils.py index 17ffc01fb7f4..4ce9701e76b4 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/dataset_utils.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/dataset_utils.py @@ -209,7 +209,7 @@ def create_masked_lm_predictions( # on-the-fly whole word masking is possible. token_boundary = [0] * len(tokens) skip_mask_idx = None # Store the index of token that cannot be masked. - for (i, token) in enumerate(tokens): + for i, token in enumerate(tokens): if token == skip_masking_id: skip_mask_idx = i if token == cls_id or token == sep_id: @@ -285,7 +285,10 @@ def create_masked_lm_predictions( available_ngrams = list(cand_index_set.keys()) # n - 1 because pvals is 0-indexed and available ngrams are 1-indexed. pvals_current = np.array([pvals[n - 1] for n in available_ngrams]) - n = np_rng.choice(available_ngrams, p=pvals_current / pvals_current.sum(keepdims=True),) + n = np_rng.choice( + available_ngrams, + p=pvals_current / pvals_current.sum(keepdims=True), + ) else: # Sampling "n" from the geometric distribution and clipping it to # the max_ngrams. Using p=0.2 default from the SpanBERT paper @@ -488,7 +491,10 @@ def create_extreme_masked_lm_predictions( if span_length_distribution == LengthDistribution.uniform: available_ngrams = list(cand_index_set.keys()) pvals_current = np.array([pvals[n] for n in available_ngrams]) - n = np_rng.choice(available_ngrams, p=pvals_current / pvals_current.sum(keepdims=True),) + n = np_rng.choice( + available_ngrams, + p=pvals_current / pvals_current.sum(keepdims=True), + ) elif span_length_distribution == LengthDistribution.geometric: # Sampling "n" from the geometric distribution and clipping it to # the max_ngrams. 
Using p=0.2 default from the SpanBERT paper @@ -914,7 +920,13 @@ def build_train_valid_test_datasets( seed, ) test_ds = MockT5Dataset( - cfg, tokenizer, "test", int(train_valid_test_num_samples[2]), max_seq_length, max_seq_length_dec, seed, + cfg, + tokenizer, + "test", + int(train_valid_test_num_samples[2]), + max_seq_length, + max_seq_length_dec, + seed, ) return train_ds, valid_ds, test_ds else: @@ -1257,6 +1269,7 @@ def get_samples_mapping( binary_head, index_mapping_dir: str = None, samples_mapping: Any = None, + sanity_check_dist_workers: bool = True, ): """Get a list that maps a sample index to a starting sentence index, end sentence index, and length""" @@ -1328,14 +1341,16 @@ def get_samples_mapping( logging.info( ' > elasped time to build and save samples mapping ' '(seconds): {:4f}'.format(time.time() - start_time) ) - torch.distributed.barrier() - counts = torch.cuda.LongTensor([1]) - torch.distributed.all_reduce(counts, group=parallel_state.get_data_parallel_group(with_context_parallel=True)) - torch.distributed.all_reduce(counts, group=parallel_state.get_pipeline_model_parallel_group()) - assert counts[0].item() == ( - torch.distributed.get_world_size() - // torch.distributed.get_world_size(group=parallel_state.get_tensor_model_parallel_group()) - ) + + if sanity_check_dist_workers: + torch.distributed.barrier() + counts = torch.cuda.LongTensor([1]) + torch.distributed.all_reduce(counts, group=parallel_state.get_data_parallel_group(with_context_parallel=True)) + torch.distributed.all_reduce(counts, group=parallel_state.get_pipeline_model_parallel_group()) + assert counts[0].item() == ( + torch.distributed.get_world_size() + // torch.distributed.get_world_size(group=parallel_state.get_tensor_model_parallel_group()) + ) # Load indexed dataset if not given externally. if samples_mapping is None: logging.info(' > loading indexed mapping from {}'.format(indexmap_filename)) diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py index c42249cec2f2..898ddb7d716b 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py @@ -64,6 +64,7 @@ def __init__( output_original_text: bool = False, ceil_to_power_2: bool = False, get_attention_mask_from_fusion: bool = False, + sanity_check_dist_workers: bool = True, ): """ file_path: Path to a JSONL GPT supervised fine-tuning dataset. Data is formatted as multiple JSON lines with each line formatted as follows. {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'} @@ -89,6 +90,7 @@ def __init__( special_tokens: special tokens for the chat prompts, a dictionary of {token_type: token}. Default: {'system_turn_start': '', 'turn_start': '', 'label_start': '', 'end_of_turn': '\n', "end_of_name": "\n"} is_test: Whether this dataset is the test split. output_original_text (bool): if true, will keep the original text in the output alongside the tokenized ids. + sanity_check_dist_workers (bool): if true, will run sanity check across workers when making mapping. 
""" self.tokenizer = tokenizer self.file_path = file_path @@ -117,6 +119,7 @@ def __init__( self.output_original_text = output_original_text self.ceil_to_power_2 = ceil_to_power_2 self.get_attention_mask_from_fusion = get_attention_mask_from_fusion + self.sanity_check_dist_workers = sanity_check_dist_workers if special_tokens is None: self.special_tokens = { @@ -196,6 +199,7 @@ def _build_samples_mapping(self): binary_head=False, index_mapping_dir=self.index_mapping_dir, samples_mapping=osm, + sanity_check_dist_workers=self.sanity_check_dist_workers, ) else: self.samples_mapping = None diff --git a/nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py b/nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py index 4882708f698f..f62613db891b 100644 --- a/nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py @@ -127,7 +127,7 @@ def __init__( index_mapping_dir=index_mapping_dir, ) - if is_distributed: + if is_distributed and not _lightning_prepare_data(): torch.distributed.barrier() if is_distributed and AppState().local_rank == 0: @@ -152,7 +152,7 @@ def __init__( index_mapping_dir=index_mapping_dir, ) - if is_distributed: + if is_distributed and not _lightning_prepare_data(): torch.distributed.barrier() logging.info(f"Loading data files") @@ -260,7 +260,8 @@ def load_file(self, fn, index_mapping_dir: Optional[str] = None): raise RuntimeError(f"Missing header, expected {self._header_lines} header lines") # load meta info - idx_info_dict = pickle.load(open(idx_fn + ".info", "rb")) + with open(idx_fn + ".info", "rb") as fp: + idx_info_dict = pickle.load(fp) # test for mismatch in expected newline_int if "newline_int" in idx_info_dict: newline_int = idx_info_dict["newline_int"] @@ -378,9 +379,7 @@ def __init__( self._data_sep = data_sep def _build_data_from_text(self, text: str): - """ - - """ + """ """ _build_data_from_text = super()._build_data_from_text data = {} text_fields = text.split(self._data_sep) @@ -513,7 +512,11 @@ def _build_memmap_index_files(newline_int, build_index_fn, fn, index_mapping_dir def build_index_files( - dataset_paths, newline_int, workers=None, build_index_fn=_build_index_from_memdata, index_mapping_dir: str = None, + dataset_paths, + newline_int, + workers=None, + build_index_fn=_build_index_from_memdata, + index_mapping_dir: str = None, ): """Auxiliary method to build multiple index files""" if len(dataset_paths) < 1: @@ -528,7 +531,12 @@ def build_index_files( ctx = mp.get_context("fork") with ctx.Pool(workers) as p: build_status = p.map( - partial(_build_memmap_index_files, newline_int, build_index_fn, index_mapping_dir=index_mapping_dir,), + partial( + _build_memmap_index_files, + newline_int, + build_index_fn, + index_mapping_dir=index_mapping_dir, + ), dataset_paths, ) @@ -741,3 +749,19 @@ def get_sample_block(self, block_idx: int) -> np.ndarray: sample_block = sample_block % self.dataset_size return sample_block + + +def _lightning_prepare_data(): + """ + This function checks whether it is invoked in lightning's hook "prepare_data", which is run only on rank 0. + TextMemMapDataset contains a torch.distributed.barrier operation, so when run inside the single-process hook + prepare_data, the barrier operation would hang forever. 
+ """ + import inspect + + return any( + [ + frame.function == 'prepare_data' and 'prepare_packed_sequence_data' in frame.code_context[0] + for frame in inspect.stack() + ] + ) diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py index a547d593d6d7..042dbb95979e 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py @@ -256,7 +256,11 @@ def __init__( te_version = packaging.version.Version(version("transformer-engine")) if te_version >= packaging.version.Version("1.5.0dev") and ( - not self.input_is_parallel and getattr(model_parallel_config, "tp_comm_overlap_disable_qkv", False) + not self.input_is_parallel + and ( + not getattr(model_parallel_config, "tp_comm_overlap", False) + or getattr(model_parallel_config, "tp_comm_overlap_disable_qkv", False) + ) ): # TE 1.5 introduces the option `return_layernorm_output_gathered`, so the all gather # in the forward method is not needed, so set self._sequence_parallel to False diff --git a/nemo/collections/nlp/modules/common/text_generation_server.py b/nemo/collections/nlp/modules/common/text_generation_server.py index 6c257317b99f..3f8e34b94134 100644 --- a/nemo/collections/nlp/modules/common/text_generation_server.py +++ b/nemo/collections/nlp/modules/common/text_generation_server.py @@ -15,11 +15,17 @@ import json import threading +import time +import uuid import torch from flask import Flask, jsonify, request from flask_restful import Api, Resource +from nemo.collections.nlp.data.language_modeling.megatron.gpt_sft_chat_dataset import ( + _get_header_conversation_type_mask_role, + get_prompt_template_example, +) from nemo.collections.nlp.modules.common.retro_inference_strategies import ( RetroModelTextGenerationStrategy, RetroQAModelTextGenerationStrategy, @@ -61,6 +67,189 @@ def send_do_generate(): choice = torch.cuda.LongTensor([GENERATE_NUM]) torch.distributed.broadcast(choice, 0) + def convert_messages(self, input_list): + output_dict = { + 'system': '', + 'conversations': [], + 'mask': 'User', + 'type': 'VALUE_TO_TEXT', + } + + # Extract the system message + for msg in input_list: + if msg['role'] == 'system': + output_dict['system'] = msg['content'] + break # Assuming only one system message + + # Build the conversations list + for msg in input_list: + if msg['role'] != 'system': + conversation_entry = { + 'from': msg['role'].capitalize(), # Capitalize 'user' and 'assistant' + 'value': msg['content'], + 'label': None, + } + output_dict['conversations'].append(conversation_entry) + + return output_dict + + def completion(self, data): + output_sentence = "" + with lock: # Need to get lock to keep multiple threads from hitting code + MegatronGenerate.send_do_generate() # Tell other ranks we're doing generate + extra = {} + if self.inference_strategy is not None: + extra['strategy'] = self.inference_strategy + + all_probs = False + add_BOS = False + top_p = data.get("top_p", 1.0) + top_k = data.get("top_k", 0) + max_tokens = data.get("max_tokens", 32) + temperature = data.get("temperature", 0.0) + logprobs = data.get("logprobs", False) + greedy = temperature == 0.0 + end_strings = ['<|endoftext|>'] + data.get("end_strings", []) + prompt = data["prompt"] + random_seed = data.get("seed", 1234) + + output = generate( + self.model, + [prompt], + tokens_to_generate=max_tokens, + all_probs=all_probs, + temperature=temperature, + 
add_BOS=add_BOS, + top_k=top_k, + top_p=top_p, + greedy=greedy, + repetition_penalty=1.0, + end_strings=end_strings, + min_tokens_to_generate=0, + compute_logprob=logprobs, + random_seed=random_seed, + **extra, + ) + for k in output: + if isinstance(output[k], torch.Tensor): + output[k] = output[k].tolist() + + output_sentence = output['sentences'][0][len(prompt) :] + tokens = output['tokens'][0] + logprobs = output['logprob'][0] if output['logprob'] is not None else None + num_prompt_tokens = len(prompt.split()) + num_output_sentence = len(output_sentence.split()) + + return jsonify( + { + "choices": [ + { + "finish_reason": "", + "index": 0, + "logprobs": logprobs, + "text": output_sentence, + "tokens": tokens, + } + ], + "created": int(time.time()), + "id": f"cmpl-{uuid.uuid4()}", + "model": "nemo model", + "object": "text_completion", + "usage": { + "completion_tokens": num_output_sentence, + "prompt_tokens": num_prompt_tokens, + "total_tokens": num_output_sentence + num_prompt_tokens, + }, + } + ) + + def chat_completion(self, data): + data['messages'] = data['messages'] + [ + {'role': 'assistant', 'content': ''} + ] # adding trailing assistant message so that prompt ends with Assistant tag. + special_tokens = self.model.cfg.data.chat_prompt_tokens + nemo_source = self.convert_messages(data['messages']) + header, conversation, data_type, mask_role = _get_header_conversation_type_mask_role( + nemo_source, special_tokens + ) + len_strip = len(special_tokens['end_of_turn'] + special_tokens['turn_start']) + conversation = conversation[:-len_strip] + # Return a response mimicking the OpenAI ChatCompletion API format + with lock: # Need to get lock to keep multiple threads from hitting code + MegatronGenerate.send_do_generate() # Tell other ranks we're doing generate + extra = {} + if self.inference_strategy is not None: + extra['strategy'] = self.inference_strategy + + all_probs = False + add_BOS = False + top_k = 0 + greedy = data['temperature'] == 0.0 + logprobs = data.get("logprobs", False) + end_strings = ['<|endoftext|>', special_tokens['turn_start'], special_tokens['label_start']] + random_seed = None + + output = generate( + self.model, + [conversation], + data.get('max_tokens', 32), + all_probs=all_probs, + temperature=data.get('temperature', 1.0), + add_BOS=add_BOS, + top_k=top_k, + top_p=data.get("top_p", 0.95), + greedy=greedy, + repetition_penalty=1.0, + end_strings=end_strings, + min_tokens_to_generate=0, + compute_logprob=logprobs, + random_seed=random_seed, + **extra, + ) + for k in output: + if isinstance(output[k], torch.Tensor): + output[k] = output[k].tolist() + + output_sentence = output['sentences'][0][len(conversation) :] + tokens = output['tokens'][0] + logprobs = output['logprob'][0] if output['logprob'] is not None else None + num_prompt_tokens = len(conversation.split()) # @adithyare only produces an approx. 
number of tokens + num_output_sentence = len(output_sentence.split()) + + return jsonify( + { + "id": f"chatcmpl-{uuid.uuid4()}", + "object": "chat.completion", + "created": int(time.time()), + "model": data.get("model", "nemo model"), + "choices": [ + { + "index": 0, + "message": {"role": "assistant", "content": output_sentence}, + "logprobs": logprobs, + "tokens": tokens, + "finish_reason": "", + } + ], + "usage": { + "prompt_tokens": num_prompt_tokens, + "completion_tokens": num_output_sentence, + "total_tokens": num_output_sentence + num_prompt_tokens, + }, + } + ) + + def post(self): + # Access the request data if needed + if request.endpoint == "oai_completions": + data = request.get_json() + return self.completion(data) + elif request.endpoint == "oai_chat_completions": + data = request.get_json() + return self.chat_completion(data) + else: + raise RuntimeError("Unknown enpoint requested.") + def put(self): logging.info("request IP: " + str(request.remote_addr)) logging.info(json.dumps(request.get_json())) @@ -135,7 +324,7 @@ def put(self): if not (0.0 <= top_p <= 1.0): return "top_p must be a positive number less than or equal to 1.0" - repetition_penalty = 1.2 + repetition_penalty = 1.0 if "repetition_penalty" in request.get_json(): repetition_penalty = request.get_json()["repetition_penalty"] if not (type(repetition_penalty) == int or type(repetition_penalty) == float): @@ -231,7 +420,24 @@ class MegatronServer(object): def __init__(self, model, inference_strategy=None): self.app = Flask(__name__, static_url_path='') api = Api(self.app) - api.add_resource(MegatronGenerate, '/generate', resource_class_args=[model, inference_strategy]) + api.add_resource( + MegatronGenerate, + '/generate', + endpoint="generate", + resource_class_kwargs={"model": model, "inference_strategy": inference_strategy}, + ) + api.add_resource( + MegatronGenerate, + '/v1/completions', + endpoint="oai_completions", + resource_class_kwargs={"model": model, "inference_strategy": inference_strategy}, + ) + api.add_resource( + MegatronGenerate, + '/v1/chat/completions', + endpoint="oai_chat_completions", + resource_class_kwargs={"model": model, "inference_strategy": inference_strategy}, + ) def run(self, url, port=5000): self.app.run(url, threaded=True, port=port, debug=False) diff --git a/nemo/collections/nlp/modules/common/tokenizer_utils.py b/nemo/collections/nlp/modules/common/tokenizer_utils.py index 4e6f9e15b839..dfc55a6c9065 100644 --- a/nemo/collections/nlp/modules/common/tokenizer_utils.py +++ b/nemo/collections/nlp/modules/common/tokenizer_utils.py @@ -69,7 +69,8 @@ def get_tokenizer( To see the list of all HuggingFace pretrained models, use: nemo_nlp.modules.common.get_huggingface_pretrained_lm_models_list() tokenizer_model: tokenizer model file of sentencepiece - special_tokens: dict of special tokens + special_tokens: dict of special tokens. 
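With the two new routes registered on MegatronServer, clients can talk to the server using the familiar OpenAI request and response shapes. A client sketch (host and port assume MegatronServer.run's defaults; prompts and sampling values are placeholders):

```python
import requests

BASE = "http://localhost:5000"

# /v1/completions: plain text completion.
resp = requests.post(
    f"{BASE}/v1/completions",
    json={"prompt": "Deep learning is", "max_tokens": 32, "temperature": 0.0, "top_p": 1.0},
    timeout=300,
)
print(resp.json()["choices"][0]["text"])

# /v1/chat/completions: role/content messages, converted server-side to the NeMo chat format.
resp = requests.post(
    f"{BASE}/v1/chat/completions",
    json={
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Name three prime numbers."},
        ],
        "max_tokens": 32,
        "temperature": 0.2,
    },
    timeout=300,
)
print(resp.json()["choices"][0]["message"]["content"])
```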
+ For additional special tokens besides standard special tokens (bos, eos, pad, etc.), such as sentinel tokens for T5 (, , etc.), use key 'additional_special_tokens' vocab_file: path to vocab file use_fast: (only for HuggingFace AutoTokenizer) set to True to use fast HuggingFace tokenizer bpe_dropout: (experimental) BPE dropout tries to corrupt the standard segmentation @@ -224,7 +225,11 @@ def get_nmt_tokenizer( f'Getting Megatron tokenizer for pretrained model name: {model_name}, custom vocab file: {vocab_file}, and merges file: {merges_file}' ) return get_tokenizer( - tokenizer_name=model_name, vocab_file=vocab_file, merges_file=merges_file, chat_template=chat_template + tokenizer_name=model_name, + vocab_file=vocab_file, + merges_file=merges_file, + special_tokens=special_tokens_dict, + chat_template=chat_template, ) elif library == 'tabular': from nemo.collections.common.tokenizers.tabular_tokenizer import TabularTokenizer diff --git a/nemo/collections/tts/data/dataset.py b/nemo/collections/tts/data/dataset.py index 83d2b969ea91..901b4168130f 100644 --- a/nemo/collections/tts/data/dataset.py +++ b/nemo/collections/tts/data/dataset.py @@ -204,7 +204,8 @@ def __init__( self.text_normalizer_call = None elif not PYNINI_AVAILABLE: raise ImportError( - "`nemo_text_processing` is not installed, see https://github.com/NVIDIA/NeMo-text-processing for details" + "`nemo_text_processing` is not installed, see https://github.com/NVIDIA/NeMo-text-processing for details. " + "If you wish to continue without text normalization, please remove the text_normalizer part in your TTS yaml file." ) else: self.text_normalizer_call = ( diff --git a/nemo/collections/tts/models/aligner.py b/nemo/collections/tts/models/aligner.py index 72d023e9ee10..d8e65d6e6821 100644 --- a/nemo/collections/tts/models/aligner.py +++ b/nemo/collections/tts/models/aligner.py @@ -24,6 +24,7 @@ from torch import nn from nemo.collections.tts.losses.aligner_loss import BinLoss, ForwardSumLoss +from nemo.collections.tts.models.base import NeedsNormalizer from nemo.collections.tts.parts.utils.helpers import ( binarize_attention, g2p_backward_compatible_support, @@ -41,7 +42,7 @@ HAVE_WANDB = False -class AlignerModel(ModelPT): +class AlignerModel(NeedsNormalizer, ModelPT): """Speech-to-text alignment model (https://arxiv.org/pdf/2108.10447.pdf) that is used to learn alignments between mel spectrogram and text.""" def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): @@ -77,29 +78,6 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.bin_loss_start_ratio = cfg.bin_loss_start_ratio self.bin_loss_warmup_epochs = cfg.bin_loss_warmup_epochs - def _setup_normalizer(self, cfg): - if "text_normalizer" in cfg: - normalizer_kwargs = {} - - if "whitelist" in cfg.text_normalizer: - normalizer_kwargs["whitelist"] = self.register_artifact( - 'text_normalizer.whitelist', cfg.text_normalizer.whitelist - ) - - try: - import nemo_text_processing - - self.normalizer = instantiate(cfg.text_normalizer, **normalizer_kwargs) - except Exception as e: - logging.error(e) - raise ImportError( - "`nemo_text_processing` not installed, see https://github.com/NVIDIA/NeMo-text-processing for more details" - ) - - self.text_normalizer_call = self.normalizer.normalize - if "text_normalizer_call_kwargs" in cfg: - self.text_normalizer_call_kwargs = cfg.text_normalizer_call_kwargs - def _setup_tokenizer(self, cfg): text_tokenizer_kwargs = {} if "g2p" in cfg.text_tokenizer: diff --git a/nemo/collections/tts/models/base.py 
b/nemo/collections/tts/models/base.py index fe19ae75a3b3..b4b0ea9c43fa 100644 --- a/nemo/collections/tts/models/base.py +++ b/nemo/collections/tts/models/base.py @@ -18,6 +18,7 @@ from typing import List, Optional import torch +from hydra.utils import instantiate from omegaconf import DictConfig from tqdm import tqdm @@ -28,9 +29,39 @@ from nemo.core.neural_types.neural_type import NeuralType from nemo.utils import logging, model_utils +PYNINI_AVAILABLE = True +try: + import nemo_text_processing +except (ImportError, ModuleNotFoundError): + PYNINI_AVAILABLE = False -class SpectrogramGenerator(ModelPT, ABC): - """ Base class for all TTS models that turn text into a spectrogram """ + +class NeedsNormalizer: + """Base class for all TTS models that needs text normalization(TN)""" + + def _setup_normalizer(self, cfg): + if "text_normalizer" in cfg: + if not PYNINI_AVAILABLE: + logging.error( + "`nemo_text_processing` not installed, see https://github.com/NVIDIA/NeMo-text-processing for more details." + ) + logging.error("The normalizer will be disabled.") + return + normalizer_kwargs = {} + + if "whitelist" in cfg.text_normalizer: + normalizer_kwargs["whitelist"] = self.register_artifact( + 'text_normalizer.whitelist', cfg.text_normalizer.whitelist + ) + + self.normalizer = instantiate(cfg.text_normalizer, **normalizer_kwargs) + self.text_normalizer_call = self.normalizer.normalize + if "text_normalizer_call_kwargs" in cfg: + self.text_normalizer_call_kwargs = cfg.text_normalizer_call_kwargs + + +class SpectrogramGenerator(NeedsNormalizer, ModelPT, ABC): + """Base class for all TTS models that turn text into a spectrogram""" @abstractmethod def parse(self, str_input: str, **kwargs) -> 'torch.tensor': @@ -115,7 +146,7 @@ def list_available_models(cls) -> 'List[PretrainedModelInfo]': class GlowVocoder(Vocoder): - """ Base class for all Vocoders that use a Glow or reversible Flow-based setup. All child class are expected + """Base class for all Vocoders that use a Glow or reversible Flow-based setup. All child class are expected to have a parameter called audio_to_melspec_precessor that is an instance of nemo.collections.asr.parts.FilterbankFeatures""" @@ -175,7 +206,11 @@ def yet_another_patch(audio, n_fft, hop_length, win_length, window): return torch.sqrt(spec.pow(2).sum(-1)), torch.atan2(spec[..., -1], spec[..., 0]) self.stft = lambda x: yet_another_patch( - x, n_fft=n_fft, hop_length=hop_length, win_length=win_length, window=window, + x, + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=window, ) self.istft = lambda x, y: torch.istft( torch.complex(x * torch.cos(y), x * torch.sin(y)), @@ -252,15 +287,15 @@ def list_available_models(cls) -> 'List[PretrainedModelInfo]': return list_of_models -class TextToWaveform(ModelPT, ABC): - """ Base class for all end-to-end TTS models that generate a waveform from text """ +class TextToWaveform(NeedsNormalizer, ModelPT, ABC): + """Base class for all end-to-end TTS models that generate a waveform from text""" @abstractmethod def parse(self, str_input: str, **kwargs) -> 'torch.tensor': """ - A helper function that accepts a raw python string and turns it into a tensor. The tensor should have 2 - dimensions. The first is the batch, which should be of size 1. The second should represent time. The tensor - should represent either tokenized or embedded text, depending on the model. + A helper function that accepts a raw python string and turns it into a tensor. The tensor should have 2 + dimensions. 
The first is the batch, which should be of size 1. The second should represent time. The tensor + should represent either tokenized or embedded text, depending on the model. """ @abstractmethod @@ -299,7 +334,6 @@ def convert_graphemes_to_phonemes( num_workers: int = 0, pred_field: Optional[str] = "pred_text", ) -> List[str]: - """ Main function for Inference. Converts grapheme entries from the manifest "graheme_field" to phonemes Args: diff --git a/nemo/collections/tts/models/fastpitch.py b/nemo/collections/tts/models/fastpitch.py index 3235a096a04b..b1e702c89124 100644 --- a/nemo/collections/tts/models/fastpitch.py +++ b/nemo/collections/tts/models/fastpitch.py @@ -200,28 +200,6 @@ def _get_default_text_tokenizer_conf(self): text_tokenizer: TextTokenizerConfig = TextTokenizerConfig() return OmegaConf.create(OmegaConf.to_yaml(text_tokenizer)) - def _setup_normalizer(self, cfg): - if "text_normalizer" in cfg: - normalizer_kwargs = {} - - if "whitelist" in cfg.text_normalizer: - normalizer_kwargs["whitelist"] = self.register_artifact( - 'text_normalizer.whitelist', cfg.text_normalizer.whitelist - ) - try: - import nemo_text_processing - - self.normalizer = instantiate(cfg.text_normalizer, **normalizer_kwargs) - except Exception as e: - logging.error(e) - raise ImportError( - "`nemo_text_processing` not installed, see https://github.com/NVIDIA/NeMo-text-processing for more details" - ) - - self.text_normalizer_call = self.normalizer.normalize - if "text_normalizer_call_kwargs" in cfg: - self.text_normalizer_call_kwargs = cfg.text_normalizer_call_kwargs - def _setup_tokenizer(self, cfg): text_tokenizer_kwargs = {} @@ -240,12 +218,14 @@ def _setup_tokenizer(self, cfg): if "phoneme_dict" in cfg.text_tokenizer.g2p: g2p_kwargs["phoneme_dict"] = self.register_artifact( - 'text_tokenizer.g2p.phoneme_dict', cfg.text_tokenizer.g2p.phoneme_dict, + 'text_tokenizer.g2p.phoneme_dict', + cfg.text_tokenizer.g2p.phoneme_dict, ) if "heteronyms" in cfg.text_tokenizer.g2p: g2p_kwargs["heteronyms"] = self.register_artifact( - 'text_tokenizer.g2p.heteronyms', cfg.text_tokenizer.g2p.heteronyms, + 'text_tokenizer.g2p.heteronyms', + cfg.text_tokenizer.g2p.heteronyms, ) # for backward compatability @@ -478,16 +458,25 @@ def training_step(self, batch, batch_idx): ) spec_predict = mels_pred[0].data.cpu().float().numpy() self.tb_logger.add_image( - "train_mel_predicted", plot_spectrogram_to_numpy(spec_predict), self.global_step, dataformats="HWC", + "train_mel_predicted", + plot_spectrogram_to_numpy(spec_predict), + self.global_step, + dataformats="HWC", ) if self.learn_alignment: attn = attn_hard[0].data.cpu().float().numpy().squeeze() self.tb_logger.add_image( - "train_attn", plot_alignment_to_numpy(attn.T), self.global_step, dataformats="HWC", + "train_attn", + plot_alignment_to_numpy(attn.T), + self.global_step, + dataformats="HWC", ) soft_attn = attn_soft[0].data.cpu().float().numpy().squeeze() self.tb_logger.add_image( - "train_soft_attn", plot_alignment_to_numpy(soft_attn.T), self.global_step, dataformats="HWC", + "train_soft_attn", + plot_alignment_to_numpy(soft_attn.T), + self.global_step, + dataformats="HWC", ) return loss @@ -527,7 +516,20 @@ def validation_step(self, batch, batch_idx): ) # Calculate val loss on ground truth durations to better align L2 loss in time - (mels_pred, _, _, log_durs_pred, pitch_pred, _, _, _, attn_hard_dur, pitch, energy_pred, energy_tgt,) = self( + ( + mels_pred, + _, + _, + log_durs_pred, + pitch_pred, + _, + _, + _, + attn_hard_dur, + pitch, + energy_pred, + energy_tgt, + ) = 
self( text=text, durs=durs, pitch=pitch, @@ -587,7 +589,10 @@ def on_validation_epoch_end(self): ) spec_predict = spec_predict[0].data.cpu().float().numpy() self.tb_logger.add_image( - "val_mel_predicted", plot_spectrogram_to_numpy(spec_predict), self.global_step, dataformats="HWC", + "val_mel_predicted", + plot_spectrogram_to_numpy(spec_predict), + self.global_step, + dataformats="HWC", ) self.log_train_images = True self.validation_step_outputs.clear() # free memory) @@ -598,7 +603,10 @@ def _setup_train_dataloader(self, cfg): phon_mode = self.vocab.set_phone_prob(self.vocab.phoneme_probability) with phon_mode: - dataset = instantiate(cfg.dataset, text_tokenizer=self.vocab,) + dataset = instantiate( + cfg.dataset, + text_tokenizer=self.vocab, + ) sampler = dataset.get_sampler(cfg.dataloader_params.batch_size, world_size=self.trainer.world_size) return torch.utils.data.DataLoader( @@ -611,7 +619,10 @@ def _setup_test_dataloader(self, cfg): phon_mode = self.vocab.set_phone_prob(0.0) with phon_mode: - dataset = instantiate(cfg.dataset, text_tokenizer=self.vocab,) + dataset = instantiate( + cfg.dataset, + text_tokenizer=self.vocab, + ) return torch.utils.data.DataLoader(dataset, collate_fn=dataset.collate_fn, **cfg.dataloader_params) diff --git a/nemo/collections/tts/models/mixer_tts.py b/nemo/collections/tts/models/mixer_tts.py index 1a44cd5b31c8..c260df22e3c0 100644 --- a/nemo/collections/tts/models/mixer_tts.py +++ b/nemo/collections/tts/models/mixer_tts.py @@ -123,29 +123,6 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.decoder = instantiate(cfg.decoder) self.proj = nn.Linear(self.decoder.d_model, cfg.n_mel_channels) - def _setup_normalizer(self, cfg): - if "text_normalizer" in cfg: - normalizer_kwargs = {} - - if "whitelist" in cfg.text_normalizer: - normalizer_kwargs["whitelist"] = self.register_artifact( - 'text_normalizer.whitelist', cfg.text_normalizer.whitelist - ) - - try: - import nemo_text_processing - - self.normalizer = instantiate(cfg.text_normalizer, **normalizer_kwargs) - except Exception as e: - logging.error(e) - raise ImportError( - "`nemo_text_processing` not installed, see https://github.com/NVIDIA/NeMo-text-processing for more details" - ) - - self.text_normalizer_call = self.normalizer.normalize - if "text_normalizer_call_kwargs" in cfg: - self.text_normalizer_call_kwargs = cfg.text_normalizer_call_kwargs - def _setup_tokenizer(self, cfg): text_tokenizer_kwargs = {} if "g2p" in cfg.text_tokenizer: @@ -163,12 +140,14 @@ def _setup_tokenizer(self, cfg): if "phoneme_dict" in cfg.text_tokenizer.g2p: g2p_kwargs["phoneme_dict"] = self.register_artifact( - 'text_tokenizer.g2p.phoneme_dict', cfg.text_tokenizer.g2p.phoneme_dict, + 'text_tokenizer.g2p.phoneme_dict', + cfg.text_tokenizer.g2p.phoneme_dict, ) if "heteronyms" in cfg.text_tokenizer.g2p: g2p_kwargs["heteronyms"] = self.register_artifact( - 'text_tokenizer.g2p.heteronyms', cfg.text_tokenizer.g2p.heteronyms, + 'text_tokenizer.g2p.heteronyms', + cfg.text_tokenizer.g2p.heteronyms, ) text_tokenizer_kwargs["g2p"] = instantiate(cfg.text_tokenizer.g2p, **g2p_kwargs) @@ -269,7 +248,10 @@ def _metrics( def run_aligner(self, text, text_len, text_mask, spect, spect_len, attn_prior): text_emb = self.symbol_emb(text) attn_soft, attn_logprob = self.aligner( - spect, text_emb.permute(0, 2, 1), mask=text_mask == 0, attn_prior=attn_prior, + spect, + text_emb.permute(0, 2, 1), + mask=text_mask == 0, + attn_prior=attn_prior, ) attn_hard = binarize_attention_parallel(attn_soft, text_len, spect_len) attn_hard_dur 
= attn_hard.sum(2)[:, 0, :] @@ -444,7 +426,16 @@ def training_step(self, batch, batch_idx): pitch = (pitch - self.pitch_mean) / self.pitch_std pitch[zero_pitch_idx] = 0.0 - (pred_spect, _, pred_log_durs, pred_pitch, attn_soft, attn_logprob, attn_hard, attn_hard_dur,) = self( + ( + pred_spect, + _, + pred_log_durs, + pred_pitch, + attn_soft, + attn_logprob, + attn_hard, + attn_hard_dur, + ) = self( text=text, text_len=text_len, pitch=pitch, @@ -454,7 +445,17 @@ def training_step(self, batch, batch_idx): lm_tokens=lm_tokens, ) - (loss, durs_loss, acc, acc_dist_1, acc_dist_3, pitch_loss, mel_loss, ctc_loss, bin_loss,) = self._metrics( + ( + loss, + durs_loss, + acc, + acc_dist_1, + acc_dist_3, + pitch_loss, + mel_loss, + ctc_loss, + bin_loss, + ) = self._metrics( pred_durs=pred_log_durs, pred_pitch=pred_pitch, true_durs=attn_hard_dur, @@ -496,7 +497,16 @@ def validation_step(self, batch, batch_idx): pitch = (pitch - self.pitch_mean) / self.pitch_std pitch[zero_pitch_idx] = 0.0 - (pred_spect, _, pred_log_durs, pred_pitch, attn_soft, attn_logprob, attn_hard, attn_hard_dur,) = self( + ( + pred_spect, + _, + pred_log_durs, + pred_pitch, + attn_soft, + attn_logprob, + attn_hard, + attn_hard_dur, + ) = self( text=text, text_len=text_len, pitch=pitch, @@ -506,7 +516,17 @@ def validation_step(self, batch, batch_idx): lm_tokens=lm_tokens, ) - (loss, durs_loss, acc, acc_dist_1, acc_dist_3, pitch_loss, mel_loss, ctc_loss, bin_loss,) = self._metrics( + ( + loss, + durs_loss, + acc, + acc_dist_1, + acc_dist_3, + pitch_loss, + mel_loss, + ctc_loss, + bin_loss, + ) = self._metrics( pred_durs=pred_log_durs, pred_pitch=pred_pitch, true_durs=attn_hard_dur, @@ -605,7 +625,9 @@ def validation_step(self, batch, batch_idx): "raw_texts": [NeuralType(optional=True)], "lm_model": NeuralType(optional=True), }, - output_types={"spect": NeuralType(('B', 'D', 'T_spec'), MelSpectrogramType()),}, + output_types={ + "spect": NeuralType(('B', 'D', 'T_spec'), MelSpectrogramType()), + }, ) def generate_spectrogram( self, @@ -694,7 +716,9 @@ def _loader(self, cfg): text_tokenizer=self.tokenizer, ) return torch.utils.data.DataLoader( # noqa - dataset=dataset, collate_fn=dataset.collate_fn, **cfg.dataloader_params, + dataset=dataset, + collate_fn=dataset.collate_fn, + **cfg.dataloader_params, ) def setup_training_data(self, cfg): @@ -749,7 +773,11 @@ def output_types(self): def input_example(self, max_text_len=10, max_lm_tokens_len=10): text = torch.randint( - low=0, high=len(self.tokenizer.tokens), size=(1, max_text_len), device=self.device, dtype=torch.long, + low=0, + high=len(self.tokenizer.tokens), + size=(1, max_text_len), + device=self.device, + dtype=torch.long, ) inputs = {'text': text} diff --git a/nemo/collections/tts/models/radtts.py b/nemo/collections/tts/models/radtts.py index 959720910f11..82f85d1ed6a2 100644 --- a/nemo/collections/tts/models/radtts.py +++ b/nemo/collections/tts/models/radtts.py @@ -296,7 +296,9 @@ def _loader(self, cfg): text_tokenizer=self.tokenizer, ) return torch.utils.data.DataLoader( # noqa - dataset=dataset, collate_fn=dataset.collate_fn, **cfg.dataloader_params, + dataset=dataset, + collate_fn=dataset.collate_fn, + **cfg.dataloader_params, ) def setup_training_data(self, cfg): @@ -315,7 +317,9 @@ def setup_test_data(self, cfg): "speaker": NeuralType(('B'), Index(), optional=True), "sigma": NeuralType(optional=True), }, - output_types={"spect": NeuralType(('B', 'D', 'T_spec'), MelSpectrogramType()),}, + output_types={ + "spect": NeuralType(('B', 'D', 'T_spec'), MelSpectrogramType()), + }, ) 
def generate_spectrogram(self, tokens: 'torch.tensor', speaker: int = 0, sigma: float = 1.0) -> torch.tensor: self.eval() @@ -350,12 +354,14 @@ def _setup_tokenizer(self, cfg): if "phoneme_dict" in cfg.text_tokenizer.g2p: g2p_kwargs["phoneme_dict"] = self.register_artifact( - 'text_tokenizer.g2p.phoneme_dict', cfg.text_tokenizer.g2p.phoneme_dict, + 'text_tokenizer.g2p.phoneme_dict', + cfg.text_tokenizer.g2p.phoneme_dict, ) if "heteronyms" in cfg.text_tokenizer.g2p: g2p_kwargs["heteronyms"] = self.register_artifact( - 'text_tokenizer.g2p.heteronyms', cfg.text_tokenizer.g2p.heteronyms, + 'text_tokenizer.g2p.heteronyms', + cfg.text_tokenizer.g2p.heteronyms, ) text_tokenizer_kwargs["g2p"] = instantiate(cfg.text_tokenizer.g2p, **g2p_kwargs) @@ -374,29 +380,6 @@ def _setup_tokenizer(self, cfg): self.text_tokenizer_pad_id = text_tokenizer_pad_id self.tokens = tokens - def _setup_normalizer(self, cfg): - if "text_normalizer" in cfg: - normalizer_kwargs = {} - - if "whitelist" in cfg.text_normalizer: - normalizer_kwargs["whitelist"] = self.register_artifact( - 'text_normalizer.whitelist', cfg.text_normalizer.whitelist - ) - - try: - import nemo_text_processing - - self.normalizer = instantiate(cfg.text_normalizer, **normalizer_kwargs) - self.text_normalizer_call = self.normalizer.normalize - except Exception as e: - logging.error(e) - raise ImportError( - "`nemo_text_processing` not installed, see https://github.com/NVIDIA/NeMo-text-processing for more details" - ) - self.text_normalizer_call = self.normalizer.normalize - if "text_normalizer_call_kwargs" in cfg: - self.text_normalizer_call_kwargs = cfg.text_normalizer_call_kwargs - def parse(self, text: str, normalize=False) -> torch.Tensor: if self.training: logging.warning("parse() is meant to be called in eval mode.") @@ -479,7 +462,11 @@ def input_example(self, max_batch=1, max_dim=400): inp[inp == pad_id] = pad_id - 1 if pad_id > 0 else pad_id + 1 inputs.update( - {'speaker_id': speaker, 'speaker_id_text': speaker, 'speaker_id_attributes': speaker,} + { + 'speaker_id': speaker, + 'speaker_id_text': speaker, + 'speaker_id_attributes': speaker, + } ) new_inputs = { 'text': inp, @@ -495,11 +482,24 @@ def input_example(self, max_batch=1, max_dim=400): return (new_inputs,) def forward_for_export( - self, text, batch_lengths, speaker_id, speaker_id_text, speaker_id_attributes, pitch, pace, volume, + self, + text, + batch_lengths, + speaker_id, + speaker_id_text, + speaker_id_attributes, + pitch, + pace, + volume, ): if self.export_config["enable_ragged_batches"]: text, pitch, pace, volume_tensor, lens = batch_from_ragged( - text, pitch, pace, batch_lengths=batch_lengths, padding_idx=self.tokenizer_pad, volume=volume, + text, + pitch, + pace, + batch_lengths=batch_lengths, + padding_idx=self.tokenizer_pad, + volume=volume, ) if volume is not None: volume = volume_tensor diff --git a/nemo/collections/tts/models/tacotron2.py b/nemo/collections/tts/models/tacotron2.py index 3fcdee9832ef..2fb005d80ca6 100644 --- a/nemo/collections/tts/models/tacotron2.py +++ b/nemo/collections/tts/models/tacotron2.py @@ -322,29 +322,6 @@ def on_validation_epoch_end(self): self.log('val_loss', avg_loss) self.validation_step_outputs.clear() # free memory - def _setup_normalizer(self, cfg): - if "text_normalizer" in cfg: - normalizer_kwargs = {} - - if "whitelist" in cfg.text_normalizer: - normalizer_kwargs["whitelist"] = self.register_artifact( - 'text_normalizer.whitelist', cfg.text_normalizer.whitelist - ) - - try: - import nemo_text_processing - - self.normalizer = 
instantiate(cfg.text_normalizer, **normalizer_kwargs) - except Exception as e: - logging.error(e) - raise ImportError( - "`nemo_text_processing` not installed, see https://github.com/NVIDIA/NeMo-text-processing for more details" - ) - - self.text_normalizer_call = self.normalizer.normalize - if "text_normalizer_call_kwargs" in cfg: - self.text_normalizer_call_kwargs = cfg.text_normalizer_call_kwargs - def _setup_tokenizer(self, cfg): text_tokenizer_kwargs = {} if "g2p" in cfg.text_tokenizer and cfg.text_tokenizer.g2p is not None: @@ -362,12 +339,14 @@ def _setup_tokenizer(self, cfg): if "phoneme_dict" in cfg.text_tokenizer.g2p: g2p_kwargs["phoneme_dict"] = self.register_artifact( - 'text_tokenizer.g2p.phoneme_dict', cfg.text_tokenizer.g2p.phoneme_dict, + 'text_tokenizer.g2p.phoneme_dict', + cfg.text_tokenizer.g2p.phoneme_dict, ) if "heteronyms" in cfg.text_tokenizer.g2p: g2p_kwargs["heteronyms"] = self.register_artifact( - 'text_tokenizer.g2p.heteronyms', cfg.text_tokenizer.g2p.heteronyms, + 'text_tokenizer.g2p.heteronyms', + cfg.text_tokenizer.g2p.heteronyms, ) text_tokenizer_kwargs["g2p"] = instantiate(cfg.text_tokenizer.g2p, **g2p_kwargs) diff --git a/nemo/collections/tts/models/vits.py b/nemo/collections/tts/models/vits.py index 319221d04ee0..4a891fa8823e 100644 --- a/nemo/collections/tts/models/vits.py +++ b/nemo/collections/tts/models/vits.py @@ -92,28 +92,6 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.automatic_optimization = False - def _setup_normalizer(self, cfg): - if "text_normalizer" in cfg: - normalizer_kwargs = {} - - if "whitelist" in cfg.text_normalizer: - normalizer_kwargs["whitelist"] = self.register_artifact( - 'text_normalizer.whitelist', cfg.text_normalizer.whitelist - ) - - try: - import nemo_text_processing - - self.normalizer = instantiate(cfg.text_normalizer, **normalizer_kwargs) - self.text_normalizer_call = self.normalizer.normalize - except Exception as e: - logging.error(e) - raise ImportError( - "`nemo_text_processing` not installed, see https://github.com/NVIDIA/NeMo-text-processing for more details" - ) - if "text_normalizer_call_kwargs" in cfg: - self.text_normalizer_call_kwargs = cfg.text_normalizer_call_kwargs - def _setup_tokenizer(self, cfg): text_tokenizer_kwargs = {} if "g2p" in cfg.text_tokenizer and cfg.text_tokenizer.g2p is not None: @@ -131,12 +109,14 @@ def _setup_tokenizer(self, cfg): if "phoneme_dict" in cfg.text_tokenizer.g2p: g2p_kwargs["phoneme_dict"] = self.register_artifact( - 'text_tokenizer.g2p.phoneme_dict', cfg.text_tokenizer.g2p.phoneme_dict, + 'text_tokenizer.g2p.phoneme_dict', + cfg.text_tokenizer.g2p.phoneme_dict, ) if "heteronyms" in cfg.text_tokenizer.g2p: g2p_kwargs["heteronyms"] = self.register_artifact( - 'text_tokenizer.g2p.heteronyms', cfg.text_tokenizer.g2p.heteronyms, + 'text_tokenizer.g2p.heteronyms', + cfg.text_tokenizer.g2p.heteronyms, ) text_tokenizer_kwargs["g2p"] = instantiate(cfg.text_tokenizer.g2p, **g2p_kwargs) @@ -164,8 +144,14 @@ def configure_optimizers(self): sched_config = optim_config.pop("sched", None) OmegaConf.set_struct(optim_config, True) - optim_g = instantiate(optim_config, params=self.net_g.parameters(),) - optim_d = instantiate(optim_config, params=self.net_d.parameters(),) + optim_g = instantiate( + optim_config, + params=self.net_g.parameters(), + ) + optim_d = instantiate( + optim_config, + params=self.net_d.parameters(), + ) if sched_config is not None: if sched_config.name == 'ExponentialLR': @@ -173,10 +159,14 @@ def configure_optimizers(self): scheduler_g = 
torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=sched_config.lr_decay) elif sched_config.name == 'CosineAnnealing': scheduler_g = CosineAnnealing( - optimizer=optim_g, max_steps=sched_config.max_steps, min_lr=sched_config.min_lr, + optimizer=optim_g, + max_steps=sched_config.max_steps, + min_lr=sched_config.min_lr, ) scheduler_d = CosineAnnealing( - optimizer=optim_d, max_steps=sched_config.max_steps, min_lr=sched_config.min_lr, + optimizer=optim_d, + max_steps=sched_config.max_steps, + min_lr=sched_config.min_lr, ) else: raise ValueError("Unknown optimizer.") @@ -362,7 +352,9 @@ def _loader(self, cfg): text_tokenizer=self.tokenizer, ) return torch.utils.data.DataLoader( # noqa - dataset=dataset, collate_fn=dataset.collate_fn, **cfg.dataloader_params, + dataset=dataset, + collate_fn=dataset.collate_fn, + **cfg.dataloader_params, ) def train_dataloader(self): @@ -377,7 +369,10 @@ def train_dataloader(self): train_sampler = DistributedBucketSampler(dataset, **self.cfg.train_ds.batch_sampler) dataloader = torch.utils.data.DataLoader( - dataset, collate_fn=dataset.collate_fn, batch_sampler=train_sampler, **self.cfg.train_ds.dataloader_params, + dataset, + collate_fn=dataset.collate_fn, + batch_sampler=train_sampler, + **self.cfg.train_ds.dataloader_params, ) return dataloader @@ -412,7 +407,9 @@ def list_available_models(cls) -> 'List[PretrainedModelInfo]': return list_of_models @typecheck( - input_types={"tokens": NeuralType(('B', 'T_text'), TokenIndex(), optional=True),}, + input_types={ + "tokens": NeuralType(('B', 'T_text'), TokenIndex(), optional=True), + }, output_types={"audio": NeuralType(('B', 'T_audio'), AudioSignal())}, ) def convert_text_to_waveform(self, *, tokens, speakers=None): diff --git a/nemo/collections/vlm/__init__.py b/nemo/collections/vlm/__init__.py index 2aeeae299a7d..2d0c00794c20 100644 --- a/nemo/collections/vlm/__init__.py +++ b/nemo/collections/vlm/__init__.py @@ -2,6 +2,7 @@ DataConfig, ImageDataConfig, ImageToken, + LlavaNextTaskEncoder, MockDataModule, MultiModalToken, NevaLazyDataModule, @@ -38,4 +39,5 @@ "Llava1_5Config7B", "Llava1_5Config13B", "LlavaModel", + "LlavaNextTaskEncoder", ] diff --git a/nemo/collections/vlm/neva/data/__init__.py b/nemo/collections/vlm/neva/data/__init__.py index bbd502e21c80..9a02edce8530 100644 --- a/nemo/collections/vlm/neva/data/__init__.py +++ b/nemo/collections/vlm/neva/data/__init__.py @@ -14,6 +14,7 @@ from nemo.collections.vlm.neva.data.config import DataConfig, ImageDataConfig, VideoDataConfig from nemo.collections.vlm.neva.data.lazy import NevaLazyDataModule +from nemo.collections.vlm.neva.data.llava_next_energon import LlavaNextTaskEncoder from nemo.collections.vlm.neva.data.mock import MockDataModule from nemo.collections.vlm.neva.data.multimodal_tokens import ImageToken, MultiModalToken, VideoToken @@ -26,4 +27,5 @@ "MultiModalToken", "ImageToken", "VideoToken", + "LlavaNextTaskEncoder", ] diff --git a/nemo/collections/vlm/neva/data/llava_next_energon.py b/nemo/collections/vlm/neva/data/llava_next_energon.py new file mode 100644 index 000000000000..f50eedfb10a4 --- /dev/null +++ b/nemo/collections/vlm/neva/data/llava_next_energon.py @@ -0,0 +1,107 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass, field +from typing import Dict, List +import torch +from megatron.energon import VQASample, batch_list, batch_pad_stack +from torch.nn.utils.rnn import pad_sequence +from nemo.collections.multimodal.data.energon.config import ImageTextRawBatch, ImageTextSample, MultiModalSampleConfig +from nemo.collections.multimodal.data.energon.sample_encoder import SampleEncoder, VQASampleEncoder +from nemo.collections.multimodal.data.energon.task_encoder import MultiModalTaskEncoder +from nemo.utils import logging + + +class LlavaNextTextSample(ImageTextSample): + num_media_tiles: int = 0 + + +@dataclass +class LlavaNextTextRawBatch(ImageTextRawBatch): + num_media_tiles: List[int] = field(default_factory=list) + + +class LlavaNextSampleEncoder(VQASampleEncoder): + def __init__(self, tokenizer, image_processor, multimodal_sample_config=MultiModalSampleConfig()): + """ + Initialize the VQASampleEncoder. + + Parameters: + tokenizer (Tokenizer): The HF tokenizer used for processing text. + image_processor (ImageProcessor): The HF image processor used for preprocessing images. + multimodal_sample_config (MultiModalSampleConfig, optional): Configuration object for multimodal samples. + Defaults to MultiModalSampleConfig(). + """ + super().__init__(tokenizer, image_processor, multimodal_sample_config) + + def process_image(self, image): + image_array = self.image_processor.preprocess(image, return_tensors='pt', do_rescale=False)['pixel_values'][0] + return image_array + + def encode(self, input_sample: VQASample, output_sample: LlavaNextTextSample): + conversation_prompt = self.apply_prompt_template(input_sample) + logging.debug(f"task encoder encode_sample conversation_prompt {conversation_prompt}") + # tokenize prompt + tokens = self.tokenize(conversation_prompt) + labels = self.compute_labels(tokens, input_sample) + + tokens = tokens[:-1].contiguous() + labels = labels[1:].contiguous() + logging.debug(f"task encoder encode_sample after tokenize prompt tokens {tokens}") + logging.debug(f"task encoder encode_sample lables {labels}") + loss_mask = self.compute_loss_mask(labels) + processed_image = self.process_image(input_sample.image) + output_sample.__key__ = input_sample.__key__ + output_sample.images = processed_image + output_sample.tokens = tokens + output_sample.labels = labels + output_sample.loss_mask = loss_mask + output_sample.num_media_tiles = processed_image.shape[0] + return output_sample + + +class LlavaNextTaskEncoder(MultiModalTaskEncoder): + def __init__(self, tokenizer, image_processor, multimodal_sample_config): + super().__init__(tokenizer, image_processor, multimodal_sample_config) + self.encoders: Dict[str, SampleEncoder] = { + VQASample.__name__: LlavaNextSampleEncoder(tokenizer, image_processor, multimodal_sample_config) + } + + def batch(self, samples: List[LlavaNextTextSample]) -> LlavaNextTextRawBatch: + keys, images, tokens, labels, loss_mask, num_media_tiles = [], [], [], [], [], [] + for sample in samples: + keys.append(sample.__key__) + images.append(sample.images) + tokens.append(sample.tokens) + labels.append(sample.labels) + 
loss_mask.append(sample.loss_mask) + num_media_tiles.append(sample.num_media_tiles) + + batch_keys = batch_list(keys) + + batch_images = torch.cat(images, dim=0) + + batch_tokens = pad_sequence(tokens, batch_first=True) + batch_labels = pad_sequence(labels, batch_first=True) + + batch_loss_mask = batch_pad_stack(loss_mask) + batch_num_media_tiles = torch.tensor(batch_list(num_media_tiles), dtype=torch.int) + return LlavaNextTextRawBatch( + __keys__=batch_keys, + images=batch_images, + tokens=batch_tokens, + labels=batch_labels, + loss_mask=batch_loss_mask, + num_media_tiles=batch_num_media_tiles, + ) diff --git a/nemo/collections/vlm/neva/model/base.py b/nemo/collections/vlm/neva/model/base.py index 7d0c53b79321..70e7b55c86d3 100644 --- a/nemo/collections/vlm/neva/model/base.py +++ b/nemo/collections/vlm/neva/model/base.py @@ -64,9 +64,7 @@ def neva_data_step(dataloader_iter) -> Dict[str, torch.Tensor]: # Based on: https://github.com/NVIDIA/Megatron-LM/blob/main/pretrain_gpt.py#L87 # https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py#L828-L842 - batch = next(dataloader_iter) - _batch: dict if isinstance(batch, tuple) and len(batch) == 3: _batch = batch[0] @@ -75,6 +73,7 @@ def neva_data_step(dataloader_iter) -> Dict[str, torch.Tensor]: required_keys = set() required_keys.add("attention_mask") + required_keys.add("num_media_tiles") if parallel_state.is_pipeline_first_stage(): required_keys.update(("media", "tokens", "position_ids")) if parallel_state.is_pipeline_last_stage(): @@ -98,6 +97,7 @@ def neva_forward_step(model, batch) -> torch.Tensor: "attention_mask": batch.get("attention_mask", None), "loss_mask": batch.get("loss_mask", None), "labels": batch.get("labels", None), + "num_media_tiles": batch.get("num_media_tiles", 1), } if 'cu_seqlens' in batch: @@ -252,6 +252,7 @@ def configure_model(self, tokenizer) -> "MCoreLLaVAModel": loaded_state_dict = {k.removeprefix("module."): v for k, v in loaded_state_dict["state_dict"].items()} language_model.load_state_dict(loaded_state_dict) logging.info(f"Restored language model weights from {self.language_model_from_pretrained}") + model = MCoreNevaModel( transformer_config=self, language_model=language_model, @@ -530,6 +531,7 @@ def forward( output (torch.Tensor): Loss of shape [b, s] if labels are provided, otherwise logits of shape [b, s, vocab_size]. loss_mask (torch.Tensor): Loss mask expanded to combined sequence length. Shape [b, s]. """ + use_inference_kv_cache = ( inference_params is not None and "image_tokens_count" in inference_params.key_value_memory_dict ) @@ -588,7 +590,8 @@ def forward( # Assume 1 tile per image if the number of tiles is not provided. if num_media_tiles is None: num_media_tiles = torch.ones(media.shape[0], dtype=torch.int, device=input_ids.device) - + elif isinstance(num_media_tiles, list): + num_media_tiles = torch.tensor(num_media_tiles, dtype=torch.int, device=input_ids.device) # Preprocess input, labels and loss mask. 
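# num_media_tiles plumbing in this diff: LlavaNextTaskEncoder.batch() (above) collects one
# tile count per sample, neva_data_step() keeps the "num_media_tiles" key in the batch,
# neva_forward_step() forwards it, and the branch added just above normalizes a plain
# Python list into an int tensor on the input device, roughly
#     num_media_tiles = torch.tensor([2, 3, 1], dtype=torch.int, device=input_ids.device)
# before the embeddings are handed to _preprocess_data().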
combined_embeddings, final_labels, final_loss_mask = self._preprocess_data( media_embeddings, @@ -647,6 +650,7 @@ def forward( media: Optional[torch.Tensor] = None, labels: Optional[torch.Tensor] = None, inference_params: InferenceParams = None, + num_media_tiles: Optional[List[int]] = None, ) -> torch.Tensor: output_tensor = self.module( media=media, @@ -656,6 +660,7 @@ def forward( attention_mask=attention_mask, labels=labels, inference_params=inference_params, + num_media_tiles=num_media_tiles, ) return output_tensor diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index a7107974fbaa..fb43224d59a9 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -176,6 +176,7 @@ def export( multiple_profiles: bool = False, gpt_attention_plugin: str = "auto", gemm_plugin: str = "auto", + use_mcore_path: bool = False, reduce_fusion: bool = True, fp8_quantized: Optional[bool] = None, fp8_kvcache: Optional[bool] = None, @@ -213,11 +214,11 @@ def export( multiple_profiles: (bool): enables multiple profiles feature of TRT-LLM. Default = False gpt_attention_plugin (str): enable the gpt attention plugin. Default = "auto" gemm_plugin (str): enable the gpt plugin. Default = "auto" + use_mcore_path (bool) : Use the more recent mcore path for export reduce_fusion (bool): enables fusing extra kernels after custom TRT-LLM allReduce fp8_quantized (Optional[bool]): enables exporting to FP8 TRT-LLM checkpoints. If not set, autodetects the type. fp8_kvcache (Optional[bool]): enables FP8 KV-cache quantization. If not set, autodetects the type. """ - if n_gpus is not None: warnings.warn( "Parameter n_gpus is deprecated and will be removed in the next release. " @@ -326,53 +327,169 @@ def export( "Supported model types are: {1}.".format(model_type, self.get_supported_models_list) ) - if model_type == "gpt" or model_type == "starcoder": - model_type = "gptnext" + model, model_configs, self.tokenizer = load_nemo_model(nemo_checkpoint_path, nemo_export_dir) + if use_mcore_path: + from megatron.core.export.data_type import DataType + from megatron.core.export.export_config import ExportConfig + from megatron.core.export.model_type import ModelType + from megatron.core.export.trtllm.model_to_trllm_mapping.default_conversion_dict import ( + DEFAULT_CONVERSION_DICT, + ) + from megatron.core.export.trtllm.trtllm_helper import TRTLLMHelper + from megatron.core.transformer.transformer_config import TransformerConfig + from tensorrt_llm.layers import MoeConfig + + def get_transformer_config(nemo_model_config): + normalization = nemo_model_config.get('normalization', 'layernorm') + transformer_config_normalization = 'LayerNorm' + layernorm_zero_centered_gamma = False + if normalization == 'layernorm1p': + layernorm_zero_centered_gamma = True + elif normalization == 'rmsnorm': + transformer_config_normalization = 'RMSNorm' + + conf = TransformerConfig( + num_layers=nemo_model_config.get('num_layers'), + moe_router_topk=nemo_model_config.get('moe_router_topk', 0), + num_attention_heads=nemo_model_config.get('num_attention_heads'), + num_query_groups=nemo_model_config.get( + 'num_query_groups', nemo_model_config['num_attention_heads'] + ), + kv_channels=nemo_model_config.get("kv_channels", None), + hidden_size=nemo_model_config.get('hidden_size'), + ffn_hidden_size=nemo_model_config.get('ffn_hidden_size'), + layernorm_epsilon=nemo_model_config.get('layernorm_epsilon'), + add_bias_linear=nemo_model_config.get('bias'), + num_moe_experts=nemo_model_config.get('num_moe_experts', None), + 
normalization=transformer_config_normalization, + layernorm_zero_centered_gamma=layernorm_zero_centered_gamma, + ) - if model_type == "mixtral": - model_type = "llama" + return conf + + # We build the transformer config using the nemo model config. + transformer_config = get_transformer_config(model_configs) + input_model_type = getattr(ModelType, model_type) + + # MCore export supports some default conversion dictionaries + mcore_model_conversion_dict = DEFAULT_CONVERSION_DICT[input_model_type] + # All Mcore conversion dicts start with "decoder.layers.4.blah.blah" , while nemo models start with "model.decoder.layers.4.blahblah". so we append model. to the keys + nemo_model_conversion_dict = { + f'model.{key}': value for key, value in mcore_model_conversion_dict.items() + } + + trtllm_helper = TRTLLMHelper( + transformer_config=transformer_config, + model_type=input_model_type, + trtllm_conversion_dict=nemo_model_conversion_dict, + position_embedding_type=model_configs.get('position_embedding_type'), + max_position_embeddings=model_configs.get('max_position_embeddings'), + rotary_percentage=model_configs.get('rotary_percentage', 1.0), + rotary_base=model_configs.get('rotary_base', 10000), + moe_tp_mode=model_configs.get('moe_tp_mode', 2), + multi_query_mode=model_configs.get("multi_query_mode", False), + activation=model_configs.get('activation', "gelu"), + seq_len_interpolation_factor=model_configs.get("seq_len_interpolation_factor"), + moe_renorm_mode=model_configs.get( + 'moe_renorm_mode', MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE + ), + share_embeddings_and_output_weights=model_configs.get( + "share_embeddings_and_output_weights", False + ), + ) - model, model_configs, self.tokenizer = load_nemo_model(nemo_checkpoint_path, nemo_export_dir) - weights_dicts, model_configs = model_to_trtllm_ckpt( - model=model, - nemo_model_config=model_configs, - nemo_export_dir=nemo_export_dir, - decoder_type=model_type, - dtype=dtype, - tensor_parallel_size=tensor_parallelism_size, - pipeline_parallel_size=pipeline_parallelism_size, - gpus_per_node=gpus_per_node, - use_parallel_embedding=use_parallel_embedding, - use_embedding_sharing=use_embedding_sharing, - fp8_quantized=fp8_quantized, - fp8_kvcache=fp8_kvcache, - ) + input_dtype = getattr(DataType, dtype) + export_config = ExportConfig( + tensor_parallelism_size, + pipeline_parallelism_size, + use_parallel_embedding, + use_embedding_sharing, + ) - for weight_dict, model_config in zip(weights_dicts, model_configs): - build_and_save_engine( - max_input_len=max_input_len, - max_output_len=max_output_len, - max_batch_size=max_batch_size, - model_config=model_config, - model_weights=weight_dict, - model_dir=self.model_dir, - model_type=model_type, - lora_ckpt_list=self.lora_ckpt_list, - use_lora_plugin=use_lora_plugin, - max_lora_rank=max_lora_rank, - lora_target_modules=lora_target_modules, - max_prompt_embedding_table_size=max_prompt_embedding_table_size, - paged_kv_cache=paged_kv_cache, - remove_input_padding=remove_input_padding, - paged_context_fmha=paged_context_fmha, - max_num_tokens=max_num_tokens, - opt_num_tokens=opt_num_tokens, - max_seq_len=max_seq_len, - multiple_profiles=multiple_profiles, - gpt_attention_plugin=gpt_attention_plugin, - gemm_plugin=gemm_plugin, + trtllm_model_weights_list, trtllm_model_config_list = ( + trtllm_helper.get_trtllm_pretrained_config_and_model_weights( + model_state_dict=model, + export_config=export_config, + dtype=input_dtype, + state_dict_split_by_layer_numbers=False, + ) + ) + + for 
trtllm_model_weights, trtllm_model_config in zip( + trtllm_model_weights_list, trtllm_model_config_list + ): + trtllm_helper.build_and_save_engine( + max_input_len=max_input_len, + max_output_len=max_output_len, + max_batch_size=max_batch_size, + engine_dir=self.model_dir, + trtllm_model_weights=trtllm_model_weights, + trtllm_model_config=trtllm_model_config, + lora_ckpt_list=self.lora_ckpt_list, + use_lora_plugin=use_lora_plugin, + max_lora_rank=max_lora_rank, + lora_target_modules=lora_target_modules, + max_prompt_embedding_table_size=max_prompt_embedding_table_size, + paged_kv_cache=paged_kv_cache, + remove_input_padding=remove_input_padding, + paged_context_fmha=paged_context_fmha, + use_refit=False, + max_num_tokens=max_num_tokens, + max_seq_len=max_seq_len, + opt_num_tokens=opt_num_tokens, + max_beam_width=1, + tokens_per_block=128, + multiple_profiles=multiple_profiles, + gpt_attention_plugin=gpt_attention_plugin, + gemm_plugin=gemm_plugin, + ) + else: + if model_type == "gpt" or model_type == "starcoder": + model_type = "gptnext" + + if model_type == "mixtral": + model_type = "llama" + + weights_dicts, model_configs = model_to_trtllm_ckpt( + model=model, + nemo_model_config=model_configs, + nemo_export_dir=nemo_export_dir, + decoder_type=model_type, + dtype=dtype, + tensor_parallel_size=tensor_parallelism_size, + pipeline_parallel_size=pipeline_parallelism_size, + gpus_per_node=gpus_per_node, + use_parallel_embedding=use_parallel_embedding, + use_embedding_sharing=use_embedding_sharing, + fp8_quantized=fp8_quantized, + fp8_kvcache=fp8_kvcache, ) + for weight_dict, model_config in zip(weights_dicts, model_configs): + build_and_save_engine( + max_input_len=max_input_len, + max_output_len=max_output_len, + max_batch_size=max_batch_size, + model_config=model_config, + model_weights=weight_dict, + model_dir=self.model_dir, + model_type=model_type, + lora_ckpt_list=self.lora_ckpt_list, + use_lora_plugin=use_lora_plugin, + max_lora_rank=max_lora_rank, + lora_target_modules=lora_target_modules, + max_prompt_embedding_table_size=max_prompt_embedding_table_size, + paged_kv_cache=paged_kv_cache, + remove_input_padding=remove_input_padding, + paged_context_fmha=paged_context_fmha, + max_num_tokens=max_num_tokens, + opt_num_tokens=opt_num_tokens, + max_seq_len=max_seq_len, + multiple_profiles=multiple_profiles, + gpt_attention_plugin=gpt_attention_plugin, + gemm_plugin=gemm_plugin, + ) + tokenizer_path = os.path.join(nemo_export_dir, "tokenizer.model") tokenizer_path_nemo2 = os.path.join(nemo_export_dir, "nemo_context") if os.path.exists(tokenizer_path): @@ -454,7 +571,6 @@ def convert_to_safe_tensors( weight_dict[k] = numpy_to_torch(v) safetensors.torch.save_file(weight_dict, os.path.join(self.model_dir, f'rank{rank}.safetensors')) - model_configs[0].to_json_file(os.path.join(self.model_dir, 'config.json')) tokenizer_path = os.path.join(nemo_export_dir, "tokenizer.model") diff --git a/nemo/export/trt_llm/converter/model_converter.py b/nemo/export/trt_llm/converter/model_converter.py index 366206c948eb..e5e9f8154d24 100755 --- a/nemo/export/trt_llm/converter/model_converter.py +++ b/nemo/export/trt_llm/converter/model_converter.py @@ -260,9 +260,7 @@ def model_to_trtllm_ckpt( if mapping.is_first_pp_rank(): embedding_weight = ( - np.ascontiguousarray( - split(weights_dict["transformer.vocab_embedding.weight"], mapping.tp_size, mapping.tp_rank) - ) + split(weights_dict["transformer.vocab_embedding.weight"], mapping.tp_size, mapping.tp_rank) if use_parallel_embedding else 
weights_dict["transformer.vocab_embedding.weight"] ) @@ -272,9 +270,7 @@ def model_to_trtllm_ckpt( pos_embedding_weight = weights_dict.get("transformer.position_embedding.weight") if pos_embedding_weight is not None: if use_parallel_embedding: - pos_embedding_weight = np.ascontiguousarray( - split(pos_embedding_weight, mapping.tp_size, mapping.tp_rank) - ) + pos_embedding_weight = split(pos_embedding_weight, mapping.tp_size, mapping.tp_rank) weights_dict_local["transformer.position_embedding.weight"] = pos_embedding_weight if mapping.is_last_pp_rank(): diff --git a/nemo/lightning/_strategy_lib.py b/nemo/lightning/_strategy_lib.py index abcd4556602f..ba4847219ed3 100644 --- a/nemo/lightning/_strategy_lib.py +++ b/nemo/lightning/_strategy_lib.py @@ -22,6 +22,8 @@ import torch from torch import nn +from nemo.lightning.megatron_init import initialize_model_parallel_for_nemo + NEMO_MEGATRON_MODEL_PARALLEL_APPSTATE_OVERRIDE = "NEMO_MEGATRON_MODEL_PARALLEL_APPSTATE_OVERRIDE" @@ -57,7 +59,6 @@ def init_parallel_ranks( seed (int, optional): The seed for random number generation. Defaults to 1234. fp8 (bool, optional): Whether to use fp8 precision for model parameters. Defaults to False. """ - from nemo.collections.nlp.modules.common.megatron.megatron_init import initialize_model_parallel_for_nemo from nemo.utils import AppState app_state = AppState() @@ -169,17 +170,20 @@ def set_model_parallel_attributes(model, parallelism): @contextmanager def megatron_lazy_init_context(config) -> Generator[None, None, None]: - from megatron.core.extensions import transformer_engine as _te + try: + from megatron.core.extensions import transformer_engine as _te - original = _te._get_extra_te_kwargs # noqa: SLF001 + original = _te._get_extra_te_kwargs # noqa: SLF001 - def _get_extra_te_kwargs_meta(c): - """Forces device to meta""" - kwargs = original(c) - kwargs['device'] = 'meta' - return kwargs + def _get_extra_te_kwargs_meta(c): + """Forces device to meta""" + kwargs = original(c) + kwargs['device'] = 'meta' + return kwargs - _te._get_extra_te_kwargs = _get_extra_te_kwargs_meta # noqa: SLF001 + _te._get_extra_te_kwargs = _get_extra_te_kwargs_meta # noqa: SLF001 + except ImportError: + pass _orig_perform_initialization = config.perform_initialization _orig_use_cpu_initialization = config.use_cpu_initialization @@ -189,7 +193,13 @@ def _get_extra_te_kwargs_meta(c): yield - _te._get_extra_te_kwargs = original # noqa: SLF001 + try: + from megatron.core.extensions import transformer_engine as _te + + _te._get_extra_te_kwargs = original # noqa: SLF001 + except ImportError: + pass + config.perform_initialization = _orig_perform_initialization config.use_cpu_initialization = _orig_use_cpu_initialization diff --git a/nemo/lightning/ckpt_utils.py b/nemo/lightning/ckpt_utils.py index a532d1335bae..a29c5f3cdbc4 100644 --- a/nemo/lightning/ckpt_utils.py +++ b/nemo/lightning/ckpt_utils.py @@ -3,7 +3,6 @@ # NeMo2 checkpoint structure is a checkpoint directory, with a WEIGHTS_PATH and CONTEXT_PATH subdirectory structure. # WEIGHTS_PATH stores the weights while CONTEXT_PATH stores the hyper-parameters. 
-WEIGHTS_PATH: str = "weights" CONTEXT_PATH: str = "context" @@ -18,12 +17,6 @@ def idempotent_path_append(base_dir: Union[str, Path], suffix) -> Path: return base_dir -def ckpt_to_weights_subdir(filepath: Union[str, Path]) -> Path: - """Given an input checkpoint filepath, clean it using `ckpt_to_dir` and then return the weights subdirectory.""" - base_dir = ckpt_to_dir(filepath=filepath) - return idempotent_path_append(base_dir, WEIGHTS_PATH) - - def ckpt_to_context_subdir(filepath: Union[str, Path]) -> Path: """Given an input checkpoint filepath, clean it using `ckpt_to_dir` and then return the context subdirectory.""" base_dir = ckpt_to_dir(filepath=filepath) diff --git a/nemo/lightning/data.py b/nemo/lightning/data.py index 0f30dfe22851..ea7d91b37214 100644 --- a/nemo/lightning/data.py +++ b/nemo/lightning/data.py @@ -139,6 +139,8 @@ def add_megatron_sampler( dataloader_type: Literal["single", "cyclic", "batch"] = "single", drop_last: bool = True, pad_samples_to_global_batch_size: bool = False, + rank: int = 0, + world_size: int = 1, # data_sharding: bool = False ) -> DataLoader: """ @@ -172,9 +174,6 @@ def add_megatron_sampler( Returns: DataLoader: A new DataLoader instance with the configured Megatron sampler. """ - - from megatron.core import parallel_state - if dataloader_type == 'single': batch_sampler = MegatronPretrainingSampler( total_samples=len(dataloader.dataset), @@ -182,8 +181,8 @@ def add_megatron_sampler( micro_batch_size=micro_batch_size, global_batch_size=global_batch_size, rampup_batch_size=rampup_batch_size, - data_parallel_rank=parallel_state.get_data_parallel_rank(), - data_parallel_size=parallel_state.get_data_parallel_world_size(), + data_parallel_rank=rank, + data_parallel_size=world_size, drop_last=drop_last, pad_samples_to_global_batch_size=pad_samples_to_global_batch_size, ) @@ -192,8 +191,8 @@ def add_megatron_sampler( total_samples=len(dataloader.dataset), consumed_samples=consumed_samples, micro_batch_size=micro_batch_size, - data_parallel_rank=parallel_state.get_data_parallel_rank(), - data_parallel_size=parallel_state.get_data_parallel_world_size(), + data_parallel_rank=rank, + data_parallel_size=world_size, drop_last=drop_last, # data_sharding=data_sharding ) @@ -207,8 +206,8 @@ def add_megatron_sampler( consumed_samples=consumed_samples, micro_batch_size=micro_batch_size, global_batch_size=global_batch_size, - data_parallel_rank=parallel_state.get_data_parallel_rank(), - data_parallel_size=parallel_state.get_data_parallel_world_size(), + data_parallel_rank=rank, + data_parallel_size=world_size, drop_last=drop_last, pad_samples_to_global_batch_size=not drop_last, ) diff --git a/nemo/lightning/io/api.py b/nemo/lightning/io/api.py index 643b671d1d85..7a702edb7f21 100644 --- a/nemo/lightning/io/api.py +++ b/nemo/lightning/io/api.py @@ -1,5 +1,6 @@ from pathlib import Path -from typing import Callable, Optional, Type +from typing import Callable, Optional, Type, overload +import fiddle as fdl import pytorch_lightning as pl @@ -7,14 +8,23 @@ from nemo.lightning.io.pl import TrainerContext -def load_context(path: Path, subpath: Optional[str] = None) -> TrainerContext: +@overload +def load_context(path: Path, subpath: Optional[str] = None, build: bool = True) -> TrainerContext: ... + + +@overload +def load_context(path: Path, subpath: Optional[str] = None, build: bool = False) -> fdl.Config[TrainerContext]: ... + + +def load_context(path: Path, subpath: Optional[str] = None, build: bool = True): """ Loads a TrainerContext from a json-file or directory. 
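A minimal usage sketch of the build flag added here (the checkpoint path is a placeholder):

import fiddle as fdl
from nemo.lightning.io.api import load_context

ctx = load_context("/path/to/checkpoint")                # built TrainerContext (default)
cfg = load_context("/path/to/checkpoint", build=False)   # fdl.Config[TrainerContext]
ctx_from_cfg = fdl.build(cfg)                            # build later if needed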
Args: path (Path): The path to the json-file or directory containing 'io.json'. subpath (Optional[str]): Subpath to selectively load only specific objects inside the TrainerContext. Defaults to None. - + build (bool): Whether to build the TrainerContext. Defaults to True. + Otherwise, the TrainerContext is returned as a Config[TrainerContext] object. Returns ------- TrainerContext: The loaded TrainerContext instance. @@ -27,7 +37,7 @@ def load_context(path: Path, subpath: Optional[str] = None) -> TrainerContext: checkpoint: TrainerContext = load_ckpt("/path/to/checkpoint", subpath="model.config") """ - return load(path, output_type=TrainerContext, subpath=subpath) + return load(path, output_type=TrainerContext, subpath=subpath, build=build) def model_importer(target: Type[ConnectorMixin], ext: str) -> Callable[[Type[ConnT]], Type[ConnT]]: diff --git a/nemo/lightning/io/artifact/base.py b/nemo/lightning/io/artifact/base.py index ec451de9753b..7d2d608c4149 100644 --- a/nemo/lightning/io/artifact/base.py +++ b/nemo/lightning/io/artifact/base.py @@ -6,10 +6,10 @@ class Artifact(ABC, Generic[ValueT]): - def __init__(self, attr: str, required: bool = True): + def __init__(self, attr: str, required: bool = True, skip: bool = False): self.attr = attr self.required = required - self.skip = False + self.skip = skip @abstractmethod def dump(self, value: ValueT, absolute_dir: Path, relative_dir: Path) -> ValueT: @@ -18,3 +18,6 @@ def dump(self, value: ValueT, absolute_dir: Path, relative_dir: Path) -> ValueT: @abstractmethod def load(self, path: Path) -> ValueT: pass + + def __repr__(self): + return f"{type(self).__name__}(skip= {self.skip}, attr= {self.attr}, required= {self.required})" diff --git a/nemo/lightning/io/artifact/file.py b/nemo/lightning/io/artifact/file.py index 1364468cde0a..1cd63b706c9a 100644 --- a/nemo/lightning/io/artifact/file.py +++ b/nemo/lightning/io/artifact/file.py @@ -2,6 +2,7 @@ import shutil from pathlib import Path from typing import Union +import fiddle as fdl from nemo.lightning.io.artifact.base import Artifact @@ -19,8 +20,7 @@ class FileArtifact(Artifact[str]): def dump(self, value: str, absolute_dir: Path, relative_dir: Path) -> str: if not pathize(value).exists(): # This is Artifact is just a string. - self.skip = True - return value + return fdl.Config(FileArtifact, attr=value, skip=True) new_value = copy_file(value, absolute_dir, relative_dir) return str(new_value) @@ -65,8 +65,7 @@ class DirOrStringArtifact(DirArtifact): def dump(self, value: str, absolute_dir: Path, relative_dir: Path) -> str: if not pathize(value).exists(): # This is Artifact is just a string. - self.skip = True - return value + return fdl.Config(DirOrStringArtifact, attr=value, skip=True) return super().dump(value, absolute_dir, relative_dir) def load(self, path: str) -> str: diff --git a/nemo/lightning/io/connector.py b/nemo/lightning/io/connector.py index 38fbda42c67d..ede6ab4fb234 100644 --- a/nemo/lightning/io/connector.py +++ b/nemo/lightning/io/connector.py @@ -8,7 +8,7 @@ from filelock import FileLock, Timeout from pytorch_lightning.trainer.states import TrainerFn -from nemo.lightning.ckpt_utils import ckpt_to_context_subdir, ckpt_to_weights_subdir +from nemo.lightning.ckpt_utils import ckpt_to_context_subdir # Dynamically inherit from the correct Path subclass based on the operating system. 
if os.name == 'nt': @@ -134,7 +134,9 @@ class ModelConnector(Connector, Generic[SourceT, TargetT]): Loads a model from the specified path, optionally using a CPU-focused strategy, and returns the model and trainer. """ - def nemo_setup(self, model: pl.LightningModule, trainer: Optional[pl.Trainer] = None) -> pl.Trainer: + def nemo_setup( + self, model: pl.LightningModule, trainer: Optional[pl.Trainer] = None, *args, **kwargs + ) -> pl.Trainer: """ Sets up the model and trainer using a specified strategy, preparing it for training or inference. @@ -150,7 +152,7 @@ def nemo_setup(self, model: pl.LightningModule, trainer: Optional[pl.Trainer] = _trainer = trainer or Trainer( devices=1, accelerator="cpu", - strategy=MegatronStrategy(ckpt_save_optimizer=False, always_save_context=True), + strategy=MegatronStrategy(ckpt_save_optimizer=False, always_save_context=True, *args, **kwargs), ) # Note: set trainer to fitting state to avoid the following code path. Feel free to refactor if we no longer # need to avoid this: @@ -182,7 +184,9 @@ def nemo_save(self, output_path: Path, trainer: pl.Trainer, dump_io: bool = True trainer.strategy.setup(trainer) output_path = Path(output_path) output_path.mkdir(parents=True, exist_ok=True) - trainer.save_checkpoint(ckpt_to_weights_subdir(output_path)) + trainer.save_checkpoint(output_path) + if getattr(trainer.strategy, "async_save", False): + trainer.strategy.checkpoint_io.maybe_finalize_save_checkpoint(blocking=True) from nemo.lightning.io.pl import TrainerContext from nemo.utils.get_rank import is_global_rank_zero diff --git a/nemo/lightning/io/mixin.py b/nemo/lightning/io/mixin.py index aa74e2cf174c..5d1738e348b1 100644 --- a/nemo/lightning/io/mixin.py +++ b/nemo/lightning/io/mixin.py @@ -584,8 +584,12 @@ def _io_path_elements_fn(x): def _artifact_transform_save(cfg: fdl.Config, output_path: Path, relative_dir: Path = "."): for artifact in getattr(cfg.__fn_or_cls__, "__io_artifacts__", []): # Allow optional artifacts - if artifact.skip: + if artifact.skip or (not hasattr(cfg, artifact.attr) and not artifact.required): continue + + if not hasattr(cfg, artifact.attr) and artifact.required: + raise ValueError(f"Artifact '{artifact.attr}' is required but not provided") + current_val = getattr(cfg, artifact.attr) if current_val is None: if artifact.required: @@ -605,6 +609,15 @@ def _artifact_transform_save(cfg: fdl.Config, output_path: Path, relative_dir: P def _artifact_transform_load(cfg: fdl.Config, path: Path): for artifact in getattr(cfg.__fn_or_cls__, "__io_artifacts__", []): + # We expect an artifact.attr to be a string or a fdl.Config. + # Some parameteres can be a string or a filepath. When those parameters are just strings, + # we will represent it with a fdl.Config, and will skip the rest of the loop (base-dir adjustment). + current_val = getattr(cfg, artifact.attr) + if isinstance(current_val, fdl.Config): + # artifact.attr is a string not a path. + setattr(cfg, artifact.attr, fdl.build(current_val).attr) + continue + if artifact.skip: continue current_val = getattr(cfg, artifact.attr) @@ -623,7 +636,7 @@ def _artifact_transform_load(cfg: fdl.Config, path: Path): pass -def load(path: Path, output_type: Type[CkptType] = Any, subpath: Optional[str] = None) -> CkptType: +def load(path: Path, output_type: Type[CkptType] = Any, subpath: Optional[str] = None, build: bool = True) -> CkptType: """ Loads a configuration from a pickle file and constructs an object of the specified type. 
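To make the new plain-string handling concrete: dump() now wraps a non-path string in a fiddle config (instead of setting self.skip), and _artifact_transform_load() rebuilds the string from it. A standalone sketch of that round trip, not taken verbatim from the diff:

import fiddle as fdl
from nemo.lightning.io.artifact.file import FileArtifact

# A value that is not an existing path is wrapped rather than copied into the checkpoint dir.
cfg = fdl.Config(FileArtifact, attr="just-a-plain-string", skip=True)

# On load, building the config and reading .attr recovers the original string,
# which is what _artifact_transform_load() does for such entries.
assert fdl.build(cfg).attr == "just-a-plain-string"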
@@ -687,4 +700,7 @@ def load(path: Path, output_type: Type[CkptType] = Any, subpath: Optional[str] = config = serialization.Deserialization(json_config).result _artifact_transform_load(config, path) + if not build: + return config + return fdl.build(config) diff --git a/nemo/lightning/io/pl.py b/nemo/lightning/io/pl.py index 51c47ad94dbb..ab18c0cada49 100644 --- a/nemo/lightning/io/pl.py +++ b/nemo/lightning/io/pl.py @@ -27,6 +27,7 @@ from nemo.lightning.io.capture import IOProtocol from nemo.lightning.io.mixin import IOMixin + try: from nemo.utils.callbacks.dist_ckpt_io import AsyncCompatibleCheckpointIO except ImportError: @@ -39,6 +40,10 @@ LightningModuleT = TypeVar("LightningModuleT", bound=pl.LightningModule) ModuleT = TypeVar("ModuleT", bound=nn.Module) +# NeMo2 checkpoint structure is a checkpoint directory, with a WEIGHTS_PATH and CONTEXT_PATH subdirectory structure. +# WEIGHTS_PATH stores the weights while CONTEXT_PATH stores the hyper-parameters. +WEIGHTS_PATH: str = "weights" + @dataclass class TrainerContext(IOMixin, Generic[LightningModuleT]): @@ -64,6 +69,26 @@ def construct_extra(cls, trainer: pl.Trainer) -> Dict[str, Any]: return extra +def ckpt_to_weights_subdir(filepath: Union[str, Path], is_saving) -> Path: + """Given an input checkpoint filepath, clean it using `ckpt_to_dir` and then return the weights subdirectory, if it exists.""" + filepath = ckpt_to_dir(filepath=filepath) + base_dir = filepath + assert isinstance(base_dir, Path) + if base_dir.parts[-1] != WEIGHTS_PATH: + maybe_base_dir = base_dir / WEIGHTS_PATH + if maybe_base_dir.is_dir() or is_saving: + base_dir = maybe_base_dir + ## handle adapter paths + if hasattr(base_dir, "base_model_path") and base_dir.base_model_path.parts[-1] != WEIGHTS_PATH: + maybe_base_model_path = base_dir.base_model_path / WEIGHTS_PATH + if maybe_base_model_path.is_dir() or is_saving: + base_dir.base_model_path = base_dir.base_model_path / WEIGHTS_PATH + if is_saving: + assert base_dir.parts[-1] == WEIGHTS_PATH + assert base_dir.parent == Path(filepath) + return base_dir + + class MegatronCheckpointIO(AsyncCompatibleCheckpointIO, IOMixin): """CheckpointIO that utilizes :func:`torch.save` and :func:`torch.load` to save and load checkpoints respectively, common for most use cases. @@ -118,7 +143,8 @@ def save_checkpoint(self, checkpoint: Dict[str, Any], path: _PATH, storage_optio f" storage_options, but {storage_options=} was provided." f" Ignoring given storage_options" ) - checkpoint_dir = ckpt_to_dir(path) + checkpoint_dir = ckpt_to_weights_subdir(path, is_saving=True) + fs = get_filesystem(checkpoint_dir) if fs.isdir(checkpoint_dir) and dist_checkpointing.check_is_distributed_checkpoint(checkpoint_dir): logging.info(f'Distributed checkpoint at path {checkpoint_dir} already exists, skipping saving') @@ -174,6 +200,11 @@ def load_checkpoint( if not fs.isdir(path): raise ValueError(f"Distributed checkpoints should be a directory. 
Found: {path}.") + # Load from ckpt_path/weights (new format) if it exists + path = ckpt_to_weights_subdir(path, is_saving=False) + if hasattr(path, "base_model_path") and not path.base_model_path.exists(): + path.base_model_path = path.base_model_path.parent + if self.save_ckpt_format == 'zarr' and self.load_directly_on_device: from megatron.core.dist_checkpointing.strategies.tensorstore import TensorStoreLoadShardedStrategy diff --git a/nemo/lightning/megatron_init.py b/nemo/lightning/megatron_init.py new file mode 100644 index 000000000000..c060d140cb8c --- /dev/null +++ b/nemo/lightning/megatron_init.py @@ -0,0 +1,413 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random + +import numpy as np +import torch + +from nemo.utils import AppState, logging + +try: + from apex.transformer.log_util import set_logging_level + + HAVE_APEX = True + +except (ImportError, ModuleNotFoundError): + + HAVE_APEX = False + +try: + from megatron.core import tensor_parallel + from megatron.core.parallel_state import ( + RankGenerator, + get_pipeline_model_parallel_rank, + set_expert_model_parallel_rank, + set_expert_model_parallel_world_size, + set_pipeline_model_parallel_rank, + set_pipeline_model_parallel_split_rank, + set_pipeline_model_parallel_world_size, + set_tensor_model_parallel_rank, + set_tensor_model_parallel_world_size, + set_virtual_pipeline_model_parallel_rank, + ) + + HAVE_MEGATRON_CORE = True + +except (ImportError, ModuleNotFoundError): + + HAVE_MEGATRON_CORE = False + +try: + from megatron.core.num_microbatches_calculator import ( + ConstantNumMicroBatchesCalculator, + get_current_global_batch_size, + get_micro_batch_size, + get_num_microbatches, + init_num_microbatches_calculator, + ) + + MCORE_MB_CALCULATOR = True + +except (ImportError, ModuleNotFoundError): + logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") + from apex.transformer.microbatches import ConstantNumMicroBatches as ConstantNumMicroBatchesCalculator + from apex.transformer.pipeline_parallel.utils import ( + get_current_global_batch_size, + get_micro_batch_size, + get_num_microbatches, + ) + from apex.transformer.pipeline_parallel.utils import ( + setup_microbatch_calculator as init_num_microbatches_calculator, + ) + + MCORE_MB_CALCULATOR = False + + +try: + from megatron.core.parallel_state import set_virtual_pipeline_model_parallel_world_size + + HAVE_INTERLEAVED = True + +except: + + HAVE_INTERLEAVED = False + + +def initialize_model_parallel_for_nemo( + world_size, + global_rank, + local_rank, + tensor_model_parallel_size=1, + expert_model_parallel_size=1, + pipeline_model_parallel_size=1, + virtual_pipeline_model_parallel_size=None, + pipeline_model_parallel_split_rank=None, + context_parallel_size=1, + micro_batch_size=None, + global_batch_size=None, + rampup_batch_size=None, + use_fp8=False, + init_mpi_proc_group=False, + seed=1234, + apex_transformer_log_level=30, + use_tp_pp_dp_mapping=False, + 
use_te_rng_tracker=False, +): + + if virtual_pipeline_model_parallel_size is not None and not HAVE_INTERLEAVED: + raise ValueError("set_virtual_pipeline_model_parallel_world_size is needed in megatron-core for interleaved.") + + # updating NeMo globals + app_state = AppState() + app_state.global_rank = global_rank + app_state.world_size = world_size + app_state.local_rank = local_rank + app_state.use_tp_pp_dp_mapping = use_tp_pp_dp_mapping + app_state.expert_model_parallel_size = expert_model_parallel_size + app_state.tensor_model_parallel_size = tensor_model_parallel_size + app_state.pipeline_model_parallel_size = pipeline_model_parallel_size + app_state.virtual_pipeline_model_parallel_size = virtual_pipeline_model_parallel_size + app_state.context_parallel_size = context_parallel_size + app_state.use_fp8 = use_fp8 + app_state.init_mpi_proc_group = init_mpi_proc_group + ( + app_state.tensor_model_parallel_rank, + app_state.pipeline_model_parallel_rank, + app_state.expert_model_parallel_rank, + app_state.model_parallel_size, + app_state.data_parallel_size, + app_state.pipeline_model_parallel_split_rank, + app_state.virtual_pipeline_model_parallel_rank, + ) = fake_initialize_model_parallel( + world_size=world_size, + rank=global_rank, + tensor_model_parallel_size_=tensor_model_parallel_size, + pipeline_model_parallel_size_=pipeline_model_parallel_size, + virtual_pipeline_model_parallel_size_=virtual_pipeline_model_parallel_size, + pipeline_model_parallel_split_rank_=pipeline_model_parallel_split_rank, + context_parallel_size_=context_parallel_size, + expert_model_parallel_size_=expert_model_parallel_size, + use_tp_pp_dp_mapping=use_tp_pp_dp_mapping, + ) + + # update apex.transformer globals + set_tensor_model_parallel_world_size(app_state.tensor_model_parallel_size) + set_tensor_model_parallel_rank(app_state.tensor_model_parallel_rank) + + set_expert_model_parallel_world_size(app_state.expert_model_parallel_size) + set_expert_model_parallel_rank(app_state.expert_model_parallel_rank) + + set_pipeline_model_parallel_rank(app_state.pipeline_model_parallel_rank) + if HAVE_INTERLEAVED: + set_virtual_pipeline_model_parallel_world_size(app_state.virtual_pipeline_model_parallel_size) + set_virtual_pipeline_model_parallel_rank(app_state.virtual_pipeline_model_parallel_rank) + set_pipeline_model_parallel_world_size(app_state.pipeline_model_parallel_size) + set_pipeline_model_parallel_split_rank(app_state.pipeline_model_parallel_split_rank) + + tensor_parallel.random.initialize_rng_tracker(use_te_rng_tracker=use_te_rng_tracker) + if seed is not None: + # @chcui not setting seed is for model conversion. always set seed for training/inference. 
+ _set_random_seed(seed) + + if global_batch_size and micro_batch_size is not None: + # TODO: add rampup_batch_size here when we have it implemented + if MCORE_MB_CALCULATOR: + from megatron.core.num_microbatches_calculator import _GLOBAL_NUM_MICROBATCHES_CALCULATOR + + if _GLOBAL_NUM_MICROBATCHES_CALCULATOR is None: + init_num_microbatches_calculator( + rank=global_rank, + global_batch_size=global_batch_size, + micro_batch_size=micro_batch_size, + data_parallel_size=app_state.data_parallel_size, + rampup_batch_size=rampup_batch_size, + ) + else: + if isinstance(_GLOBAL_NUM_MICROBATCHES_CALCULATOR, ConstantNumMicroBatchesCalculator): + assert get_current_global_batch_size() == global_batch_size + assert get_micro_batch_size() == micro_batch_size + assert get_num_microbatches() == global_batch_size // ( + micro_batch_size * app_state.data_parallel_size + ) + else: + raise Exception("Microbatch calculator already initialized.") + else: + from apex.transformer.pipeline_parallel.utils import _GLOBAL_NUM_MICROBATCHES_CALCULATOR + + if _GLOBAL_NUM_MICROBATCHES_CALCULATOR is None: + init_num_microbatches_calculator( + rank=global_rank, + global_batch_size=global_batch_size, + micro_batch_size=micro_batch_size, + data_parallel_size=app_state.data_parallel_size, + rampup_batch_size=rampup_batch_size, + ) + else: + if isinstance(_GLOBAL_NUM_MICROBATCHES_CALCULATOR, ConstantNumMicroBatchesCalculator): + assert get_current_global_batch_size() == global_batch_size + assert get_micro_batch_size() == micro_batch_size + assert get_num_microbatches() == global_batch_size // ( + micro_batch_size * app_state.data_parallel_size + ) + else: + raise Exception("Microbatch calculator already initialized.") + + app_state._is_megatron_initialized = True + + if HAVE_APEX: + set_logging_level(apex_transformer_log_level) + + +def _set_random_seed(seed_): + """Set random seed for reproducability.""" + if seed_ is not None and seed_ > 0: + # Ensure that different pipeline MP stages get different seeds. + seed = seed_ + (100 * get_pipeline_model_parallel_rank()) + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + if torch.cuda.device_count() > 0: + tensor_parallel.model_parallel_cuda_manual_seed(seed) + else: + raise ValueError('Seed ({}) should be a positive integer.'.format(seed_)) + + +def set_jit_fusion_options(): + """Set PyTorch JIT layer fusion options.""" + # set flags if we are using the 21.10 container + if torch.__version__ == "1.10.0a0+0aef44c": + # nvfuser + torch._C._jit_set_profiling_executor(True) + torch._C._jit_set_profiling_mode(True) + torch._C._jit_override_can_fuse_on_cpu(False) + torch._C._jit_override_can_fuse_on_gpu(False) + torch._C._jit_set_texpr_fuser_enabled(False) + torch._C._jit_set_nvfuser_enabled(True) + torch._C._debug_set_autodiff_subgraph_inlining(False) + + +def fake_initialize_model_parallel( + world_size, + rank, + tensor_model_parallel_size_, + pipeline_model_parallel_size_, + pipeline_model_parallel_split_rank_=None, + virtual_pipeline_model_parallel_size_=None, + expert_model_parallel_size_=1, + context_parallel_size_=1, + use_tp_pp_dp_mapping=False, +): + """ + Fake initialize model data parallel groups so that we can instantiate model parallel models before DDP is initialized. + This is needed because PTL execution flow is init model, init trainer -> call trainer.fit(model). DDP is initialized during .fit. + This function is taken from megatron.core.parallel_state and modified so that the distributed groups are not created. 
+ We only need the tensor parallel and pipeline parallel ranks to instantiate the model. + + Arguments: + tensor_model_parallel_size: number of GPUs used to parallelize model tensor. + pipeline_model_parallel_size: number of GPUs used to parallelize model pipeline. + context_parallel_size: number of GPUs used to parallelize tokens of each input. + + Let's say we have a total of 16 GPUs denoted by g0 ... g15 and we + use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize + the model pipeline. The present function will + create 8 tensor model-parallel groups, 4 pipeline model-parallel groups + and 8 data-parallel groups as: + 8 data_parallel groups: + [g0, g2], [g1, g3], [g4, g6], [g5, g7], [g8, g10], [g9, g11], [g12, g14], [g13, g15] + 8 tensor model-parallel groups: + [g0, g1], [g2, g3], [g4, g5], [g6, g7], [g8, g9], [g10, g11], [g12, g13], [g14, g15] + 4 pipeline model-parallel groups: + [g0, g4, g8, g12], [g1, g5, g9, g13], [g2, g6, g10, g14], [g3, g7, g11, g15] + Note that for efficiency, the caller should make sure adjacent ranks + are on the same DGX box. For example if we are using 2 DGX-1 boxes + with a total of 16 GPUs, rank 0 to 7 belong to the first box and + ranks 8 to 15 belong to the second box. + """ + + # Get world size and rank. Ensure some consistencies. + tensor_model_parallel_size = min(tensor_model_parallel_size_, world_size) + pipeline_model_parallel_size = min(pipeline_model_parallel_size_, world_size) + model_parallel_size = tensor_model_parallel_size * pipeline_model_parallel_size + context_parallel_size = min(context_parallel_size_, world_size) + + assert ( + world_size % (tensor_model_parallel_size * pipeline_model_parallel_size * context_parallel_size) == 0 + ), f'world_size: {world_size} must be divisible by tensor_model_parallel_size: {tensor_model_parallel_size} times pipeline_model_parallel_size {pipeline_model_parallel_size} times context_parallel_size {context_parallel_size}' + data_parallel_size = world_size // ( + tensor_model_parallel_size * pipeline_model_parallel_size * context_parallel_size + ) + + num_tensor_model_parallel_groups = world_size // tensor_model_parallel_size + num_pipeline_model_parallel_groups = world_size // pipeline_model_parallel_size + + virtual_pipeline_model_parallel_rank = None + if virtual_pipeline_model_parallel_size_ is not None: + virtual_pipeline_model_parallel_rank = 0 + + rank_generator = RankGenerator( + tp=tensor_model_parallel_size, + ep=expert_model_parallel_size_, + dp=data_parallel_size, + pp=pipeline_model_parallel_size, + cp=context_parallel_size, + order='tp-pp-dp' if use_tp_pp_dp_mapping else 'tp-cp-ep-dp-pp', + ) + + # Build the data-parallel groups. + all_data_parallel_group_ranks_with_cp = [] + for ranks in rank_generator.get_ranks('dp'): + if rank in ranks: + data_parallel_group = list(ranks) + logging.info(f'Rank {rank} has data parallel group : {data_parallel_group}') + + for ranks_with_cp in rank_generator.get_ranks('dp-cp'): + all_data_parallel_group_ranks_with_cp.append(ranks_with_cp) + if rank in ranks_with_cp: + data_parallel_group_with_cp = ranks_with_cp + logging.info( + f'Rank {rank} has combined group of data parallel and context parallel : {data_parallel_group_with_cp}' + ) + + data_parallel_rank = data_parallel_group.index(rank) + logging.info( + f'All data parallel group ranks with context parallel combined: {all_data_parallel_group_ranks_with_cp}' + ) + logging.info(f'Ranks {rank} has data parallel rank: {data_parallel_rank}') + + # Build the context-parallel groups. 
+ all_context_parallel_group_ranks = [] + for ranks in rank_generator.get_ranks('cp'): + all_context_parallel_group_ranks.append(ranks) + if rank in ranks: + context_parallel_group = ranks + logging.info(f'Rank {rank} has context parallel group: {context_parallel_group}') + + context_parallel_rank = context_parallel_group.index(rank) + logging.info(f'All context parallel group ranks: {all_context_parallel_group_ranks}') + logging.info(f'Ranks {rank} has context parallel rank: {context_parallel_rank}') + + # Build the model-parallel groups. + all_model_parallel_group_ranks = [] + for ranks in rank_generator.get_ranks('tp-pp'): + all_model_parallel_group_ranks.append(ranks) + if rank in ranks: + logging.info(f'Rank {rank} has model parallel group: {list(ranks)}') + logging.info(f'All model parallel group ranks: {all_model_parallel_group_ranks}') + + # Build the tensor model-parallel groups. + all_tensor_model_parallel_group_ranks = [] + tensor_model_parallel_group = None + for ranks in rank_generator.get_ranks('tp'): + all_tensor_model_parallel_group_ranks.append(ranks) + if rank in ranks: + tensor_model_parallel_group = ranks + logging.info(f'Rank {rank} has tensor model parallel group: {tensor_model_parallel_group}') + + tensor_model_parallel_rank = tensor_model_parallel_group.index(rank) + + logging.info(f'All tensor model parallel group ranks: {all_tensor_model_parallel_group_ranks}') + logging.info(f'Rank {rank} has tensor model parallel rank: {tensor_model_parallel_rank}') + + # EP rank + expert_model_parallel_rank = 0 + if expert_model_parallel_size_ is not None and expert_model_parallel_size_ > 1: + for ranks in rank_generator.get_ranks('ep', independent_ep=True): + if rank in ranks: + expert_model_parallel_rank = list(ranks).index(rank) + + # Build the pipeline model-parallel groups and embedding groups + # (first and last rank in each pipeline model-parallel group). + all_pipeline_model_parallel_group_ranks = [] + all_embedding_group_ranks = [] + pipeline_model_parallel_group = None + embedding_group = None + embedding_rank = None + for ranks in rank_generator.get_ranks('pp'): + all_pipeline_model_parallel_group_ranks.append(ranks) + if rank in ranks: + pipeline_model_parallel_group = ranks + logging.info(f'Rank {rank} has pipeline model parallel group: {pipeline_model_parallel_group}') + + # Setup embedding group (to exchange gradients between + # first and last stages). 
+ if len(ranks) > 1: + embedding_ranks = [ranks[0], ranks[-1]] + all_embedding_group_ranks.append(embedding_ranks) + else: + embedding_ranks = ranks + all_embedding_group_ranks.append(list(embedding_ranks)) + if rank in embedding_ranks: + embedding_group = list(embedding_ranks) + logging.info(f'Rank {rank} has embedding group: {embedding_group}') + + pipeline_model_parallel_rank = pipeline_model_parallel_group.index(rank) + if embedding_group is not None: + embedding_rank = embedding_group.index(rank) + + logging.info(f'All pipeline model parallel group ranks: {all_pipeline_model_parallel_group_ranks}') + logging.info(f'Rank {rank} has pipeline model parallel rank {pipeline_model_parallel_rank}') + logging.info(f'All embedding group ranks: {all_pipeline_model_parallel_group_ranks}') + logging.info(f'Rank {rank} has embedding rank: {embedding_rank}') + + return ( + tensor_model_parallel_rank, + pipeline_model_parallel_rank, + expert_model_parallel_rank, + model_parallel_size, + data_parallel_size, + pipeline_model_parallel_split_rank_, + virtual_pipeline_model_parallel_rank, + ) diff --git a/nemo/lightning/pytorch/callbacks/model_checkpoint.py b/nemo/lightning/pytorch/callbacks/model_checkpoint.py index 5244939eb5fb..bac04439eb51 100644 --- a/nemo/lightning/pytorch/callbacks/model_checkpoint.py +++ b/nemo/lightning/pytorch/callbacks/model_checkpoint.py @@ -49,7 +49,7 @@ class ModelCheckpoint(PTLModelCheckpoint): ``every_n_epochs`` or ``every_n_train_steps``. save_on_train_epoch_end: Whether to run checkpointing at the end of the training epoch save_optim_on_train_end: Whether to include the optimizer states in the final checkpoint - at the end of training. Only applicable when save_weights_only is ``True``. + at the end of training. Only applicable when save_weights_only is ``False``. always_save_context: Whether to dump the artifacts needed to reinintialize the current model, trainer, and dataloader to allow for reproducibility of experiments. save_context_on_train_end: Whether to dump the artifacts on_train_end regardless of whether @@ -58,7 +58,6 @@ class ModelCheckpoint(PTLModelCheckpoint): """ UNFINISHED_CHECKPOINT_SUFFIX = "-unfinished" - WEIGHTS_PATH = "weights" def __init__( self, @@ -73,7 +72,7 @@ def __init__( train_time_interval: Optional[timedelta] = None, save_on_train_epoch_end: Optional[bool] = False, # Save after training, not after validation save_optim_on_train_end: Optional[bool] = False, - always_save_context: bool = False, + always_save_context: bool = True, save_context_on_train_end: bool = True, **kwargs, ): @@ -436,7 +435,6 @@ def _save_checkpoint(self, trainer: 'pytorch_lightning.Trainer', filepath: str) # barrier_after=True, so all ranks continue after the unfinished checkpoint marker is placed. # if anything goes wrong during checkpointing, we should be able to detect that data is incomplete. - ckpt_filepath = ckpt_to_dir(filepath) / ModelCheckpoint.WEIGHTS_PATH self.set_checkpoint_unfinished_marker(filepath, barrier_after=True) ema_callback = self._ema_callback(trainer) @@ -453,15 +451,15 @@ def _save_checkpoint(self, trainer: 'pytorch_lightning.Trainer', filepath: str) if self.async_save: raise ValueError('async_save with EMA not supported') with ema_callback.save_original_optimizer_state(trainer): - super()._save_checkpoint(trainer, ckpt_filepath) + super()._save_checkpoint(trainer, filepath) # save EMA copy of the model as well. 
with ema_callback.save_ema_model(trainer): - rank_zero_info(f"Saving EMA weights to separate checkpoint {ckpt_filepath}") - ckpt_filepath = self._ema_format_filepath(ckpt_filepath) + rank_zero_info(f"Saving EMA weights to separate checkpoint {filepath}") + filepath = self._ema_format_filepath(filepath) if self.verbose: - rank_zero_info(f"Saving EMA weights to separate checkpoint {ckpt_filepath}") - super()._save_checkpoint(trainer, ckpt_filepath) + rank_zero_info(f"Saving EMA weights to separate checkpoint {filepath}") + super()._save_checkpoint(trainer, filepath) self.remove_checkpoint_unfinished_marker(filepath, barrier_before=True) else: ## Determine whether to include optimizer states in the checkpoint @@ -487,7 +485,7 @@ def _save_checkpoint(self, trainer: 'pytorch_lightning.Trainer', filepath: str) self.deferred_ckpts_to_remove.append([]) else: storage_options = None - trainer.save_checkpoint(ckpt_filepath, save_weights_only, storage_options=storage_options) + trainer.save_checkpoint(filepath, save_weights_only, storage_options=storage_options) if self.always_save_context and is_global_rank_zero(): TrainerContext.from_trainer(trainer).io_dump(ckpt_to_dir(filepath) / "context", yaml_attrs=["model"]) @@ -596,11 +594,11 @@ def _remove_unfinished_checkpoints(checkpoint_dir: Union[Path, str]) -> None: } checkpoint_filepaths = {f.resolve() for f in checkpoint_dir.rglob("*.ckpt")} - for ckpt_filepath in checkpoint_filepaths: - possible_marker_path = ModelCheckpoint.format_checkpoint_unfinished_marker_path(ckpt_filepath) + for filepath in checkpoint_filepaths: + possible_marker_path = ModelCheckpoint.format_checkpoint_unfinished_marker_path(filepath) if possible_marker_path in existing_marker_filepaths: - logging.warning(f'Removing unfinished checkpoint: {ckpt_filepath}') - os.remove(ckpt_filepath) + logging.warning(f'Removing unfinished checkpoint: {filepath}') + os.remove(filepath) # some directories might be distributed checkpoints, we remove these if they have a unfinished marker all_dirpaths = {d.resolve() for d in checkpoint_dir.glob("*") if d.is_dir()} diff --git a/nemo/lightning/pytorch/callbacks/peft.py b/nemo/lightning/pytorch/callbacks/peft.py index 1e3cde0bbcde..429035d8b1c5 100644 --- a/nemo/lightning/pytorch/callbacks/peft.py +++ b/nemo/lightning/pytorch/callbacks/peft.py @@ -14,6 +14,7 @@ import json from abc import ABC, abstractmethod +from functools import partial from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple @@ -24,9 +25,10 @@ from pytorch_lightning.trainer.states import TrainerFn from typing_extensions import override -from nemo.lightning.io.pl import ckpt_to_dir +from nemo.lightning.io.pl import ckpt_to_dir, ckpt_to_weights_subdir from nemo.lightning.pytorch.callbacks.model_transform import ModelTransform from nemo.utils import logging +from nemo.utils.callbacks.dist_ckpt_io import AsyncCompatibleCheckpointIO if TYPE_CHECKING: from megatron.core.dist_checkpointing.mapping import ShardedStateDict @@ -90,18 +92,49 @@ def __call__(self, model: nn.Module) -> nn.Module: Returns: nn.Module: The transformed model with PEFT applied. """ - - model.freeze() + self.freeze_model(model) model.walk(self.transform) return model + def freeze_model(self, model: nn.Module) -> None: + """Apply a default freeze method to the model. + + This method freezes all the model parameters. This method can be overridden by subclasses to + implement custom freeze strategies (e.g. 
freeze only parts of the model) + + Args: + model (nn.Module): The model to be fine-tuned. + + Returns: + nn.Module: The transformed model with PEFT applied. + """ + model.freeze() + model.train(mode=True) + def setup(self, trainer: pl.Trainer, pl_module: pl.LightningModule, stage: str) -> None: + from nemo.lightning.pytorch.strategies.utils import create_checkpoint_io + super().setup(trainer, pl_module, stage=stage) trainer.strategy.trainer = trainer - self.wrapped_io = WrappedAdapterIO(trainer.strategy.checkpoint_io, self) - trainer.strategy._checkpoint_io = self.wrapped_io + wrapped_io = partial(WrappedAdapterIO, peft=self) + ckpt_io_kwargs = { + "save_ckpt_format": trainer.strategy.save_ckpt_format, + "async_save": trainer.strategy.async_save, + "torch_dist_multiproc": trainer.strategy.torch_dist_multiproc, + "assume_constant_structure": trainer.strategy.assume_constant_structure, + "parallel_save": trainer.strategy.parallel_save, + "parallel_save_within_dp": trainer.strategy.parallel_save_within_dp, + "parallel_load": trainer.strategy.parallel_load, + "load_directly_on_device": trainer.strategy.load_directly_on_device, + } + trainer.strategy._checkpoint_io = create_checkpoint_io(wrapping_ckpt_io=wrapped_io, **ckpt_io_kwargs) + self.wrapped_io = ( + trainer.strategy._checkpoint_io._checkpoint_io + if trainer.strategy.async_save + else trainer.strategy._checkpoint_io + ) trainer.strategy._init_model_parallel = False trainer.strategy._setup_optimizers = False @@ -257,7 +290,7 @@ def load_state_dict(self, state_dict, strict=True): self.adapter.load_state_dict(adapter_state_dict, strict) -class WrappedAdapterIO(_WrappingCheckpointIO): +class WrappedAdapterIO(_WrappingCheckpointIO, AsyncCompatibleCheckpointIO): peft: Optional[PEFT] = None model_ckpt_path: Optional[Path] = None adapter_ckpt_path: Optional[Path] = None @@ -273,15 +306,16 @@ def save_checkpoint(self, checkpoint: Dict[str, Any], path: _PATH, storage_optio checkpoint['sharded_state_dict'] = dict( filter(lambda item: self.peft.adapter_key_filter(item[0]), checkpoint['sharded_state_dict'].items()) ) - self.checkpoint_io.save_checkpoint(checkpoint, path, storage_options=storage_options) + request = self.checkpoint_io.save_checkpoint(checkpoint, path, storage_options=storage_options) from nemo.utils.get_rank import is_global_rank_zero if is_global_rank_zero(): metadata = {"model_ckpt_path": str(self.model_ckpt_path)} - adapter_meta_path = ckpt_to_dir(path) / _ADAPTER_META_FILENAME + adapter_meta_path = ckpt_to_weights_subdir(path, is_saving=True) / _ADAPTER_META_FILENAME with open(adapter_meta_path, "w") as f: json.dump(metadata, f) + return request @override def load_checkpoint( diff --git a/nemo/lightning/pytorch/optim/__init__.py b/nemo/lightning/pytorch/optim/__init__.py index 1572e95e136a..db40e5c48c1b 100644 --- a/nemo/lightning/pytorch/optim/__init__.py +++ b/nemo/lightning/pytorch/optim/__init__.py @@ -28,6 +28,7 @@ WarmupPolicyScheduler, ) from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule +from nemo.lightning.pytorch.optim.pytorch import PytorchOptimizerModule __all__ = [ "OptimizerModule", @@ -45,4 +46,5 @@ "PolynomialDecayAnnealingScheduler", "PolynomialHoldDecayAnnealingScheduler", "CosineAnnealingScheduler", + "PytorchOptimizerModule", ] diff --git a/nemo/lightning/pytorch/optim/pytorch.py b/nemo/lightning/pytorch/optim/pytorch.py new file mode 100644 index 000000000000..6600fc0cf0a4 --- /dev/null +++ b/nemo/lightning/pytorch/optim/pytorch.py @@ -0,0 +1,132 @@ +# Copyright (c) 2024, NVIDIA 
CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Callable, List, Optional + +import pytorch_lightning as pl +from torch.optim import Optimizer + +from nemo.lightning.megatron_parallel import MegatronParallel +from nemo.lightning.pytorch.optim.base import LRSchedulerModule, OptimizerModule + + +def _param_does_not_have_wd(param_name, param): + return 'bias' in param_name + + +class PytorchOptimizerModule(OptimizerModule): + """An OptimizerModule for PyTorch optimizers. + + Attributes: + optim_cls: The PyTorch optimizer class to instantiate. + config (dict): Keyword arguments passed to the optimizer constructor. + no_weight_decay_cond (Optional[Callable]): Condition for no weight decay. + scale_lr_cond (Optional[Callable]): Condition for scaling learning rate. + lr_mult (float): Learning rate multiplier. + + Example:: + + config = {'lr': 3e-4} + lr_scheduler = MyLRSchedulerModule(...) + optimizer_module = PytorchOptimizerModule(MyOptimizer, config, lr_scheduler) + + Methods: + setup(model): Sets up the optimizer. + optimizers(model): Defines the optimizers. + """ + + def __init__( + self, + optim_cls, + config: dict = {'lr': 3e-4}, + lr_scheduler: Optional[LRSchedulerModule] = None, + no_weight_decay_cond: Optional[Callable] = _param_does_not_have_wd, + scale_lr_cond: Optional[Callable] = None, + lr_mult: float = 1.0, + ): + """Initializes the PytorchOptimizerModule. + + Args: + optim_cls: The PyTorch optimizer class to instantiate. + config (dict): Keyword arguments passed to the optimizer constructor. + lr_scheduler (Optional[LRSchedulerModule]): The learning rate scheduler module. + no_weight_decay_cond (Optional[Callable]): Condition for no weight decay. + scale_lr_cond (Optional[Callable]): Condition for scaling learning rate. + lr_mult (float): Learning rate multiplier. + """ + + super().__init__(lr_scheduler=lr_scheduler) + self.optim_cls = optim_cls + self.config = config + self.no_weight_decay_cond = no_weight_decay_cond + self.scale_lr_cond = scale_lr_cond + self.lr_mult = lr_mult + + def on_fit_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule"): + # Noop + pass + + def optimizers(self, model) -> List[Optimizer]: + """Defines the optimizers. + + Args: + model (nn.Module): The model for which the optimizers are being defined. + + Returns: + List[Optimizer]: The list of optimizers. + + Raises: + ValueError: If the model is an instance of MegatronParallel.
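+
+        Example (a minimal sketch; ``torch.optim.AdamW`` and its keyword arguments are
+        illustrative placeholders, not defaults of this module)::
+
+            import torch
+
+            optim_module = PytorchOptimizerModule(torch.optim.AdamW, config={'lr': 1e-4, 'weight_decay': 0.01})
+            # `model` is a plain nn.Module / LightningModule, not a MegatronParallel instance.
+            optimizers = optim_module.optimizers(model)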
+ """ + + if isinstance(model, MegatronParallel): + raise ValueError("Model cannot be an instance of MegatronParallel") + + params_with_wd, params_without_wd = [], [] + if self.no_weight_decay_cond is not None: + for name, param in model.named_parameters(): + if self.no_weight_decay_cond(name, param): + params_without_wd.append(param) + else: + params_with_wd.append(param) + else: + params_with_wd = model.parameters() + + optimizers = [] + if len(params_with_wd) > 0: + optimizers.append( + self.optim_cls( + params_with_wd, + **self.config, + ) + ) + + if len(params_without_wd) > 0: + wd = self.config.get('weight_decay', None) + kwargs['weight_decay'] = 0 + optimizers.append( + self.optim_cls( + params_without_wd, + **kwargs, + ) + ) + # restore value + if wd is not None: + kwargs['weight_decay'] = wd + + assert len(optimizers) > 0, "Expected at least one optimizer with params" + return optimizers + + def finalize_model_grads(self, *args, **kwargs): + # Noop + pass diff --git a/nemo/lightning/pytorch/plugins/data_sampler.py b/nemo/lightning/pytorch/plugins/data_sampler.py index 55bafce5f71e..52ba9e3220ac 100644 --- a/nemo/lightning/pytorch/plugins/data_sampler.py +++ b/nemo/lightning/pytorch/plugins/data_sampler.py @@ -65,9 +65,14 @@ def setup(self, global_rank: int) -> None: setup_microbatch_calculator(global_rank, self.micro_batch_size, self.global_batch_size, self.rampup_batch_size) def transform_dataloader(self, dataloader: DataLoader, consumed_samples: int = 0) -> DataLoader: + from megatron.core import parallel_state + from nemo.lightning.data import add_megatron_sampler mode = getattr(dataloader, 'mode', 'train') + + data_parallel_rank = parallel_state.get_data_parallel_rank() + data_parallel_size = parallel_state.get_data_parallel_world_size() return add_megatron_sampler( dataloader, micro_batch_size=self.micro_batch_size, @@ -76,6 +81,8 @@ def transform_dataloader(self, dataloader: DataLoader, consumed_samples: int = 0 consumed_samples=self.init_consumed_samples if mode == 'train' else 0, dataloader_type=self.dataloader_type, drop_last=self.drop_last, + rank=data_parallel_rank, + world_size=data_parallel_size, ) def compute_consumed_samples(self, steps_since_resume=0) -> int: diff --git a/nemo/lightning/pytorch/strategies/megatron_strategy.py b/nemo/lightning/pytorch/strategies/megatron_strategy.py index c5195511c522..c515ab2207a4 100644 --- a/nemo/lightning/pytorch/strategies/megatron_strategy.py +++ b/nemo/lightning/pytorch/strategies/megatron_strategy.py @@ -57,7 +57,6 @@ from nemo.core.optim.mcore_optim import McoreDistributedOptimizer from nemo.lightning import _strategy_lib, io -from nemo.lightning.ckpt_utils import ckpt_to_weights_subdir from nemo.lightning.megatron_parallel import ( CallbackConnector, MegatronParallel, @@ -135,7 +134,7 @@ class MegatronStrategy(DDPStrategy, io.IOMixin): save_ckpt_format (str): Distributed checkpoint format to use for checkpoint saving. Should be one of 'torch_dist' or 'zarr'. Defaults to 'torch_dist'. ckpt_async_save (bool): Whether to save checkpoints asynchronously to reduce checkpointing overhead. - Defaults to False. + Defaults to True. ckpt_torch_dist_multiproc (int): Number of extra processes per rank used during ckpt save with PyTorch distributed format. Defaults to None. ckpt_assume_constant_structure (bool): Allows caching some computation across checkpoint saves. 
@@ -145,7 +144,7 @@ class MegatronStrategy(DDPStrategy, io.IOMixin): ckpt_parallel_save_within_dp (bool): If true, save will be parallelized only within a DP group (whole world otherwise), which might slightly reduce the save overhead. Defaults to False. ckpt_parallel_load (bool): If true, each worker will load part of the dist checkpoint - and exchange with NCCL. Might use some extra GPU memory. Defaults to False. + and exchange with NCCL. Might use some extra GPU memory. Defaults to True. ckpt_parallel_save_optim (bool): Parallel save/load of a DistributedOptimizer. 'True' allows performant save and reshardable checkpoints. Set to 'False' only in order to minimize the number of checkpoint files. @@ -191,12 +190,12 @@ def __init__( lazy_init: bool = False, pipeline_dtype: Optional[torch.dtype] = None, save_ckpt_format: str = "torch_dist", - ckpt_async_save: bool = False, + ckpt_async_save: bool = True, ckpt_torch_dist_multiproc: int = None, ## TODO(ashors): put elsewhere? ckpt_assume_constant_structure: bool = False, ckpt_parallel_save: bool = True, ckpt_parallel_save_within_dp: bool = False, - ckpt_parallel_load: bool = False, + ckpt_parallel_load: bool = True, ckpt_parallel_save_optim: bool = True, ckpt_load_directly_on_device: bool = True, setup_optimizers: bool = True, @@ -267,6 +266,8 @@ def __init__( def connect(self, model: pl.LightningModule) -> None: super().connect(model) + assert not 'is_hf_model' in model.__dict__, "Cannot use HfAutoModelForCausalLM with MegatronParallel" + _maybe_mcore_config = _strategy_lib.set_model_parallel_attributes(model, self.parallelism) if _maybe_mcore_config: self._mcore_config = _maybe_mcore_config @@ -694,13 +695,7 @@ def load_checkpoint(self, checkpoint_path: Union[str, Path], selective_restore: if self.lightning_module.optimizers(use_pl_optimizer=False): sharded_state_dict["optimizer"] = [self.optimizer_sharded_state_dict(is_loading=True)] - # Load from ckpt_path/weights (new format) if it exists, otherwise load from ckpt_path (legacy format) - load_dir = ckpt_to_weights_subdir(checkpoint_path) - if not load_dir.exists(): - load_dir = checkpoint_path - if isinstance(load_dir, AdapterPath) and not load_dir.base_model_path.exists(): - load_dir.base_model_path = load_dir.base_model_path.parent - checkpoint = self.checkpoint_io.load_checkpoint(load_dir, sharded_state_dict=sharded_state_dict) + checkpoint = self.checkpoint_io.load_checkpoint(checkpoint_path, sharded_state_dict=sharded_state_dict) return checkpoint diff --git a/nemo/lightning/pytorch/strategies/utils.py b/nemo/lightning/pytorch/strategies/utils.py index 415392f2bef0..150fc14726ec 100644 --- a/nemo/lightning/pytorch/strategies/utils.py +++ b/nemo/lightning/pytorch/strategies/utils.py @@ -127,8 +127,10 @@ def ckpt_to_dir(filepath: Union[str, Path]) -> Path: return filepath -def create_checkpoint_io(**kwargs): +def create_checkpoint_io(wrapping_ckpt_io=None, **kwargs): checkpoint_io = MegatronCheckpointIO(**kwargs) + if wrapping_ckpt_io: + checkpoint_io = wrapping_ckpt_io(checkpoint_io) if kwargs.get("async_save", False): checkpoint_io = AsyncFinalizableCheckpointIO(checkpoint_io) diff --git a/nemo/lightning/run/plugins.py b/nemo/lightning/run/plugins.py index 45905729b8b1..c9a38c5979ca 100644 --- a/nemo/lightning/run/plugins.py +++ b/nemo/lightning/run/plugins.py @@ -52,14 +52,14 @@ class PreemptionPlugin(run.Plugin): preempt_time (int): The time, in seconds, before the task's time limit at which the executor will send a SIGTERM preemption signal. 
This allows tasks to be gracefully stopped before reaching their time limit, reducing waste and - promoting fair resource usage. The default value is 300 seconds (5 minutes). + promoting fair resource usage. The default value is 60 seconds (1 minute). This is only supported for ``run.SlurmExecutor``. callbacks (list[run.Config[Callback]]): A list of callback configurations that the plugin will merge with the task's existing callbacks. By default, the list includes NeMo's preemption callback. """ - preempt_time: int = 300 + preempt_time: int = 60 callbacks: list[run.Config[Callback]] = field(default_factory=lambda: [run.Config(PreemptionCallback)]) def setup(self, task: run.Partial | run.Script, executor: run.Executor): @@ -287,16 +287,16 @@ def setup(self, task: run.Partial | run.Script, executor: run.Executor): tp_size = task.trainer.strategy.tensor_model_parallel_size cp_size = task.trainer.strategy.context_parallel_size if tp_size > 1 and cp_size > 1: - executor.env_vars["CUDA_DEVICE_MAX_CONNECTIONS"] = 1 + executor.env_vars["CUDA_DEVICE_MAX_CONNECTIONS"] = "1" # Set LayerNorm SM margin to support the overlap with LayerNorm kernel if self.enable_layernorm_sm_margin: - executor.env_vars["NVTE_FWD_LAYERNORM_SM_MARGIN"] = self.layernorm_sm_margin - executor.env_vars["NVTE_BWD_LAYERNORM_SM_MARGIN"] = self.layernorm_sm_margin + executor.env_vars["NVTE_FWD_LAYERNORM_SM_MARGIN"] = str(self.layernorm_sm_margin) + executor.env_vars["NVTE_BWD_LAYERNORM_SM_MARGIN"] = str(self.layernorm_sm_margin) # Force Transformer Engine to use cuDNN attention over HazyResearch's Flash Attention - executor.env_vars["NVTE_FLASH_ATTN"] = 0 - executor.env_vars["NVTE_FUSED_ATTN"] = 1 + executor.env_vars["NVTE_FLASH_ATTN"] = "0" + executor.env_vars["NVTE_FUSED_ATTN"] = "1" # Improve perf by steering power to tensor cores, may not work on all systems if self.enable_vboost and isinstance(executor, run.SlurmExecutor): diff --git a/nemo/utils/exp_manager.py b/nemo/utils/exp_manager.py index 3d4b7189f56e..b512bc57cbab 100644 --- a/nemo/utils/exp_manager.py +++ b/nemo/utils/exp_manager.py @@ -1169,6 +1169,20 @@ def configure_checkpointing( params.filename = f'{name}--{{{params.monitor}:.4f}}-{{epoch}}' if params.prefix is None: params.prefix = name + if params.always_save_nemo: + app_state = AppState() + if ( + (app_state.tensor_model_parallel_size is not None and app_state.tensor_model_parallel_size > 1) + or (app_state.pipeline_model_parallel_size is not None and app_state.pipeline_model_parallel_size > 1) + or (app_state.context_parallel_size is not None and app_state.context_parallel_size > 1) + ): + raise LoggerMisconfigurationError( + "always_save_nemo is set to True, please ensure that model parallel is not used." + f"tensor_model_parallel_size: {app_state.tensor_model_parallel_size}," + f"pipeline_model_parallel_size: {app_state.pipeline_model_parallel_size}," + f"context_parallel_size: {app_state.context_parallel_size}," + ) + NeMoModelCheckpoint.CHECKPOINT_NAME_LAST = params.filename + '-last' logging.debug(params.dirpath) diff --git a/nemo/utils/nemo_logging.py b/nemo/utils/nemo_logging.py index 95e17e5c5f6c..bcc7ad199603 100644 --- a/nemo/utils/nemo_logging.py +++ b/nemo/utils/nemo_logging.py @@ -76,7 +76,7 @@ def __init__(self, capture_warnings=True): self.rank = 0 if is_global_rank_zero() else "UNK" def _define_logger(self, capture_warnings=True): - """ Creates the logger if not already created. Called in init""" + """Creates the logger if not already created. 
Called in init""" # Use double-checked locking to avoid taking lock unnecessarily. if self._logger is not None: @@ -126,7 +126,7 @@ def record_factory(*args, **kwargs): self._logger.propagate = False def remove_stream_handlers(self): - """ Removes StreamHandler that log to stdout and stderr from the logger.""" + """Removes StreamHandler that log to stdout and stderr from the logger.""" if self._logger is None: raise RuntimeError("Impossible to set handlers if the Logger is not predefined") @@ -236,7 +236,7 @@ def set_verbosity(self, verbosity_level): @contextmanager def patch_stderr_handler(self, stream): - """ Sends messages that should log to stderr to stream instead. Useful for unittests """ + """Sends messages that should log to stderr to stream instead. Useful for unittests""" if self._logger is not None: try: old_stream = self._handlers["stream_stderr"].stream @@ -268,7 +268,7 @@ def patch_stderr_handler(self, stream): @contextmanager def patch_stdout_handler(self, stream): - """ Sends messages that should log to stdout to stream instead. Useful for unittests """ + """Sends messages that should log to stdout to stream instead. Useful for unittests""" if self._logger is not None: try: old_stream = self._handlers["stream_stdout"].stream @@ -339,6 +339,16 @@ def captureWarnings(self, capture): warnings.showwarning = self.old_warnings_showwarning self.old_warnings_showwarning = None + def _warning_is_ignored(self, category): + from warnings import filters + + # Search the filters + for action, msg, cat, mod, ln in filters: + # least-common denominator if multiple filters for the same class. + if cat == category and action == 'ignore': + return True + return False + def _showwarning(self, message, category, filename, lineno, file=None, line=None): """ Implementation of showwarnings which redirects to logging. @@ -346,6 +356,8 @@ def _showwarning(self, message, category, filename, lineno, file=None, line=None with level logging.WARNING.
""" s = warnings.formatwarning(message, category, filename, lineno, line) + if self._warning_is_ignored(category): + return self.warning("%s", s) def _logged_once(self, msg, mode): diff --git a/requirements/requirements_common.txt b/requirements/requirements_common.txt index 616381ed5933..d8ad52452c7c 100644 --- a/requirements/requirements_common.txt +++ b/requirements/requirements_common.txt @@ -1,4 +1,5 @@ datasets +einops inflect pandas sacremoses>=0.0.43 diff --git a/requirements/requirements_lightning.txt b/requirements/requirements_lightning.txt index 171abce41f37..e8020f244821 100644 --- a/requirements/requirements_lightning.txt +++ b/requirements/requirements_lightning.txt @@ -4,6 +4,6 @@ hydra-core>1.3,<=1.3.2 omegaconf<=2.3 pytorch-lightning>2.2.1 torchmetrics>=0.11.0 -transformers>=4.44.0 +transformers>=4.45.0 wandb webdataset>=0.2.86 diff --git a/requirements/requirements_multimodal.txt b/requirements/requirements_multimodal.txt index 8b56c3974a25..18abe82c9f96 100644 --- a/requirements/requirements_multimodal.txt +++ b/requirements/requirements_multimodal.txt @@ -1,6 +1,6 @@ addict clip -decord +decord; sys_platform == 'linux' diffusers>=0.19.3 einops_exts imageio diff --git a/requirements/requirements_nlp.txt b/requirements/requirements_nlp.txt index 3d168ad3b12a..16b6c574d2fa 100644 --- a/requirements/requirements_nlp.txt +++ b/requirements/requirements_nlp.txt @@ -1,6 +1,5 @@ accelerated-scan boto3 -einops faiss-cpu fasttext flask_restful @@ -9,7 +8,7 @@ gdown h5py ijson jieba -mamba-ssm==2.2.2 +mamba-ssm==2.2.2; sys_platform == 'linux' markdown2 matplotlib>=3.3.2 #megatron_core>0.6.0 # add back once mcore on pypi is compatible again diff --git a/scripts/checkpoint_converters/convert_llama_hf_to_nemo_load.py b/scripts/checkpoint_converters/convert_llama_hf_to_nemo_load.py index f395e34765d0..42d3e77ce4c8 100644 --- a/scripts/checkpoint_converters/convert_llama_hf_to_nemo_load.py +++ b/scripts/checkpoint_converters/convert_llama_hf_to_nemo_load.py @@ -15,7 +15,7 @@ r""" Conversion script to convert Huggingface LLaMA checkpoints into nemo checkpoint. Example to run this conversion script: - python convert_llama_hf_to_nemo.py \ + python convert_llama_hf_to_nemo_load.py \ --input_name_or_path \ --input_state_dict \ --output_path \ diff --git a/scripts/checkpoint_converters/convert_llama_hf_to_nemo_save_dict.py b/scripts/checkpoint_converters/convert_llama_hf_to_nemo_save_dict.py index 940a9df5f9a8..f7096996e5b1 100644 --- a/scripts/checkpoint_converters/convert_llama_hf_to_nemo_save_dict.py +++ b/scripts/checkpoint_converters/convert_llama_hf_to_nemo_save_dict.py @@ -15,7 +15,7 @@ r""" Conversion script to convert Huggingface LLaMA checkpoints into nemo checkpoint. 
Example to run this conversion script: - python convert_llama_hf_to_nemo.py \ + python convert_llama_hf_to_nemo_save_dict.py \ --input_name_or_path \ --output_path --precision bf16 diff --git a/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py b/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py index ba9012de01a8..796819c38ba4 100644 --- a/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py +++ b/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py @@ -65,6 +65,7 @@ def load_config(hf_model_name, nemo_config): logging.warning(f"Got unknown activation function {nemo_config.activation}") hf_config.rope_theta = nemo_config['rotary_base'] + hf_config.tie_word_embeddings = getattr(nemo_config, "share_embeddings_and_output_weights", False) return hf_config @@ -213,7 +214,13 @@ def convert(in_file, precision=None, cpu_only=True) -> None: output_layer_base_name = 'model.output_layer.weight' else: output_layer_base_name = 'model.language_model.output_layer.weight' - state_dict[hf_output_layer_weight_name] = param_to_weights(ckpt[output_layer_base_name]) + + if getattr(nemo_config, "share_embeddings_and_output_weights", False): + # tie_word_embeddings: True + state_dict[hf_output_layer_weight_name] = state_dict[embed_weights_base_name] + else: + # tie_word_embeddings: False + state_dict[hf_output_layer_weight_name] = param_to_weights(ckpt[output_layer_base_name]) return state_dict, nemo_config, dtype diff --git a/scripts/checkpoint_converters/convert_zarr_to_torch_dist.py b/scripts/checkpoint_converters/convert_zarr_to_torch_dist.py index 29b56aa706fa..eeaee9aba461 100644 --- a/scripts/checkpoint_converters/convert_zarr_to_torch_dist.py +++ b/scripts/checkpoint_converters/convert_zarr_to_torch_dist.py @@ -16,14 +16,13 @@ Conversion script to convert zarr checkpoints into torch distributed checkpoint. Example to run this conversion script: python -m torch.distributed.launch --nproc_per_node= * \ - megatron_zarr_ckpt_to_torch_dist.py \ + convert_zarr_to_torch_dist.py \ --model_type \ --checkpoint_folder \ --checkpoint_name \ --path_to_save \ --tensor_model_parallel_size \ --pipeline_model_parallel_size \ - --hparams_file \ --gpus_per_node """ @@ -64,12 +63,14 @@ def get_args(): "--hparams_file", type=str, default=None, - required=True, + required=False, help="Path config for restoring. It's created during training and may need to be modified during restore if restore environment is different than training. 
Ex: /raid/nemo_experiments/megatron_gpt/hparams.yaml", ) parser.add_argument("--path_to_save", type=str, default=None, required=True, help="Path to output ckpt files.") parser.add_argument( - "--save_to_nemo", action="store_true", help="If passed, output will be written as .nemo file.", + "--save_to_nemo", + action="store_true", + help="If passed, output will be written as .nemo file.", ) parser.add_argument("--gpus_per_node", type=int, required=True, default=None) parser.add_argument("--tensor_model_parallel_size", type=int, required=True, default=None) @@ -81,7 +82,7 @@ def get_args(): default=None, help="If pipeline parallel size > 1, this is the rank at which the encoder ends and the decoder begins.", ) - parser.add_argument("--local_rank", type=int, required=False, default=os.getenv('LOCAL_RANK', -1)) + parser.add_argument("--local-rank", type=int, required=False, default=os.getenv('LOCAL_RANK', -1)) parser.add_argument("--cluster_type", required=False, default=None, help="Whether on BCP platform") parser.add_argument( "--precision", @@ -93,7 +94,18 @@ def get_args(): ) parser.add_argument( - "--model_type", type=str, required=True, default="gpt", choices=["gpt", "sft", "bert"], + "--model_type", + type=str, + required=True, + default="gpt", + choices=["gpt", "sft", "bert"], + ), + parser.add_argument( + "--ckpt_format", + type=str, + required=False, + default="torch_dist", + choices=["zarr", "torch_dist"], ) args = parser.parse_args() @@ -114,7 +126,7 @@ def convert(local_rank, rank, world_size, args): 'precision': args.precision, }, 'model': { - 'native_amp_init_scale': 2 ** 32, + 'native_amp_init_scale': 2**32, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'gradient_as_bucket_view': True, @@ -167,7 +179,7 @@ def convert(local_rank, rank, world_size, args): ) with open_dict(model.cfg): - model.cfg.torch_distributed_checkpoint = True + model.cfg.dist_ckpt_format = args.ckpt_format model._save_restore_connector = NLPSaveRestoreConnector() save_file_path = args.path_to_save diff --git a/tests/collections/llm/bitexact/mixtral/run.sh b/tests/collections/llm/bitexact/mixtral/run.sh index c32dbbc95b98..0fe9e331b18a 100644 --- a/tests/collections/llm/bitexact/mixtral/run.sh +++ b/tests/collections/llm/bitexact/mixtral/run.sh @@ -43,4 +43,4 @@ python3 /workspace/tests/collections/llm/bitexact/mixtral/pretrain_mini_mixtral. 
# Compare outputs python3 /workspace/tests/collections/llm/bitexact/mixtral/compare_ckpts.py \ - "$NEMO_OUTPUT_PATH/checkpoints/--None=0.0000-epoch=0/" "$MCORE_OUTPUT_PATH/iter_0000010/" + "$NEMO_OUTPUT_PATH/checkpoints/--None=0.0000-epoch=0/weights" "$MCORE_OUTPUT_PATH/iter_0000010/" diff --git a/tests/collections/llm/gpt/model/test_mistral.py b/tests/collections/llm/gpt/model/test_mistral.py index 365bb35b2725..025ea35dd6e9 100644 --- a/tests/collections/llm/gpt/model/test_mistral.py +++ b/tests/collections/llm/gpt/model/test_mistral.py @@ -1,6 +1,6 @@ import torch.nn.functional as F -from nemo.collections.llm.gpt.model.mistral import MistralConfig7B, MistralNeMo2407Config12B, MistralNeMo2407Config123B +from nemo.collections.llm.gpt.model.mistral import MistralConfig7B, MistralNeMoConfig12B, MistralNeMoConfig123B def test_mistral_config7b(): @@ -25,7 +25,7 @@ def test_mistral_config7b(): def test_mistral_nemo_config_12b(): - config = MistralNeMo2407Config12B() + config = MistralNeMoConfig12B() assert config.normalization == "RMSNorm" assert config.activation_func == F.silu assert config.position_embedding_type == "rope" @@ -49,7 +49,7 @@ def test_mistral_nemo_config_12b(): def test_mistral_nemo_config_123b(): - config = MistralNeMo2407Config123B() + config = MistralNeMoConfig123B() assert config.normalization == "RMSNorm" assert config.activation_func == F.silu assert config.position_embedding_type == "rope" diff --git a/tests/collections/llm/gpt_finetuning.py b/tests/collections/llm/gpt_finetuning.py index 9eca287669cd..7eaa7744729c 100644 --- a/tests/collections/llm/gpt_finetuning.py +++ b/tests/collections/llm/gpt_finetuning.py @@ -19,6 +19,7 @@ from nemo import lightning as nl from nemo.collections import llm +from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer ## NOTE: This script is present for github-actions testing only. 
@@ -43,6 +44,7 @@ def get_args(): parser.add_argument('--mbs', type=int, default=1, help="micro batch size") parser.add_argument('--tp_size', type=int, default=1, help="tensor parallel size") parser.add_argument('--pp_size', type=int, default=1, help="pipeline parallel size") + parser.add_argument('--packed', action='store_true', help="use packed sequence dataset") return parser.parse_args() @@ -97,7 +99,16 @@ def get_args(): else: peft = None - squad = llm.SquadDataModule(seq_length=2048, micro_batch_size=args.mbs, global_batch_size=8, num_workers=0) + packed_sequence_specs = ( + PackedSequenceSpecs(packed_sequence_size=2048, tokenizer_model_name="dummy_tokenizer") if args.packed else None + ) + dolly = llm.DollyDataModule( + seq_length=2048, + micro_batch_size=args.mbs, + global_batch_size=8, + num_workers=0, + packed_sequence_specs=packed_sequence_specs, + ) tokenizer = get_nmt_tokenizer(tokenizer_model=os.path.join(args.restore_path, "dummy_tokenizer.model")) llama3_8b = llm.LlamaModel(Llama3ConfigCI(), tokenizer=tokenizer) @@ -109,7 +120,7 @@ def get_args(): llm.finetune( model=llama3_8b, - data=squad, + data=dolly, trainer=trainer, peft=peft, log=logger, diff --git a/tests/collections/llm/megatron_mixtral_pretraining.py b/tests/collections/llm/megatron_mixtral_pretraining.py index 82188f75351e..b4c5b960e0a7 100644 --- a/tests/collections/llm/megatron_mixtral_pretraining.py +++ b/tests/collections/llm/megatron_mixtral_pretraining.py @@ -158,7 +158,7 @@ def main(args): ) # Confirm checkpoint directory structure - output_path = Path(args.experiment_dir) / "checkpoints/--None=0.0000-epoch=0/" + output_path = Path(args.experiment_dir) / "checkpoints/--None=0.0000-epoch=0/weights" assert output_path.exists(), f"Expected {output_path} to exist" assert output_path.is_dir(), f"Expected {output_path} to be a directory" output_files = ['__0_0.distcp', '__0_1.distcp', 'common.pt', 'metadata.json', '.metadata'] diff --git a/tests/collections/llm/megatron_t5_finetuning.py b/tests/collections/llm/megatron_t5_finetuning.py index 76a23d36975b..f54e858cfb43 100644 --- a/tests/collections/llm/megatron_t5_finetuning.py +++ b/tests/collections/llm/megatron_t5_finetuning.py @@ -21,6 +21,7 @@ def get_args(): parser = argparse.ArgumentParser(description='Train a small T5 model using NeMo 2.0') parser.add_argument('--devices', type=int, help="Number of devices to use for training") parser.add_argument('--max-steps', type=int, help="Number of steps to train for") + parser.add_argument('--peft', type=str, default='none', help="none | lora") parser.add_argument('--experiment-dir', type=str, help="directory to write results and checkpoints to") parser.add_argument('--experiment-name', type=str, help="name of experiment") parser.add_argument('--wandb-project', type=str, default=None, help="wandb project name") @@ -34,9 +35,12 @@ def get_args(): args = get_args() + special_tokens = {} + special_tokens['additional_special_tokens'] = [f'' for i in range(100)] tokenizer = get_nmt_tokenizer( "megatron", "BertWordPieceCase", + special_tokens=special_tokens, ) data = SquadDataModule( @@ -69,7 +73,6 @@ def get_args(): pipeline_model_parallel_size=1, pipeline_dtype=torch.float32, ckpt_load_optimizer=False, - # ckpt_load_optimizer=True, ) checkpoint_callback = ModelCheckpoint( every_n_train_steps=5000, @@ -93,6 +96,11 @@ def get_args(): config=opt_config, ) + if args.peft == 'lora': + peft = llm.peft.LoRA() + else: + peft = None + trainer = nl.Trainer( devices=args.devices, max_steps=args.max_steps, @@ -125,6 +133,7 @@ def 
@@ -125,6 +133,7 @@ def get_args():
         resume=resume,
         data=data,
         trainer=trainer,
+        peft=peft,
         log=nemo_logger,
         optim=opt,
     )
diff --git a/tests/collections/llm/megatron_t5_pretraining.py b/tests/collections/llm/megatron_t5_pretraining.py
index 5d8f55a7f26f..a5460be3d154 100644
--- a/tests/collections/llm/megatron_t5_pretraining.py
+++ b/tests/collections/llm/megatron_t5_pretraining.py
@@ -50,10 +50,13 @@ def get_args():
     args = get_args()

+    special_tokens = {}
+    special_tokens['additional_special_tokens'] = [f'<extra_id_{i}>' for i in range(100)]
     tokenizer = get_nmt_tokenizer(
         "megatron",
         "BertWordPieceCase",
         vocab_file=args.vocab_path,
+        special_tokens=special_tokens,
     )
     data = PreTrainingDataModule(
         paths=args.data_path,
diff --git a/tests/collections/llm/recipes/test_llama3_70b.py b/tests/collections/llm/recipes/test_llama3_70b.py
index a842975846dd..d47b674b7b70 100644
--- a/tests/collections/llm/recipes/test_llama3_70b.py
+++ b/tests/collections/llm/recipes/test_llama3_70b.py
@@ -31,7 +31,7 @@ def test_trainer(self, recipe_module):
         assert trainer_config.__fn_or_cls__ == Trainer
         assert trainer_config.accelerator == "gpu"
         assert trainer_config.devices == 8
-        assert trainer_config.num_nodes == 1
+        assert trainer_config.num_nodes == 4

         # Check strategy configuration
         assert isinstance(trainer_config.strategy, run.Config)
@@ -79,10 +79,8 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_
         assert recipe.trainer.num_nodes == num_nodes
         assert recipe.trainer.devices == num_gpus_per_node

-    def test_pretrain_recipe_performance(self, recipe_module):
-        recipe = recipe_module.pretrain_recipe_performance(
-            name="test_perf", dir="/tmp", num_nodes=1, num_gpus_per_node=8
-        )
+    def test_pretrain_performance_optimizations(self, recipe_module):
+        recipe = recipe_module.pretrain_recipe(performance_mode=True)
         assert any(
             isinstance(cb, run.Config) and cb.__fn_or_cls__ == MegatronCommOverlapCallback
             for cb in recipe.trainer.callbacks
diff --git a/tests/collections/llm/recipes/test_llama3_70b_16k.py b/tests/collections/llm/recipes/test_llama3_70b_16k.py
index 60940b062a87..17f0ec5ebd99 100644
--- a/tests/collections/llm/recipes/test_llama3_70b_16k.py
+++ b/tests/collections/llm/recipes/test_llama3_70b_16k.py
@@ -29,15 +29,15 @@ def test_trainer(self, recipe_module):
         assert trainer_config.__fn_or_cls__ == Trainer
         assert trainer_config.accelerator == "gpu"
         assert trainer_config.devices == 8
-        assert trainer_config.num_nodes == 2
+        assert trainer_config.num_nodes == 4

         # Check strategy configuration
         assert isinstance(trainer_config.strategy, run.Config)
         assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy"
-        assert trainer_config.strategy.tensor_model_parallel_size == 2
-        assert trainer_config.strategy.pipeline_model_parallel_size == 4
+        assert trainer_config.strategy.tensor_model_parallel_size == 8
+        assert trainer_config.strategy.pipeline_model_parallel_size == 2
         assert trainer_config.strategy.pipeline_dtype == torch.bfloat16
-        assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 5
+        assert trainer_config.strategy.virtual_pipeline_model_parallel_size is None
         assert trainer_config.strategy.context_parallel_size == 2
         assert trainer_config.strategy.sequence_parallel is True
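# A back-of-the-envelope check (illustrative only, not part of the diff) of the divisibility
# constraints asserted by the renamed test_valid_trainer_parallelism test in the hunk that
# follows, using the llama3_70b_16k values pinned in the hunk above.
tp, pp, cp, ep = 8, 2, 2, 1   # tensor / pipeline / context / expert model parallel sizes
devices, num_nodes = 8, 4     # GPUs per node and node count asserted for this recipe

model_parallel_ranks = tp * pp * cp * ep                    # 8 * 2 * 2 * 1 = 32 ranks
assert model_parallel_ranks % devices == 0                  # 32 % 8 == 0: fills whole 8-GPU nodes
assert (model_parallel_ranks / devices) % num_nodes == 0    # mirrors the node-count check: (32 / 8) % 4 == 0
# The test additionally requires pipeline_dtype to be set whenever pp > 1, and
# sequence_parallel to be disabled whenever tp == 1.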
@@ -61,14 +61,37 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_
         assert recipe.trainer.num_nodes == num_nodes
         assert recipe.trainer.devices == num_gpus_per_node

-    def test_trainer_parallelism_options(self, recipe_module):
+    def test_valid_trainer_parallelism(self, recipe_module):
         trainer_config = recipe_module.trainer()
-        assert trainer_config.strategy.tensor_model_parallel_size == 2
-        assert trainer_config.strategy.pipeline_model_parallel_size == 4
-        assert trainer_config.strategy.pipeline_dtype == torch.bfloat16
-        assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 5
-        assert trainer_config.strategy.context_parallel_size == 2
-        assert trainer_config.strategy.sequence_parallel is True
+
+        assert isinstance(trainer_config.strategy, run.Config)
+        assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy"
+
+        assert trainer_config.strategy.expert_model_parallel_size == 1
+
+        assert (
+            trainer_config.strategy.tensor_model_parallel_size
+            * trainer_config.strategy.pipeline_model_parallel_size
+            * trainer_config.strategy.context_parallel_size
+            * trainer_config.strategy.expert_model_parallel_size
+            % trainer_config.devices
+            == 0
+        )
+        assert (
+            trainer_config.strategy.tensor_model_parallel_size
+            * trainer_config.strategy.pipeline_model_parallel_size
+            * trainer_config.strategy.context_parallel_size
+            * trainer_config.strategy.expert_model_parallel_size
+            / trainer_config.devices
+            % trainer_config.num_nodes
+            == 0
+        )
+
+        if trainer_config.strategy.pipeline_model_parallel_size != 1:
+            assert trainer_config.strategy.pipeline_dtype is not None
+
+        if trainer_config.strategy.tensor_model_parallel_size == 1:
+            assert trainer_config.strategy.sequence_parallel is False

     def test_model_config_parameters(self, recipe_module):
         model_config = recipe_module.model()
diff --git a/tests/collections/llm/recipes/test_llama3_70b_64k.py b/tests/collections/llm/recipes/test_llama3_70b_64k.py
index 89813162fae1..e9f496dfdd2e 100644
--- a/tests/collections/llm/recipes/test_llama3_70b_64k.py
+++ b/tests/collections/llm/recipes/test_llama3_70b_64k.py
@@ -38,7 +38,7 @@ def test_trainer(self, recipe_module):
         assert trainer_config.strategy.tensor_model_parallel_size == 8
         assert trainer_config.strategy.pipeline_model_parallel_size == 4
         assert trainer_config.strategy.pipeline_dtype == torch.bfloat16
-        assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 5
+        assert trainer_config.strategy.virtual_pipeline_model_parallel_size is None
         assert trainer_config.strategy.context_parallel_size == 8
         assert trainer_config.strategy.sequence_parallel is True
@@ -67,14 +67,37 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_
         assert recipe.trainer.num_nodes == num_nodes
         assert recipe.trainer.devices == num_gpus_per_node

-    def test_trainer_parallelism_options(self, recipe_module):
+    def test_valid_trainer_parallelism(self, recipe_module):
         trainer_config = recipe_module.trainer()
-        assert trainer_config.strategy.tensor_model_parallel_size == 8
-        assert trainer_config.strategy.pipeline_model_parallel_size == 4
-        assert trainer_config.strategy.pipeline_dtype == torch.bfloat16
-        assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 5
-        assert trainer_config.strategy.context_parallel_size == 8
-        assert trainer_config.strategy.sequence_parallel is True
+
+        assert isinstance(trainer_config.strategy, run.Config)
+        assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy"
+
+        assert trainer_config.strategy.expert_model_parallel_size == 1
+
+        assert (
+            trainer_config.strategy.tensor_model_parallel_size
+            * trainer_config.strategy.pipeline_model_parallel_size
+            * trainer_config.strategy.context_parallel_size
+            * trainer_config.strategy.expert_model_parallel_size
+            % trainer_config.devices
+            == 0
+        )
+ assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + / trainer_config.devices + % trainer_config.num_nodes + == 0 + ) + + if trainer_config.strategy.pipeline_model_parallel_size != 1: + assert trainer_config.strategy.pipeline_dtype is not None + + if trainer_config.strategy.tensor_model_parallel_size == 1: + assert trainer_config.strategy.sequence_parallel is False def test_model_config_parameters(self, recipe_module): model_config = recipe_module.model() diff --git a/tests/collections/llm/recipes/test_llama3_8b.py b/tests/collections/llm/recipes/test_llama3_8b.py index df4f05eec2ae..88fab6d6325a 100644 --- a/tests/collections/llm/recipes/test_llama3_8b.py +++ b/tests/collections/llm/recipes/test_llama3_8b.py @@ -90,10 +90,8 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node - def test_pretrain_recipe_performance(self, recipe_module): - recipe = recipe_module.pretrain_recipe_performance( - name="test_perf", dir="/tmp", num_nodes=1, num_gpus_per_node=8 - ) + def test_pretrain_performance_optimizations(self, recipe_module): + recipe = recipe_module.pretrain_recipe(performance_mode=True) assert any(cb.__fn_or_cls__.__name__ == "MegatronCommOverlapCallback" for cb in recipe.trainer.callbacks) def test_trainer_parallelism_options(self, recipe_module): diff --git a/tests/collections/llm/recipes/test_llama3_8b_16k.py b/tests/collections/llm/recipes/test_llama3_8b_16k.py index d7f3bd40ecb7..fe75f01236ab 100644 --- a/tests/collections/llm/recipes/test_llama3_8b_16k.py +++ b/tests/collections/llm/recipes/test_llama3_8b_16k.py @@ -29,15 +29,15 @@ def test_trainer(self, recipe_module): assert trainer_config.__fn_or_cls__ == Trainer assert trainer_config.accelerator == "gpu" assert trainer_config.devices == 8 - assert trainer_config.num_nodes == 1 + assert trainer_config.num_nodes == 2 # Check strategy configuration assert isinstance(trainer_config.strategy, run.Config) assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" - assert trainer_config.strategy.tensor_model_parallel_size == 2 - assert trainer_config.strategy.pipeline_model_parallel_size == 4 + assert trainer_config.strategy.tensor_model_parallel_size == 4 + assert trainer_config.strategy.pipeline_model_parallel_size == 2 assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 5 + assert trainer_config.strategy.virtual_pipeline_model_parallel_size is None assert trainer_config.strategy.context_parallel_size == 2 assert trainer_config.strategy.sequence_parallel is True @@ -61,14 +61,37 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node - def test_trainer_parallelism_options(self, recipe_module): + def test_valid_trainer_parallelism(self, recipe_module): trainer_config = recipe_module.trainer() - assert trainer_config.strategy.tensor_model_parallel_size == 2 - assert trainer_config.strategy.pipeline_model_parallel_size == 4 - assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 5 - assert trainer_config.strategy.context_parallel_size == 
2 - assert trainer_config.strategy.sequence_parallel is True + + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + + assert trainer_config.strategy.expert_model_parallel_size == 1 + + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + % trainer_config.devices + == 0 + ) + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + / trainer_config.devices + % trainer_config.num_nodes + == 0 + ) + + if trainer_config.strategy.pipeline_model_parallel_size != 1: + assert trainer_config.strategy.pipeline_dtype is not None + + if trainer_config.strategy.tensor_model_parallel_size == 1: + assert trainer_config.strategy.sequence_parallel is False def test_model_config_parameters(self, recipe_module): model_config = recipe_module.model() diff --git a/tests/collections/llm/recipes/test_llama3_8b_64k.py b/tests/collections/llm/recipes/test_llama3_8b_64k.py index f489e12dc55f..0316b736341a 100644 --- a/tests/collections/llm/recipes/test_llama3_8b_64k.py +++ b/tests/collections/llm/recipes/test_llama3_8b_64k.py @@ -29,15 +29,15 @@ def test_trainer(self, recipe_module): assert trainer_config.__fn_or_cls__ == Trainer assert trainer_config.accelerator == "gpu" assert trainer_config.devices == 8 - assert trainer_config.num_nodes == 1 + assert trainer_config.num_nodes == 4 # Check strategy configuration assert isinstance(trainer_config.strategy, run.Config) assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" - assert trainer_config.strategy.tensor_model_parallel_size == 2 - assert trainer_config.strategy.pipeline_model_parallel_size == 4 + assert trainer_config.strategy.tensor_model_parallel_size == 4 + assert trainer_config.strategy.pipeline_model_parallel_size == 2 assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 5 + assert trainer_config.strategy.virtual_pipeline_model_parallel_size is None assert trainer_config.strategy.context_parallel_size == 4 assert trainer_config.strategy.sequence_parallel is True @@ -61,14 +61,37 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node - def test_trainer_parallelism_options(self, recipe_module): + def test_valid_trainer_parallelism(self, recipe_module): trainer_config = recipe_module.trainer() - assert trainer_config.strategy.tensor_model_parallel_size == 2 - assert trainer_config.strategy.pipeline_model_parallel_size == 4 - assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 5 - assert trainer_config.strategy.context_parallel_size == 4 - assert trainer_config.strategy.sequence_parallel is True + + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + + assert trainer_config.strategy.expert_model_parallel_size == 1 + + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * 
trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + % trainer_config.devices + == 0 + ) + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + / trainer_config.devices + % trainer_config.num_nodes + == 0 + ) + + if trainer_config.strategy.pipeline_model_parallel_size != 1: + assert trainer_config.strategy.pipeline_dtype is not None + + if trainer_config.strategy.tensor_model_parallel_size == 1: + assert trainer_config.strategy.sequence_parallel is False def test_model_config_parameters(self, recipe_module): model_config = recipe_module.model() diff --git a/tests/collections/llm/recipes/test_mistral.py b/tests/collections/llm/recipes/test_mistral.py index 490f26a363fc..a7d83edcc370 100644 --- a/tests/collections/llm/recipes/test_mistral.py +++ b/tests/collections/llm/recipes/test_mistral.py @@ -6,7 +6,7 @@ from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.gpt.model.mistral import MistralConfig7B, MistralModel from nemo.collections.llm.peft.lora import LoRA -from nemo.collections.llm.recipes import mistral +from nemo.collections.llm.recipes import mistral_7b as mistral from nemo.lightning import AutoResume, Trainer diff --git a/tests/collections/llm/recipes/test_mixtral_8x7b_16k.py b/tests/collections/llm/recipes/test_mixtral_8x7b_16k.py index 9f52b7117e82..62d6e0e31917 100644 --- a/tests/collections/llm/recipes/test_mixtral_8x7b_16k.py +++ b/tests/collections/llm/recipes/test_mixtral_8x7b_16k.py @@ -31,15 +31,15 @@ def test_trainer(self, recipe_module): assert trainer_config.__fn_or_cls__ == Trainer assert trainer_config.accelerator == "gpu" assert trainer_config.devices == 8 - assert trainer_config.num_nodes == 2 + assert trainer_config.num_nodes == 4 # Check strategy configuration assert isinstance(trainer_config.strategy, run.Config) assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" - assert trainer_config.strategy.tensor_model_parallel_size == 2 - assert trainer_config.strategy.pipeline_model_parallel_size == 4 + assert trainer_config.strategy.tensor_model_parallel_size == 4 + assert trainer_config.strategy.pipeline_model_parallel_size == 2 assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 8 + assert trainer_config.strategy.virtual_pipeline_model_parallel_size is None assert trainer_config.strategy.context_parallel_size == 4 assert trainer_config.strategy.sequence_parallel is True assert trainer_config.strategy.expert_model_parallel_size == 1 @@ -69,15 +69,35 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node - def test_trainer_parallelism_options(self, recipe_module): + def test_valid_trainer_parallelism(self, recipe_module): trainer_config = recipe_module.trainer() - assert trainer_config.strategy.tensor_model_parallel_size == 2 - assert trainer_config.strategy.pipeline_model_parallel_size == 4 - assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 8 - assert trainer_config.strategy.context_parallel_size == 4 - assert trainer_config.strategy.sequence_parallel is True - assert 
trainer_config.strategy.expert_model_parallel_size == 1 + + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + % trainer_config.devices + == 0 + ) + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + / trainer_config.devices + % trainer_config.num_nodes + == 0 + ) + + if trainer_config.strategy.pipeline_model_parallel_size != 1: + assert trainer_config.strategy.pipeline_dtype is not None + + if trainer_config.strategy.tensor_model_parallel_size == 1: + assert trainer_config.strategy.sequence_parallel is False def test_model_config_parameters(self, recipe_module): model_config = recipe_module.model() diff --git a/tests/collections/llm/recipes/test_mixtral_8x7b_64k.py b/tests/collections/llm/recipes/test_mixtral_8x7b_64k.py index f508e6dfd585..9ff93a89f438 100644 --- a/tests/collections/llm/recipes/test_mixtral_8x7b_64k.py +++ b/tests/collections/llm/recipes/test_mixtral_8x7b_64k.py @@ -35,11 +35,11 @@ def test_trainer(self, recipe_module): # Check strategy configuration assert isinstance(trainer_config.strategy, run.Config) assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" - assert trainer_config.strategy.tensor_model_parallel_size == 4 + assert trainer_config.strategy.tensor_model_parallel_size == 8 assert trainer_config.strategy.pipeline_model_parallel_size == 4 assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 4 - assert trainer_config.strategy.context_parallel_size == 8 + assert trainer_config.strategy.virtual_pipeline_model_parallel_size is None + assert trainer_config.strategy.context_parallel_size == 4 assert trainer_config.strategy.sequence_parallel is True assert trainer_config.strategy.expert_model_parallel_size == 1 @@ -63,15 +63,35 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node - def test_trainer_parallelism_options(self, recipe_module): + def test_valid_trainer_parallelism(self, recipe_module): trainer_config = recipe_module.trainer() - assert trainer_config.strategy.tensor_model_parallel_size == 4 - assert trainer_config.strategy.pipeline_model_parallel_size == 4 - assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 4 - assert trainer_config.strategy.context_parallel_size == 8 - assert trainer_config.strategy.sequence_parallel is True - assert trainer_config.strategy.expert_model_parallel_size == 1 + + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + % trainer_config.devices + == 0 + ) + assert ( + trainer_config.strategy.tensor_model_parallel_size + * 
trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + / trainer_config.devices + % trainer_config.num_nodes + == 0 + ) + + if trainer_config.strategy.pipeline_model_parallel_size != 1: + assert trainer_config.strategy.pipeline_dtype is not None + + if trainer_config.strategy.tensor_model_parallel_size == 1: + assert trainer_config.strategy.sequence_parallel is False def test_model_config_parameters(self, recipe_module): model_config = recipe_module.model() diff --git a/tests/collections/llm/recipes/test_nemotron4_15b_16k.py b/tests/collections/llm/recipes/test_nemotron4_15b_16k.py index e0b4e1f56eb8..6c1f5d90e160 100644 --- a/tests/collections/llm/recipes/test_nemotron4_15b_16k.py +++ b/tests/collections/llm/recipes/test_nemotron4_15b_16k.py @@ -47,3 +47,35 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ recipe = recipe_module.pretrain_recipe(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node + + def test_valid_trainer_parallelism(self, recipe_module): + trainer_config = recipe_module.pretrain_recipe().trainer + + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + + assert trainer_config.strategy.expert_model_parallel_size == 1 + + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + % trainer_config.devices + == 0 + ) + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + / trainer_config.devices + % trainer_config.num_nodes + == 0 + ) + + if trainer_config.strategy.pipeline_model_parallel_size != 1: + assert trainer_config.strategy.pipeline_dtype is not None + + if trainer_config.strategy.tensor_model_parallel_size == 1: + assert trainer_config.strategy.sequence_parallel is False diff --git a/tests/collections/llm/recipes/test_nemotron4_15b_64k.py b/tests/collections/llm/recipes/test_nemotron4_15b_64k.py index 9525039eb90e..8ed35fb81893 100644 --- a/tests/collections/llm/recipes/test_nemotron4_15b_64k.py +++ b/tests/collections/llm/recipes/test_nemotron4_15b_64k.py @@ -47,3 +47,35 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ recipe = recipe_module.pretrain_recipe(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node + + def test_valid_trainer_parallelism(self, recipe_module): + trainer_config = recipe_module.pretrain_recipe().trainer + + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + + assert trainer_config.strategy.expert_model_parallel_size == 1 + + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + % trainer_config.devices + == 0 + ) + assert ( + trainer_config.strategy.tensor_model_parallel_size + * 
trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + / trainer_config.devices + % trainer_config.num_nodes + == 0 + ) + + if trainer_config.strategy.pipeline_model_parallel_size != 1: + assert trainer_config.strategy.pipeline_dtype is not None + + if trainer_config.strategy.tensor_model_parallel_size == 1: + assert trainer_config.strategy.sequence_parallel is False diff --git a/tests/collections/llm/recipes/test_nemotron4_22b_16k.py b/tests/collections/llm/recipes/test_nemotron4_22b_16k.py index 1e501b447d45..6b4a581348e0 100644 --- a/tests/collections/llm/recipes/test_nemotron4_22b_16k.py +++ b/tests/collections/llm/recipes/test_nemotron4_22b_16k.py @@ -47,3 +47,35 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ recipe = recipe_module.pretrain_recipe(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node + + def test_valid_trainer_parallelism(self, recipe_module): + trainer_config = recipe_module.pretrain_recipe().trainer + + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + + assert trainer_config.strategy.expert_model_parallel_size == 1 + + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + % trainer_config.devices + == 0 + ) + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + / trainer_config.devices + % trainer_config.num_nodes + == 0 + ) + + if trainer_config.strategy.pipeline_model_parallel_size != 1: + assert trainer_config.strategy.pipeline_dtype is not None + + if trainer_config.strategy.tensor_model_parallel_size == 1: + assert trainer_config.strategy.sequence_parallel is False diff --git a/tests/collections/llm/recipes/test_nemotron4_22b_64k.py b/tests/collections/llm/recipes/test_nemotron4_22b_64k.py index c37a45793aff..68a238a93338 100644 --- a/tests/collections/llm/recipes/test_nemotron4_22b_64k.py +++ b/tests/collections/llm/recipes/test_nemotron4_22b_64k.py @@ -47,3 +47,35 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ recipe = recipe_module.pretrain_recipe(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node + + def test_valid_trainer_parallelism(self, recipe_module): + trainer_config = recipe_module.pretrain_recipe().trainer + + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + + assert trainer_config.strategy.expert_model_parallel_size == 1 + + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + % trainer_config.devices + == 0 + ) + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * 
trainer_config.strategy.expert_model_parallel_size + / trainer_config.devices + % trainer_config.num_nodes + == 0 + ) + + if trainer_config.strategy.pipeline_model_parallel_size != 1: + assert trainer_config.strategy.pipeline_dtype is not None + + if trainer_config.strategy.tensor_model_parallel_size == 1: + assert trainer_config.strategy.sequence_parallel is False diff --git a/tests/collections/llm/test_mnist_model_nemo2.py b/tests/collections/llm/test_mnist_model_nemo2.py index 3f0b804e8bd6..a5c2aa96fc03 100644 --- a/tests/collections/llm/test_mnist_model_nemo2.py +++ b/tests/collections/llm/test_mnist_model_nemo2.py @@ -501,6 +501,7 @@ def run_train_mnist_litautoencoder_with_megatron_strategy_single_gpu(): monitor="val_loss", save_top_k=1, every_n_train_steps=5, + filename="{model_name}--{val_loss:.2f}-{step}-{consumed_samples}", # Enables the .nemo file-like checkpointing where all IOMixins are under SerDe always_save_context=True, ) diff --git a/tests/core/test_exp_manager.py b/tests/core/test_exp_manager.py index a0b69927ecc0..d4b1d37c1938 100644 --- a/tests/core/test_exp_manager.py +++ b/tests/core/test_exp_manager.py @@ -29,6 +29,7 @@ from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy from nemo.constants import NEMO_ENV_VARNAME_VERSION from nemo.core.classes import ModelPT +from nemo.utils.app_state import AppState from nemo.utils.callbacks import NeMoModelCheckpoint from nemo.utils.exp_manager import ( CheckpointMisconfigurationError, @@ -1097,3 +1098,74 @@ def test_doesnt_silently_start_from_scratch_dist(self, tmp_path): restored_trainer, {"resume_if_exists": True, "resume_ignore_no_checkpoint": True, "explicit_log_dir": str(test_dir)}, ) + + @pytest.mark.unit + def test_save_nemo_not_comp_with_model_parallel(self, tmp_path): + """ + Ensure that always_save_nemo is not compatible with model parallelism. 
+ """ + + test_dir = tmp_path / "test" + + with pytest.raises(LoggerMisconfigurationError): + appstate = AppState() + appstate.tensor_model_parallel_size = 2 + appstate.pipeline_model_parallel_size = 1 + appstate.context_parallel_size = 1 + test_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False, max_epochs=1) + exp_manager( + test_trainer, + { + "checkpoint_callback_params": { + "always_save_nemo": True, + }, + "explicit_log_dir": str(test_dir), + }, + ) + + with pytest.raises(LoggerMisconfigurationError): + appstate = AppState() + appstate.tensor_model_parallel_size = 1 + appstate.pipeline_model_parallel_size = 2 + appstate.context_parallel_size = 1 + test_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False, max_epochs=1) + exp_manager( + test_trainer, + { + "checkpoint_callback_params": { + "always_save_nemo": True, + }, + "explicit_log_dir": str(test_dir), + }, + ) + + with pytest.raises(LoggerMisconfigurationError): + appstate = AppState() + appstate.tensor_model_parallel_size = 1 + appstate.pipeline_model_parallel_size = 1 + appstate.context_parallel_size = 2 + test_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False, max_epochs=1) + exp_manager( + test_trainer, + { + "checkpoint_callback_params": { + "always_save_nemo": True, + }, + "explicit_log_dir": str(test_dir), + }, + ) + + appstate = AppState() + appstate.tensor_model_parallesl_size = 1 + appstate.pipeline_model_parallel_size = 1 + appstate.context_parallel_size = 1 + test_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False, max_epochs=1) + exp_manager( + test_trainer, + { + "checkpoint_callback_params": { + "always_save_nemo": True, + }, + "explicit_log_dir": str(test_dir), + }, + ) diff --git a/tests/lightning/_io/test_api.py b/tests/lightning/_io/test_api.py index 386bd5b5fdab..a4d458cef17b 100644 --- a/tests/lightning/_io/test_api.py +++ b/tests/lightning/_io/test_api.py @@ -16,6 +16,7 @@ from functools import partial from pathlib import Path +import fiddle as fdl import pytest import yaml from pytorch_lightning.loggers import TensorBoardLogger @@ -69,9 +70,9 @@ def test_reload_ckpt(self, tmpdir, partial_function_with_pos_and_key_args): loaded_func = loaded.extra["dummy"] assert loaded_func(b=2) == partial_function_with_pos_and_key_args(b=2) - model_yaml = Path(tmpdir) / "model.yaml" - assert model_yaml.exists() - - observed = yaml.safe_load(model_yaml.read_text()) - expected = yaml.safe_load((Path(ARTIFACTS_DIR) / "model.yaml").read_text()) - assert observed.keys() == expected.keys() + config = io.load_context(tmpdir, build=False) + assert isinstance(config, fdl.Config) + assert config.model.config.seq_length == ckpt.model.config.seq_length + assert config.model.tokenizer.vocab_file.startswith(str(tmpdir)) + assert config.model.tokenizer.merges_file.startswith(str(tmpdir)) + assert config.extra["dummy"] == fdl.Partial(dummy_extra, 10, c=15) diff --git a/tests/lightning/pytorch/callbacks/test_peft.py b/tests/lightning/pytorch/callbacks/test_peft.py index 53f9016a3bac..95caca4d2784 100644 --- a/tests/lightning/pytorch/callbacks/test_peft.py +++ b/tests/lightning/pytorch/callbacks/test_peft.py @@ -18,6 +18,7 @@ from pytorch_lightning.trainer.states import TrainerFn from nemo.collections.llm import fn from nemo.lightning.pytorch.callbacks.peft import PEFT, WrappedAdapterIO +from nemo.utils.callbacks.dist_ckpt_io import AsyncFinalizableCheckpointIO class TestPEFT: @@ -48,7 +49,8 @@ def test_peft_setup(self): 
pl_module.model_transform = peft peft.setup(trainer, pl_module, "fit") - assert isinstance(trainer.strategy._checkpoint_io, WrappedAdapterIO) + assert isinstance(trainer.strategy._checkpoint_io, AsyncFinalizableCheckpointIO) + assert isinstance(trainer.strategy._checkpoint_io._checkpoint_io, WrappedAdapterIO) assert peft.model_transform is not None assert peft._needs_to_call is True diff --git a/tests/lightning/test_dist_ckpt.py b/tests/lightning/test_dist_ckpt.py index d5037f0aa573..886b1085ed55 100644 --- a/tests/lightning/test_dist_ckpt.py +++ b/tests/lightning/test_dist_ckpt.py @@ -34,6 +34,7 @@ def set_env(): def _get_strategy(): strategy = nl.MegatronStrategy( enable_nemo_ckpt_io=False, + ckpt_async_save=False, ) return strategy diff --git a/tests/lightning/test_nemo_run.py b/tests/lightning/test_nemo_run.py index 8d7814bfe530..934eaa853bf0 100644 --- a/tests/lightning/test_nemo_run.py +++ b/tests/lightning/test_nemo_run.py @@ -17,8 +17,8 @@ ("llama3_70b_16k", "pretrain_recipe", "llama3_70b_16k_pretrain"), ("llama3_70b_64k", "pretrain_recipe", "llama3_70b_64k_pretrain"), ("llama31_405b", "pretrain_recipe", "llama31_405b_pretrain"), - ("mistral", "pretrain_recipe", "mistral_pretrain"), - ("mistral", "finetune_recipe", "mistral_finetune"), + ("mistral_7b", "pretrain_recipe", "mistral_pretrain"), + ("mistral_7b", "finetune_recipe", "mistral_finetune"), ("mixtral_8x7b", "pretrain_recipe", "mixtral_8x7b_pretrain"), ("mixtral_8x7b", "finetune_recipe", "mixtral_8x7b_finetune"), ("mixtral_8x7b_16k", "pretrain_recipe", "mixtral_8x7b_16k_pretrain"), @@ -36,6 +36,7 @@ ("nemotron4_22b_64k", "pretrain_recipe", "nemotron4_22b_64k_pretrain"), ("nemotron4_340b", "pretrain_recipe", "nemotron4_340b_pretrain"), ("nemotron4_340b", "finetune_recipe", "nemotron4_340b_finetune"), + ("gpt3_175b", "pretrain_recipe", "gpt3_175b_pretrain"), ], ) def test_recipes_with_nemo_run(module, recipe, name, tmpdir, monkeypatch): diff --git a/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb b/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb deleted file mode 100644 index 608685254a0d..000000000000 --- a/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb +++ /dev/null @@ -1,827 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "\"\"\"\n", - "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", - "\n", - "Instructions for setting up Colab are as follows:\n", - "1. Open a new Python 3 notebook.\n", - "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", - "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", - "4. 
Run this cell to set up dependencies.\n", - "\"\"\"\n", - "# If you're using Google Colab and not running locally, run this cell\n", - "\n", - "# install NeMo\n", - "BRANCH = 'main'\n", - "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[nlp]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "from nemo.collections import nlp as nemo_nlp\n", - "from nemo.utils.exp_manager import exp_manager\n", - "from nemo.utils import logging\n", - "\n", - "import os\n", - "import wget\n", - "import torch\n", - "import pytorch_lightning as pl\n", - "from omegaconf import OmegaConf" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Task Description\n", - "**Joint Intent and Slot classification** - is a task of classifying an Intent and detecting all relevant Slots (Entities)\n", - "for this Intent in a query.\n", - "For example, in the query: `What is the weather in Santa Clara tomorrow morning?`, we would like to classify the query\n", - "as a `weather` Intent, and detect `Santa Clara` as a `location` slot and `tomorrow morning` as a `date_time` slot.\n", - "Intents and Slots names are usually task specific and defined as labels in the training data.\n", - "This is a fundamental step that is executed in any task-driven Conversational Assistant.\n", - "\n", - "Our Bert based model implementation enables to train and then detect both of these tasks together.\n", - "\n", - "**Multi Label Joint Intent and Slot classification** - is very similar to the task above, but instead of only classifying a single Intent, the task can predict multiple different intents for each query. For example, for the query `Yes, please tell me the weather`, we might want the intents for this utterance to be `yes` and `weather`. You can skip to that tutorial [here](#multi-label)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Dataset and NeMo data format\n", - "\n", - "In this tutorial we are going to use a virtual assistant interaction data set that can be downloaded from here: https://github.com/xliuhw/NLU-Evaluation-Data.\n", - "There are about 10K training and 1K testing queries which cover 64 various Intents and 55 Slots. \n", - "\n", - "To work with NeMo NLP classification model, this dataset should be first converted to the NeMo format, which requires next files:\n", - "- **dict.intents.csv** - list of all intent names in the data. One line per an intent name.\n", - "- **dict.slots.csv** - list of all slot names in the data. One line per a slot name. It is possible to use both: B- I- notations, for separating between first and intermediate tokens for multi token slots. Or just use one slot type for each token of multi token slot. Our recommendation is to use later one, since it is simpler and there is no visible degradation in performance.\n", - "- **train.tsv/test.tsv** - contain original queries, one per line, and intent number separated by tab. For example: `what alarms do i have set right now\t0`. Intent numbers are according to the intent line in the intent dictionary file (dict.intents.csv) starting from 0. First line of these files contains a header line: `sentence \\tab label`.\n", - "- **train_slot.tvs/test_slot.tsv** - contain one line per a query, where instead each token there is a number of the token from the slots dictionary file (dict.slots.csv), starting from 0. 
Last 'out-of scope' token is usually located in the last line of the dictionary. Example: `54 0 0 54 54 12 12` (numbers separated by space). No header line in these files.\n", - "\n", - "NeMo provides **import_dataset.py** converter for few reference datasets (Assistant / Atis / Snips) which converts them to the NeMo data format for the Intent and Slot classification model. If you have your own annotated dataset in a different format, you will need to write a data converter. Possible recommended format for your own annotation, is to have one text file per all examples of one intent. With one line per query in a form like: `did i set an alarm to [alarm_type : wake up] in the [timeofday : morning]`, using brackets to define slot names. This is very similar to the assistant format from this example and you can use its converter to NeMo format with small changes. \n", - "\n", - "You can run this utility as follows:\n", - "\n", - "**python examples/nlp/intent_slot_classification/data/import_datasets.py --dataset_name=assistant --source_data_dir=source_dir_name --target_data_dir=target_dir_name**\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Download, preprocess and explore the dataset\n", - "## Download the dataset and convert it to the NeMo format" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# you can replace DATA_DIR and NEMO_DIR with your own locations\n", - "DATA_DIR = \".\"\n", - "NEMO_DIR = '.'\n", - "\n", - "# download the converter files from github for the purpose of this tutorial\n", - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/scripts/dataset_processing/nlp/intent_and_slot/import_datasets.py', NEMO_DIR)\n", - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/scripts/dataset_processing/nlp/intent_and_slot/assistant_utils.py', NEMO_DIR)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "# download and unzip the example dataset from github\n", - "print('Downloading dataset...')\n", - "wget.download('https://github.com/xliuhw/NLU-Evaluation-Data/archive/master.zip', DATA_DIR)\n", - "! unzip {DATA_DIR}/NLU-Evaluation-Data-master.zip -d {DATA_DIR}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# convert the dataset to the NeMo format\n", - "!python {NEMO_DIR}/import_datasets.py --dataset_name=assistant --source_data_dir={DATA_DIR}/NLU-Evaluation-Data-master --target_data_dir={DATA_DIR}/nemo_format\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data exploration\n", - "You can see the dataset in both the original and NeMo's formats. We have here 65 different Intents and 55 Slots, which could be typical commands for virtual assistants. Out of scope slot has the name 'O' and is the last in the dictionary of Slots. And we can see examples of queries and also format of training intent and slot files. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# list of queries divided by intent files in the original training dataset\n", - "! 
ls -l {DATA_DIR}/NLU-Evaluation-Data-master/dataset/trainset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# print all intents from the NeMo format intent dictionary\n", - "!echo 'Intents: ' $(wc -l < {DATA_DIR}/nemo_format/dict.intents.csv)\n", - "! cat {DATA_DIR}/nemo_format/dict.intents.csv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# print all slots from the NeMo format slot dictionary\n", - "!echo 'Slots: ' $(wc -l < {DATA_DIR}/nemo_format/dict.slots.csv)\n", - "! cat {DATA_DIR}/nemo_format/dict.slots.csv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# examples from the intent training file\n", - "! head -n 10 {DATA_DIR}/nemo_format/train.tsv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# examples from the slot training file\n", - "! head -n 10 {DATA_DIR}/nemo_format/train_slots.tsv" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Training model" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Model configuration\n", - "\n", - "Our Joint Intent and Slot classification model is comprised of the pretrained [BERT](https://arxiv.org/pdf/1810.04805.pdf) model with an Intent and Slot Classification layer on top of it.\n", - "\n", - "All model and training parameters are defined in the **intent_slot_classification_config.yaml** config file. This file is located in the folder **examples/nlp/intent_slot_classification/conf/**. It contains 2 main sections:\n", - "- **model**: All arguments that are related to the Model - language model, token classifier, optimizer and schedulers, datasets and any other related information\n", - "\n", - "- **trainer**: Any argument to be passed to PyTorch Lightning\n", - "\n", - "We will download the config file from repository for the purpose of the tutorial. If you have a version of NeMo installed locally, you can use it from the above folder." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# download the model config file from repository for the purpose of this example\n", - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/intent_slot_classification/conf/intent_slot_classification_config.yaml', NEMO_DIR)\n", - "\n", - "# print content of the config file\n", - "config_file = \"intent_slot_classification_config.yaml\"\n", - "print(config_file)\n", - "config = OmegaConf.load(config_file)\n", - "print(OmegaConf.to_yaml(config))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setting up Data within the config\n", - "\n", - "Among other things, the config file contains dictionaries called train_ds and validation_ds. These are configurations used to setup the Dataset and DataLoaders of the corresponding config.\n", - "\n", - "The converter utility creates both training and evaluation files in the same directory, so we need to specify `model.data_dir` parameter to this directory. Also notice that some config lines, including `model.data_dir`, have `???` in place of paths, this means that values for these fields are required to be specified by the user.\n", - "\n", - "`config.model.intent_loss_weight` parameter - is a balance of training loss between Intent and Slot losses, a number between 0 to 1. 
Its default value is 0.6 which gives slightly higher priority to the Intent loss and it empirically works quite well. You can experiment with this value if you like.\n", - "Also you can try to change `config.model.class_balancing` parameter to `weighted_loss` and see if you get better accuracy.\n", - "\n", - "Let's now add the data directory path to the config." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "config.model.data_dir = f'{DATA_DIR}/nemo_format'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Building the PyTorch Lightning Trainer\n", - "\n", - "NeMo models are primarily PyTorch Lightning modules - and therefore are entirely compatible with the PyTorch Lightning ecosystem. `config.trainer.max_epochs` - param defines number of training epochs. Usually 50-100 epochs or less should be enough to train on your data. Let's instantiate the Trainer object." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# lets modify some trainer configs\n", - "# checks if we have GPU available and uses it\n", - "accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'\n", - "config.trainer.devices = 1\n", - "config.trainer.accelerator = accelerator\n", - "\n", - "config.trainer.precision = 16 if torch.cuda.is_available() else 32\n", - "\n", - "# for mixed precision training, uncomment the line below (precision should be set to 16 and amp_level to O1):\n", - "# config.trainer.amp_level = O1\n", - "\n", - "# remove distributed training flags\n", - "config.trainer.strategy = 'auto'\n", - "\n", - "# setup a small number of epochs for demonstration purposes of this tutorial\n", - "config.trainer.max_epochs = 5\n", - "\n", - "trainer = pl.Trainer(**config.trainer)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setting up a NeMo Experiment\n", - "\n", - "NeMo has an experiment manager that handles logging and checkpointing for us, so let's use it. Model check points during training will be saved in this directory. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "exp_dir = exp_manager(trainer, config.get(\"exp_manager\", None))\n", - "# the exp_dir provides a path to the current experiment for easy access\n", - "print(str(exp_dir))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Initializing the model and Training\n", - "\n", - "Initial statistics of the dataset will be displayed at the beginning of the training and then Intent and Slot classification report will be displayed after each training epoch." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# initialize the model\n", - "model = nemo_nlp.models.IntentSlotClassificationModel(config.model, trainer=trainer)\n", - "\n", - "# train\n", - "trainer.fit(model)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "After training for 5 epochs, which should take no more than few minutes, you can expect training precision for this data set to be around these numbers (the accuracy will gradually continue to improve for this dataset up to about 50 epochs of training): \n", - "```\n", - "Intents:\n", - " label precision recall f1 support \n", - " alarm_query (label_id: 0) 94.74 94.74 94.74 19\n", - " alarm_remove (label_id: 1) 100.00 100.00 100.00 11\n", - " alarm_set (label_id: 2) 85.71 94.74 90.00 19\n", - " audio_volume_down (label_id: 3) 0.00 0.00 0.00 8\n", - " audio_volume_mute (label_id: 4) 100.00 86.67 92.86 15\n", - " audio_volume_up (label_id: 5) 56.52 100.00 72.22 13\n", - " calendar_query (label_id: 6) 55.00 57.89 56.41 19\n", - " calendar_remove (label_id: 7) 88.89 84.21 86.49 19\n", - " calendar_set (label_id: 8) 81.25 68.42 74.29 19\n", - " cooking_recipe (label_id: 9) 86.36 100.00 92.68 19\n", - " datetime_convert (label_id: 10) 0.00 0.00 0.00 8\n", - " datetime_query (label_id: 11) 65.52 100.00 79.17 19\n", - " email_addcontact (label_id: 12) 100.00 12.50 22.22 8\n", - " email_query (label_id: 13) 83.33 78.95 81.08 19\n", - " email_querycontact (label_id: 14) 62.50 78.95 69.77 19\n", - " email_sendemail (label_id: 15) 70.83 89.47 79.07 19\n", - " general_affirm (label_id: 16) 95.00 100.00 97.44 19\n", - " general_commandstop (label_id: 17) 100.00 100.00 100.00 19\n", - " general_confirm (label_id: 18) 100.00 100.00 100.00 19\n", - " general_dontcare (label_id: 19) 100.00 100.00 100.00 19\n", - " general_explain (label_id: 20) 100.00 94.74 97.30 19\n", - " general_joke (label_id: 21) 100.00 100.00 100.00 12\n", - " general_negate (label_id: 22) 95.00 100.00 97.44 19\n", - " general_praise (label_id: 23) 100.00 94.74 97.30 19\n", - " general_quirky (label_id: 24) 40.00 10.53 16.67 19\n", - " general_repeat (label_id: 25) 100.00 100.00 100.00 19\n", - " iot_cleaning (label_id: 26) 84.21 100.00 91.43 16\n", - " iot_coffee (label_id: 27) 94.74 94.74 94.74 19\n", - " iot_hue_lightchange (label_id: 28) 94.44 89.47 91.89 19\n", - " iot_hue_lightdim (label_id: 29) 100.00 83.33 90.91 12\n", - " iot_hue_lightoff (label_id: 30) 89.47 89.47 89.47 19\n", - " iot_hue_lighton (label_id: 31) 0.00 0.00 0.00 3\n", - " iot_hue_lightup (label_id: 32) 81.25 92.86 86.67 14\n", - " iot_wemo_off (label_id: 33) 60.00 100.00 75.00 9\n", - " iot_wemo_on (label_id: 34) 100.00 14.29 25.00 7\n", - " lists_createoradd (label_id: 35) 78.95 78.95 78.95 19\n", - " lists_query (label_id: 36) 78.95 78.95 78.95 19\n", - " lists_remove (label_id: 37) 90.00 94.74 92.31 19\n", - " music_likeness (label_id: 38) 70.59 66.67 68.57 18\n", - " music_query (label_id: 39) 77.78 73.68 75.68 19\n", - " music_settings (label_id: 40) 0.00 0.00 0.00 7\n", - " news_query (label_id: 41) 77.78 73.68 75.68 19\n", - " play_audiobook (label_id: 42) 90.00 94.74 92.31 19\n", - " play_game (label_id: 43) 80.00 84.21 82.05 19\n", - " play_music (label_id: 44) 53.85 73.68 62.22 19\n", - " play_podcasts (label_id: 45) 89.47 89.47 89.47 19\n", - " play_radio (label_id: 46) 93.75 78.95 85.71 19\n", - " qa_currency (label_id: 47) 95.00 100.00 97.44 19\n", - " qa_definition (label_id: 48) 85.00 89.47 87.18 
19\n", - " qa_factoid (label_id: 49) 45.16 73.68 56.00 19\n", - " qa_maths (label_id: 50) 100.00 100.00 100.00 14\n", - " qa_stock (label_id: 51) 95.00 100.00 97.44 19\n", - " recommendation_events (label_id: 52) 94.44 89.47 91.89 19\n", - " recommendation_locations (label_id: 53) 94.74 94.74 94.74 19\n", - " recommendation_movies (label_id: 54) 100.00 100.00 100.00 10\n", - " social_post (label_id: 55) 90.00 94.74 92.31 19\n", - " social_query (label_id: 56) 94.74 100.00 97.30 18\n", - " takeaway_order (label_id: 57) 93.75 78.95 85.71 19\n", - " takeaway_query (label_id: 58) 85.71 94.74 90.00 19\n", - " transport_query (label_id: 59) 83.33 78.95 81.08 19\n", - " transport_taxi (label_id: 60) 100.00 100.00 100.00 18\n", - " transport_ticket (label_id: 61) 89.47 89.47 89.47 19\n", - " transport_traffic (label_id: 62) 100.00 100.00 100.00 19\n", - " weather_query (label_id: 63) 100.00 89.47 94.44 19\n", - " -------------------\n", - " micro avg 85.04 85.04 85.04 1076\n", - " macro avg 81.13 80.81 79.36 1076\n", - " weighted avg 84.10 85.04 83.54 1076\n", - " \n", - "Slots:\n", - " label precision recall f1 support \n", - " alarm_type (label_id: 0) 0.00 0.00 0.00 0\n", - " app_name (label_id: 1) 0.00 0.00 0.00 6\n", - " artist_name (label_id: 2) 0.00 0.00 0.00 21\n", - " audiobook_author (label_id: 3) 0.00 0.00 0.00 1\n", - " audiobook_name (label_id: 4) 0.00 0.00 0.00 18\n", - " business_name (label_id: 5) 60.00 56.60 58.25 53\n", - " business_type (label_id: 6) 0.00 0.00 0.00 24\n", - " change_amount (label_id: 7) 0.00 0.00 0.00 25\n", - " coffee_type (label_id: 8) 0.00 0.00 0.00 4\n", - " color_type (label_id: 9) 0.00 0.00 0.00 12\n", - " cooking_type (label_id: 10) 0.00 0.00 0.00 0\n", - " currency_name (label_id: 11) 84.09 75.51 79.57 49\n", - " date (label_id: 12) 57.95 91.07 70.83 112\n", - " definition_word (label_id: 13) 0.00 0.00 0.00 20\n", - " device_type (label_id: 14) 74.55 51.25 60.74 80\n", - " drink_type (label_id: 15) 0.00 0.00 0.00 0\n", - " email_address (label_id: 16) 0.00 0.00 0.00 14\n", - " email_folder (label_id: 17) 0.00 0.00 0.00 1\n", - " event_name (label_id: 18) 100.00 13.24 23.38 68\n", - " food_type (label_id: 19) 51.72 69.77 59.41 43\n", - " game_name (label_id: 20) 60.00 14.29 23.08 21\n", - " game_type (label_id: 21) 0.00 0.00 0.00 0\n", - " general_frequency (label_id: 22) 0.00 0.00 0.00 9\n", - " house_place (label_id: 23) 93.33 42.42 58.33 33\n", - " ingredient (label_id: 24) 0.00 0.00 0.00 6\n", - " joke_type (label_id: 25) 0.00 0.00 0.00 4\n", - " list_name (label_id: 26) 0.00 0.00 0.00 21\n", - " meal_type (label_id: 27) 0.00 0.00 0.00 0\n", - " media_type (label_id: 28) 0.00 0.00 0.00 37\n", - " movie_name (label_id: 29) 0.00 0.00 0.00 0\n", - " movie_type (label_id: 30) 0.00 0.00 0.00 0\n", - " music_album (label_id: 31) 0.00 0.00 0.00 0\n", - " music_descriptor (label_id: 32) 0.00 0.00 0.00 3\n", - " music_genre (label_id: 33) 0.00 0.00 0.00 9\n", - " news_topic (label_id: 34) 0.00 0.00 0.00 17\n", - " order_type (label_id: 35) 0.00 0.00 0.00 17\n", - " person (label_id: 36) 44.86 92.31 60.38 52\n", - " personal_info (label_id: 37) 0.00 0.00 0.00 20\n", - " place_name (label_id: 38) 71.25 77.03 74.03 148\n", - " player_setting (label_id: 39) 0.00 0.00 0.00 1\n", - " playlist_name (label_id: 40) 0.00 0.00 0.00 1\n", - " podcast_descriptor (label_id: 41) 0.00 0.00 0.00 13\n", - " podcast_name (label_id: 42) 0.00 0.00 0.00 4\n", - " radio_name (label_id: 43) 66.67 10.53 18.18 38\n", - " relation (label_id: 44) 0.00 0.00 0.00 17\n", - " song_name 
(label_id: 45) 0.00 0.00 0.00 22\n", - " time (label_id: 46) 70.27 78.20 74.02 133\n", - " time_zone (label_id: 47) 0.00 0.00 0.00 9\n", - " timeofday (label_id: 48) 0.00 0.00 0.00 28\n", - " transport_agency (label_id: 49) 0.00 0.00 0.00 9\n", - " transport_descriptor (label_id: 50) 0.00 0.00 0.00 0\n", - " transport_name (label_id: 51) 0.00 0.00 0.00 4\n", - " transport_type (label_id: 52) 78.38 82.86 80.56 35\n", - " weather_descriptor (label_id: 53) 0.00 0.00 0.00 17\n", - " O (label_id: 54) 92.42 98.80 95.50 5920\n", - " -------------------\n", - " micro avg 89.10 89.10 89.10 7199\n", - " macro avg 21.86 18.56 18.18 7199\n", - " weighted avg 84.42 89.10 86.01 7199\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Evaluation\n", - "To see how the model performs, we can evaluate the performance of the trained model on a test data file. Here we will reload the model from the `.nemo` file saved during training. By default, the `.nemo` file contains the final checkpoint. We will use the same trainer for testing." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# extract the path of the best checkpoint from the training, you may update it to any other saved checkpoint file\n", - "checkpoint_path = trainer.checkpoint_callback.best_model_path\n", - "\n", - "# load the model from this checkpoint\n", - "eval_model = nemo_nlp.models.IntentSlotClassificationModel.load_from_checkpoint(checkpoint_path=checkpoint_path)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# we will setup testing data reusing the same config (test section)\n", - "eval_model.setup_test_data(test_data_config=config.model.test_ds)\n", - "\n", - "# run the evaluation on the test dataset\n", - "trainer.test(model=eval_model, ckpt_path=None, verbose=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Inference from Examples\n", - "Next step to see how the trained model will classify Intents and Slots for given queries from this domain. To improve the predictions you may need to train the model for more than 5 epochs.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "queries = [\n", - " 'set alarm for seven thirty am',\n", - " 'lower volume by fifty percent',\n", - " 'what is my schedule for tomorrow',\n", - "]\n", - "\n", - "pred_intents, pred_slots = eval_model.predict_from_examples(queries, config.model.test_ds)\n", - "\n", - "logging.info('The prediction results of some sample queries with the trained model:')\n", - "for query, intent, slots in zip(queries, pred_intents, pred_slots):\n", - " logging.info(f'Query : {query}')\n", - " logging.info(f'Predicted Intent: {intent}')\n", - " logging.info(f'Predicted Slots: {slots}')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Training Script\n", - "\n", - "If you have NeMo installed locally (eg. 
cloned from GitHub), you can also train the model with the example script: `examples/nlp/intent_slot_classification/intent_slot_classification.py`.\n", - "This script contains an example of how to train, evaluate, and perform inference with the IntentSlotClassificationModel.\n", - "\n", - "To run the training script, use:\n", - "\n", - "`cd examples/nlp/intent_slot_classification`\n", - "\n", - "`python intent_slot_classification.py model.data_dir=PATH_TO_DATA_DIR`\n", - "\n", - "By default, this script uses the examples/nlp/intent_slot_classification/conf/intent_slot_classification_config.yaml config file; you may update any of the parameters inside this config file or, alternatively, provide them on the command line." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "# Multi-Label Intent Classification\n", - "---\n", - "\n", - "As mentioned above, our multi-label model will be very similar to the single-intent classification model, with the added functionality of predicting multiple different intents for a single query. For example, the query `show all flights and fares from denver to san francisco` would have intents `atis_airfare` and `atis_flight`. From our list of intents found in `dict.intents.csv`, the model checks whether each individual intent is suitable for the given query.\n", - "\n", - "For this tutorial, we will be using the ATIS (Airline Travel Information System) dataset, converting it to a multi-label data format, and then using the new data to train our model.\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Download the dataset and convert it to the NeMo format" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# download the dataset files from github for the purpose of this tutorial\n", - "DATA_DIR = './multiatis'\n", - "NEMO_DIR = './atis'\n", - "\n", - "!mkdir {DATA_DIR}\n", - "!mkdir {NEMO_DIR}\n", - "\n", - "\n", - "files = [f'https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/raw_data/ms-cntk-atis/atis.dict.intent.csv', \n", - " f'https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/raw_data/ms-cntk-atis/atis.dict.slots.csv',\n", - " f'https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/raw_data/ms-cntk-atis/atis.dict.vocab.csv',\n", - " f'https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/raw_data/ms-cntk-atis/atis.test.intent.csv',\n", - " f'https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/raw_data/ms-cntk-atis/atis.test.pkl', \n", - " f'https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/raw_data/ms-cntk-atis/atis.test.query.csv',\n", - " f'https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/raw_data/ms-cntk-atis/atis.test.slots.csv', \n", - " f'https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/raw_data/ms-cntk-atis/atis.train.intent.csv',\n", - " f'https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/raw_data/ms-cntk-atis/atis.train.pkl',\n", - " f'https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/raw_data/ms-cntk-atis/atis.train.query.csv',\n", - " f'https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/raw_data/ms-cntk-atis/atis.train.slots.csv']\n", - "\n", - " \n", - "for file in files:\n", - " wget.download(file, DATA_DIR)\n", - "\n", - "\n", - "# download the converter files from github for 
the purpose of this tutorial\n", - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/scripts/dataset_processing/nlp/intent_and_slot/import_datasets.py', NEMO_DIR)\n", - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/scripts/dataset_processing/nlp/intent_and_slot/assistant_utils.py', NEMO_DIR)\n", - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/scripts/dataset_processing/nlp/intent_and_slot/convert_datasets.py', NEMO_DIR)\n", - "\n", - "# Get the original ATIS dataset\n", - "!python {NEMO_DIR}/import_datasets.py --dataset_name=atis --source_data_dir={DATA_DIR} --target_data_dir={DATA_DIR}/nemo_format\n", - "# Script will create new files at {DATA_DIR}/new_format\n", - "!mkdir {DATA_DIR}/new_format\n", - "!python {NEMO_DIR}/convert_datasets.py --source_data_dir={DATA_DIR}/nemo_format --target_data_dir={DATA_DIR}/new_format" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data Augmentation (Optional)\n", - "---\n", - "\n", - "In scenarios where we don't have many training examples with multiple intent labels, data augmentation can be very useful. This can be done by concatenating utterances together and adding the result to our training data. Some ways of concatenating include adding a period or \\\"and\\\" between the two utterances. A script has been provided below to help with augmentation, but it can be changed depending on your use case." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# download the data augmentation script\n", - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/scripts/dataset_processing/nlp/intent_and_slot/augment_training_data.py', NEMO_DIR)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The script augment_training_data.py accepts four command-line arguments: \n", - "\n", - "source_data_dir: directory that contains the original multi-label data
\n", - "target_data_dir: directory to store the new data directory
\n", - "num_mixed: number of new utterances to add to dataset per class pair (utterances with labels 1 and 2)
\n", - "link_string: string that is in between the two utterances (\".\", \"\", \"and\", \"with\")
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!python {NEMO_DIR}/augment_training_data.py --source_data_dir={DATA_DIR}/new_format --target_data_dir={DATA_DIR}/augmented_data --num_mixed=10" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Training the Model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# download the model config file from repository for the purpose of this example\n", - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/intent_slot_classification/conf/multi_label_intent_slot_classification_config.yaml', NEMO_DIR)\n", - "\n", - "# print content of the config file\n", - "config_file = f\"{NEMO_DIR}/multi_label_intent_slot_classification_config.yaml\"\n", - "print(config_file)\n", - "config = OmegaConf.load(config_file)\n", - "print(OmegaConf.to_yaml(config))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "config.model.data_dir = f\"{DATA_DIR}/new_format\"\n", - "config.model.validation_ds.prefix = \"dev\"\n", - "config.model.test_ds.prefix = \"dev\"\n", - "config.model.class_balancing = \"weighted_loss\"\n", - "config.trainer.max_epochs = 5\n", - "run_name = \"test\"\n", - "\n", - "# checks if we have GPU available and uses it\n", - "accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'\n", - "config.trainer.devices = 1\n", - "config.trainer.accelerator = accelerator\n", - "\n", - "# remove distributed training flags\n", - "config.trainer.strategy = 'auto'\n", - "\n", - "trainer = pl.Trainer(**config.trainer)\n", - "config.exp_manager.exp_dir = os.path.join(DATA_DIR, \"output/\" + run_name)\n", - "config.exp_manager.create_checkpoint_callback = True\n", - "\n", - "exp_dir = exp_manager(trainer, config.get(\"exp_manager\", None))\n", - "model = nemo_nlp.models.MultiLabelIntentSlotClassificationModel(config.model, trainer=trainer)\n", - "trainer.fit(model)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Evaluation" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To see how the model performs, we can evaluate the performance of the trained model on a test data file. Here we will reload the model from the `.nemo` file saved during training. By default, the `.nemo` file contains the final checkpoint. We will use the same trainer for testing." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# specify checkpoint path with .nemo file\n", - "checkpoint_path = os.path.join(exp_dir, \"checkpoints\", \"MultiLabelIntentSlot.nemo\")\n", - "\n", - "# load the model from this checkpoint\n", - "eval_model = nemo_nlp.models.MultiLabelIntentSlotClassificationModel.restore_from(checkpoint_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Optimizing Threshold\n", - "\n", - "As mentioned above, when classifying a given query such as `show all flights and fares from denver to san francisco`, our model checks whether each individual intent would be suitable. Before assigning the final labels for a query, the model assigns a probability an intent matches the query. For example, if our `dict.intents.csv` had 5 different intents, then the model could output for a given query \\[0.52, 0.38, 0.21, 0.67. 
0.80\] where each value represents the probability that the query matches that particular intent. \n", - "\n", - "We need to use these probabilities to generate final label predictions of 0 or 1 for each label. While we could use 0.5 as the probability threshold, there is usually a better threshold depending on the metric we want to optimize. For this tutorial, we will find the threshold that gives us the best micro-F1 score on the validation set. After running the `optimize_threshold` method, the `threshold` attribute of our model will be updated." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "eval_model.optimize_threshold(config.model.test_ds, 'dev')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "eval_model.threshold" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Inference from Examples\n", - "Similar to the previous example, we can run inference to see how the trained model classifies Intents and Slots for given queries from this domain. To improve the predictions, you may need to train the model for more than 10 epochs.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "queries = [\n", - " 'i would like to find a flight from charlotte to las vegas that makes a stop in st. louis',\n", - " 'on april first i need a ticket from tacoma to san jose departing before 7 am',\n", - " 'how much is the limousine service in boston',\n", - "]\n", - "\n", - "# We use the optimized threshold for predictions\n", - "pred_intents, pred_slots, pred_list = eval_model.predict_from_examples(queries, config.model.test_ds)\n", - "logging.info('The prediction results of some sample queries with the trained model:')\n", - " \n", - "for query, intent, slots in zip(queries, pred_intents, pred_slots):\n", - " logging.info(f'Query : {query}')\n", - " logging.info(f'Predicted Intents: {intent}')\n", - " logging.info(f'Predicted Slots: {slots}')" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.12" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -}
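The Data Augmentation section of the notebook describes its approach only at a high level: concatenate two utterances with a link string and keep both sets of intent labels. The sketch below is a minimal, hypothetical illustration of that idea, not the real `augment_training_data.py`: the `mix_utterances` function and its `(utterance, labels)` input format are assumptions, and it pairs random utterances rather than generating `num_mixed` new examples per class pair as the script's arguments describe.

```python
import random

def mix_utterances(examples, num_mixed=10, link_string="and"):
    """examples: list of (utterance, set_of_intent_labels) pairs.
    Returns num_mixed new examples built by joining two random utterances
    with link_string and taking the union of their intent labels."""
    augmented = []
    for _ in range(num_mixed):
        (text_a, labels_a), (text_b, labels_b) = random.sample(examples, 2)
        joined = f"{text_a} {link_string} {text_b}" if link_string else f"{text_a} {text_b}"
        augmented.append((joined, labels_a | labels_b))
    return augmented

# Toy single-intent examples; the mixed utterances become multi-label.
data = [
    ("set alarm for seven thirty am", {"alarm_set"}),
    ("what is the weather tomorrow", {"weather_query"}),
    ("play some jazz music", {"play_music"}),
]
print(mix_utterances(data, num_mixed=2, link_string="and"))
```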
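Likewise, the Optimizing Threshold section explains that per-intent probabilities are turned into 0/1 predictions by a single threshold tuned for the best micro-F1 on the validation set. In the notebook itself this is handled by the model's `optimize_threshold` method; the snippet below is only a sketch of that tuning loop, assuming hypothetical `val_probs`/`val_labels` arrays of shape (num_examples, num_intents) and using scikit-learn's `f1_score`.

```python
import numpy as np
from sklearn.metrics import f1_score

def find_best_threshold(val_probs: np.ndarray, val_labels: np.ndarray) -> float:
    """Grid-search one global threshold that maximizes micro-F1 on validation data."""
    best_threshold, best_f1 = 0.5, -1.0
    for threshold in np.arange(0.05, 1.00, 0.05):
        preds = (val_probs >= threshold).astype(int)  # per-intent 0/1 decisions
        score = f1_score(val_labels, preds, average="micro", zero_division=0)
        if score > best_f1:
            best_threshold, best_f1 = float(threshold), score
    return best_threshold

# Toy example: two queries scored against three candidate intents.
probs = np.array([[0.80, 0.52, 0.21],
                  [0.10, 0.67, 0.38]])
labels = np.array([[1, 1, 0],
                   [0, 1, 0]])
print(find_best_threshold(probs, labels))
```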