# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import argparse
import os
import sys
# Add megatron and the multimodal example to the path.
sys.path.append(
os.path.abspath(
os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir, os.path.pardir)
)
)
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)))
import torch
from transformers import AutoModel
from examples.multimodal.model import model_provider
from examples.multimodal.multimodal_args import add_multimodal_extra_args
from megatron.training import get_model
from megatron.training.checkpointing import load_checkpoint
from megatron.training.initialize import initialize_megatron
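# Example invocation (a sketch; adjust the script path to your checkout). The hardcoded
# --tensor-model-parallel-size=8 below means the comparison must be launched with 8 GPUs, e.g.:
#   torchrun --nproc_per_node 8 <path to this script> \
#       --mcore-model <mcore InternViT checkpoint dir> --hf-model OpenGVLab/InternViT-6B-448px-V1-5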
def run_mcore_vision(model_path):
"""Run mcore vision model."""
os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
# Megatron has some mandatory flags.
sys.argv = [
"ignore_me.py",
"--micro-batch-size=1",
"--num-layers=2",
"--vision-model-type=internvit",
"--language-model-type=mistral_7b",
"--tokenizer-prompt-format=mistral",
"--tokenizer-type=MultimodalTokenizer",
"--tokenizer-model=mistralai/Mistral-7B-Instruct-v0.3",
"--vocab-size=1024",
"--hidden-size=64",
"--num-attention-heads=8",
"--seq-length=1024",
"--decoder-seq-length=2048",
"--max-position-embeddings=2048",
"--bf16",
"--img-h=448",
"--img-w=448",
"--patch-dim=14",
"--tensor-model-parallel-size=8",
"--use-te",
f"--pretrained-checkpoint={model_path}",
]
initialize_megatron(extra_args_provider=add_multimodal_extra_args)
def wrapped_model_provider(pre_process, post_process):
return model_provider(pre_process, post_process, parallel_output=False)
# Set up model and load checkpoint.
model = get_model(wrapped_model_provider, wrap_with_ddp=False)
vision_model = model[0].module.vision_model
load_checkpoint([vision_model], None, None)
vision_model.eval()
images = torch.ones((1, 3, 448, 448), dtype=torch.bfloat16, device="cuda")
output = vision_model(images)
return output
def run_hf_vision(model_name):
"""Run HF vision model."""
model = (
AutoModel.from_pretrained(model_name, torch_dtype=torch.bfloat16, trust_remote_code=True)
.cuda()
.eval()
)
images = torch.ones((1, 3, 448, 448), dtype=torch.bfloat16, device="cuda")
outputs = model(images, return_dict=True)
return outputs
def main(mcore_model, hf_model):
"""Compare vision model outputs between mcore and HF given the same fixed input."""
mcore = run_mcore_vision(mcore_model)
if torch.distributed.get_rank() == 0:
hf = run_hf_vision(hf_model)
hf = hf["last_hidden_state"]
# Compare logits. Due to different attention implementations and other details,
# there will be numerical differences.
diff = (mcore - hf).abs()
mean_diff = diff.mean().item()
max_diff = diff.max().item()
print(f"mean diff {mean_diff}, max diff {max_diff}")
assert mean_diff < 0.1, "mean output difference is greater than expected"
assert max_diff < 50, "max output difference is greater than expected"
print("lgtm")
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Check mcore vision model output vs. HF numerically.",
formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.add_argument(
"--mcore-model", type=str, required=True, help="directory for mcore model weights"
)
parser.add_argument("--hf-model", type=str, required=True, help="Model name in HF")
args = parser.parse_args()
main(args.mcore_model, args.hf_model)
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
from megatron.core.models.multimodal.llava_model import IMAGE_TOKEN
def add_multimodal_extra_args(parser):
"""Extra arguments."""
group = parser.add_argument_group(title='multimodal arguments')
group.add_argument('--dataset-config', type=str, default=None)
group.add_argument("--prompt-path", type=str, default=None)
group.add_argument('--freeze-LM', action='store_true', default=False)
group.add_argument('--freeze-ViT', action='store_true', default=False)
group.add_argument('--language-model-type', type=str, required=True)
group.add_argument('--vision-model-type', type=str, default="clip")
group.add_argument("--disable-vision-class-token", action="store_true", default=False)
group.add_argument(
"--allow-missing-vision-projection-checkpoint", action="store_true", default=False
)
group.add_argument("--use-te", action="store_true", default=False)
group.add_argument(
"--dataloader-save", type=str, default=None, help="Energon dataloader state save path"
)
group.add_argument(
"--use-tiling", action="store_true", default=False, help="Use input image tiling"
)
group.add_argument("--max-num-tiles", type=int, default=1, help="Maximum number of image tiles")
group.add_argument(
"--use-thumbnail", action="store_true", default=False, help="Add image thumbnail as a tile"
)
group.add_argument(
"--dataloader-seq-length",
type=int,
help="Make dataloader to produce sequences of specific length.",
)
group.add_argument(
"--num-frames",
type=int,
default=1,
help="Number of frames to regularly sample from the video as input to the model.",
)
group.add_argument(
"--online-evaluation-config", type=str, help="Config file for online evaluation."
)
group.add_argument(
"--special-tokens",
nargs="*",
default=[IMAGE_TOKEN],
help="Special tokens used in the multimodal model",
)
group.add_argument(
"--tokenizer-prompt-format",
type=str,
choices=["mistral", "llama3", "chatml", "nvlm-yi-34b", "qwen2p0", "qwen2p5", "llama3p1", "nemotron5",
"nemotron5-aligned"],
required=True,
help="Prompt format to use with the tokenizer.",
)
group.add_argument("--pixel-shuffle", action="store_true", default=False)
group.add_argument(
"--image-tag-type",
type=str,
choices=["nvlm", "internvl", ""],
default="", # Default: Image tag not used.
help="Surround image tokens with tags.",
)
group.add_argument("--use-tile-tags", action="store_true", default=False, help="Use tile tags")
group.add_argument(
"--packing-buffer-size",
type=int,
default=None, # Packing is disabled by default.
help="Enable sample packing by setting the buffer size to > 0",
)
group.add_argument(
"--packing-seq-length", type=int, default=0, help="Packing sequence length. Must be > 0 if using packing."
)
group.add_argument(
"--recompute-vision", action="store_true", default=False, help="Enable activation checkpointing in the vision model"
)
group.add_argument(
"--use-loss-scaling", action="store_true", default=False, help="Scale loss based on conversation turn length (in tokens)."
)
group.add_argument(
"--force-system-message", action="store_true", default=False, help="Force a specific system message"
)
group.add_argument("--eos-id", type=int, help="termination id for MultiModal Tokenizer")
group.add_argument(
"--use-area-weighted-aspect-ratio", action="store_true", default=False,
help=(
"When --use-tiling is True, find the aspect ratio to use based on the original ",
"image aspect ratio and the area covered by the tiles.")
)
group.add_argument("--use-mcore-inference", action="store_true", default=False, help="Use the MCore inference API")
return parser
NVLM
====
Please refer to the [NVLM paper](https://arxiv.org/pdf/2409.11402) for details.
*NOTE: VLMs in Megatron are under active development and are expected to change.*
# Checkpoints
NVLM 1.0 model weights are publicly available on HuggingFace in both HuggingFace and Megatron-Core formats.
- NVLM-1.0-D 72B [HuggingFace version](https://huggingface.co/nvidia/NVLM-D-72B)
- NVLM-1.0-D 72B [Megatron-Core version](https://huggingface.co/nvidia/NVLM-D-72B-mcore)
# Setup
## Docker image
Please use `examples/multimodal/Dockerfile`.
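For example, the image can be built from the repository root (the image tag below is only a placeholder):
```
docker build -f examples/multimodal/Dockerfile -t <your image tag> .
```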
## Dataset preparation
Please refer to Tables 4 and 6 in the [NVLM paper](https://arxiv.org/pdf/2409.11402) for the full list of pretraining and SFT datasets.
Please refer to https://nvidia.github.io/Megatron-Energon/data_prep.html for instructions on preparing datasets in the Megatron Energon format.
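As a minimal sketch, assuming your samples are already packed as WebDataset shards, the Megatron-Energon CLI can be used to generate the dataset metadata:
```
energon prepare <path to webdataset directory>
```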
## Model conversion
### Vision model
NVLM 1.0 models use [OpenGVLab/InternViT-6B-448px-V1-5](https://huggingface.co/OpenGVLab/InternViT-6B-448px-V1-5) from HuggingFace.
Please download it and run the following command to convert it to Megatron format.
```
python examples/multimodal/model_converter/internvit_converter.py --output-dir <some output dir> --use-te --tensor-parallel-size 8
```
### 34B Language model
NVLM 1.0 34B starts from [NousResearch/Nous-Hermes-2-Yi-34B](https://huggingface.co/NousResearch/Nous-Hermes-2-Yi-34B) from HuggingFace.
Please download it and run the following command to convert it to Megatron format.
```
python tools/checkpoint/convert.py --bf16 --model-type GPT --loader llama_mistral --saver mcore --target-tensor-parallel-size 8 --checkpoint-type hf \
--load-dir <hf model directory> --save-dir <output dir> --tokenizer-model <hf model name/directory> \
--saver-transformer-impl transformer_engine --model-size yi-34B --make-vocab-size-divisible-by 1
```
### 72B Language model
NVLM 1.0 72B starts from [Qwen/Qwen2-72B-Instruct](https://huggingface.co/Qwen/Qwen2-72B-Instruct) from HuggingFace.
Please download it and run the following command to convert it to Megatron format.
```
python tools/checkpoint/convert.py --bf16 --model-type GPT --loader llama_mistral --saver mcore --target-tensor-parallel-size 8 --checkpoint-type hf \
--load-dir <hf model directory> --save-dir <output directory> --tokenizer-model <hf model name/directory> \
--saver-transformer-impl transformer_engine --model-size qwen2.5-72Bf
```
### Combined checkpoint
Combine the vision model checkpoint from [InternViT](#vision-model) with the [34B](#34b-language-model) or [72B](#72b-language-model) language model by running:
```
examples/multimodal/combine_lm_vision_checkpoints.sh <language model directory> <vision model directory> <output directory> nvlm
```
# Training
## 34B
1. Pretraining: please run `examples/multimodal/nvlm/pretrain_yi_34b_internvit_6b.sh`. Please use the InternViT + 34B [combined checkpoint](#combined-checkpoint) and tokenizer from HuggingFace.
2. SFT: please run `examples/multimodal/nvlm/sft_34b_internvit.sh` using the checkpoint from 1.
## 72B
1. Pretraining: please run `examples/multimodal/nvlm/pretrain_qwen20_72b_internvit_6b.sh`. Please use the InternViT + 72B [combined checkpoint](#combined-checkpoint) and tokenizer from HuggingFace.
2. Convert the pretraining checkpoint from step 1 to pipeline parallel size = 4 for SFT. Please run:
```
python examples/multimodal/nvlm/pp_checkpoint_converter.py --input <pretrained checkpoint directory> \
--input-pipeline-parallel 1 --output <some output dir> --output-pipeline-parallel 4 \
--tensor-parallel 8
```
3. SFT: please run `examples/multimodal/nvlm/sft_qwen20_72b_internvit_6b.sh` using the checkpoint from step 2.
4. To convert the checkpoint with pipeline parallel size = 4 back to 1 for evaluation, please run:
```
python examples/multimodal/nvlm/pp_checkpoint_converter.py --input <sft checkpoint directory> \
--input-pipeline-parallel 4 --output <some output dir> --output-pipeline-parallel 1 \
--tensor-parallel 8
```
# Evaluation
Run the text generation script.
- 34B
```
examples/multimodal/nvlm/run_text_generation_yi_34b_internvit_6b.sh --input-image-path /path/to/input/images --output-path /some/output/directory \
--model-path /path/to/model.pt --gt-path /path/to/groundtruth/file --task generation-task-name --use-tiling
```
- 72B
```
examples/multimodal/nvlm/run_text_generation_qwen20_72b_internvit_6b.sh --input-image-path /path/to/input/images --output-path /some/output/directory \
--model-path /path/to/model.pt --gt-path /path/to/groundtruth/file --task generation-task-name --use-tiling
```
where `--task generation-task-name` is the name of the evaluation benchmark such as `captioning`, `MMMU` or `TextVQA`.
Then, run one of the evaluation scripts from `examples/multimodal`. For example:
```
python examples/multimodal/evaluate_mmmu.py --input-path /output/directory/from/generation
```
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
""""
NOTE: NVLM uses InternViT with tensor parallel (TP) size = 8.
Since InternViT has 25 attention heads and Megatron currently requires the number of attention heads
to be divisible by the TP size, we add 7 dummy zero attention heads to have 32 attention heads.
This workaround requires some changes to how we compute RMSNorm, Attention etc.
Additionally, InternViT introduces some unique features like Layer Scaling.
Those code changes are gathered here.
"""
from functools import partial
import torch
from megatron.core.utils import divide
from megatron.core.extensions.transformer_engine import (
TEColumnParallelLinear,
TEDotProductAttention,
TERowParallelLinear,
)
from megatron.core.parallel_state import (
get_tensor_model_parallel_group,
get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size,
)
from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
from megatron.core.transformer.dot_product_attention import DotProductAttention
from megatron.core.transformer.enums import AttnMaskType
from megatron.core.transformer.mlp import MLP, MLPSubmodules
from megatron.core.transformer.module import MegatronModule
from megatron.core.transformer.spec_utils import ModuleSpec, build_module
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules
from megatron.core.transformer.utils import make_sharded_tensors_for_checkpoint
from examples.multimodal.layer_scaling import LayerScalingTransformerLayer, get_bias_dropout_add_layer_scaling
try:
import apex
from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
from megatron.core.transformer.torch_norm import WrappedTorchNorm
HAVE_APEX = True
LNImpl = FusedLayerNorm
except ImportError:
import warnings
from megatron.core.transformer.torch_norm import WrappedTorchNorm
warnings.warn('Apex is not installed. Falling back to Torch Norm.')
LNImpl = WrappedTorchNorm
class InternViTRMSNorm(MegatronModule):
def __init__(
self,
config,
hidden_size: int,
eps: float = 1e-6,
sequence_parallel: bool = False,
compute_var: bool = False,
):
"""Custom RMSNorm for InternViT.
Args:
config (TransformerConfig): Config.
hidden_size (int): Input hidden size.
eps (float): epsilon to use for the norm, default to 1e-6
sequence_parallel (bool): Set to true if sequence parallelism is being used,
this marks the weights as needing to be allreduced.
compute_var (bool): Indicator to compute statistic manually.
"""
super().__init__(config=config)
self.config = config
self.eps = eps
self.weight = torch.nn.Parameter(torch.ones(hidden_size))
self._compute_var = compute_var
assert not sequence_parallel, "Sequence parallelism is not supported with InternViT."
setattr(self.weight, 'sequence_parallel', sequence_parallel)
def _norm(self, x, var):
if var is None:
var = x.pow(2).mean(-1, keepdim=True)
return x * torch.rsqrt(var + self.eps)
def forward(self, x):
"""Run RMSNorm with an option to compute custom statistic."""
var = None
if self._compute_var:
unpadded_hidden_size = self.config.hidden_size # 3200
max_dim = x.shape[-1] # 128
x = x.reshape(x.size(0), x.size(1), -1)
var = self._gather_var(x.float().pow(2), max_dim) / unpadded_hidden_size
output = self._norm(x.float(), var).type_as(x)
output = output * self.weight
if self._compute_var:
output = output.reshape(output.size(0), output.size(1), -1, max_dim)
return output
def _gather_var(self, input_, max_dim):
"""Compute statistic across the non-dummy heads."""
world_size = get_tensor_model_parallel_world_size()
# Size and dimension.
last_dim = input_.dim() - 1
rank = get_tensor_model_parallel_rank()
num_attention_heads_per_partition = divide(self.config.num_attention_heads, world_size)
valid_ranks = 24 // num_attention_heads_per_partition
residual_heads = 25 % num_attention_heads_per_partition
if residual_heads == 0:
residual_heads = num_attention_heads_per_partition
max_dim = max_dim * residual_heads
if rank < valid_ranks: # Ranks without any dummy attention heads.
var = input_.sum(-1, keepdim=True)
elif rank == valid_ranks: # The only rank which may contain 'residual_heads' dummy attention heads.
var = input_[..., :max_dim].sum(-1, keepdim=True)
else:
var = input_.sum(-1, keepdim=True) * 0.0 # All heads in these ranks are dummy heads: Zero-out.
tensor_list = [torch.empty_like(var) for _ in range(world_size)]
tensor_list[rank] = var
torch.distributed.all_gather(tensor_list, var, group=get_tensor_model_parallel_group())
output = torch.cat(tensor_list, dim=last_dim).contiguous()
return output.sum(-1, keepdim=True)
def sharded_state_dict(self, prefix='', sharded_offsets=(), metadata={}):
# in InternVitSelfAttention the q_layernorm and k_layernorm weights
# are tensor-parallel so must be converted to sharded tensors
if 'q_layernorm' in prefix or 'k_layernorm' in prefix:
state_dict = self.state_dict(prefix='', keep_vars=True)
return make_sharded_tensors_for_checkpoint(
state_dict, prefix, {'weight': 0}, sharded_offsets
)
else:
return super().sharded_state_dict(prefix, sharded_offsets, metadata)
def get_mlp_module_spec(use_te: bool = True) -> ModuleSpec:
# Dense MLP w/ or w/o TE modules.
return ModuleSpec(
module=MLP,
submodules=MLPSubmodules(
linear_fc1=TEColumnParallelLinear if use_te else ColumnParallelLinear,
linear_fc2=TERowParallelLinear if use_te else RowParallelLinear,
),
)
# Override a few things that are special in InternViT and not supported by the SelfAttention class.
class InternViTSelfAttention(SelfAttention):
def __init__(
self, config: TransformerConfig, submodules: SelfAttentionSubmodules, *args, **kwargs
):
super().__init__(config=config, submodules=submodules, *args, **kwargs)
# Need to override linear_qkv, q_layernorm and k_layernorm.
qkv_bias = False
self.linear_qkv = build_module(
submodules.linear_qkv,
self.config.hidden_size,
self.query_projection_size + 2 * self.kv_projection_size,
config=self.config,
init_method=self.config.init_method,
gather_output=False,
bias=qkv_bias,
skip_bias_add=False,
is_expert=False,
tp_comm_buffer_name='qkv',
)
qk_layernorm_hidden_size = (
self.hidden_size_per_attention_head * self.num_attention_heads_per_partition
) # 512 for internvit
self.q_layernorm = build_module(
submodules.q_layernorm,
hidden_size=qk_layernorm_hidden_size,
config=self.config,
eps=self.config.layernorm_epsilon,
compute_var=True,
)
self.k_layernorm = build_module(
submodules.k_layernorm,
hidden_size=qk_layernorm_hidden_size,
config=self.config,
eps=self.config.layernorm_epsilon,
compute_var=True,
)
class InternViTTEDotProductAttention(TEDotProductAttention):
"""Adjusted Attention for InternViT"""
def forward(self, *args, **kwargs):
"""Regular TEDotProductAttention + zero-out dummy attention heads."""
out = super().forward(*args, **kwargs)
# This makes sure the dummy attention heads are zeroed out.
mask = torch.ones_like(out, dtype=out.dtype, device=out.device)
rank = get_tensor_model_parallel_rank()
max_dim = out.shape[-1] # 128
valid_ranks = 6
if rank == valid_ranks:
mask[..., max_dim:] *= 0.0
elif rank > valid_ranks:
mask *= 0.0
out *= mask
return out
def get_internvit_layer_spec(use_te) -> ModuleSpec:
mlp = get_mlp_module_spec(use_te) # no norm
return ModuleSpec(
module=LayerScalingTransformerLayer,
submodules=TransformerLayerSubmodules(
input_layernorm=InternViTRMSNorm,
self_attention=ModuleSpec(
module=InternViTSelfAttention,
params={"attn_mask_type": AttnMaskType.no_mask},
submodules=SelfAttentionSubmodules(
linear_qkv=TEColumnParallelLinear if use_te else ColumnParallelLinear,
core_attention=TEDotProductAttention if use_te else DotProductAttention,
linear_proj=TERowParallelLinear if use_te else RowParallelLinear,
q_layernorm=InternViTRMSNorm,
k_layernorm=InternViTRMSNorm,
),
),
self_attn_bda=get_bias_dropout_add_layer_scaling,
pre_mlp_layernorm=InternViTRMSNorm,
mlp=mlp,
mlp_bda=get_bias_dropout_add_layer_scaling,
),
)
def get_internvit300M_layer_spec(use_te) -> ModuleSpec:
mlp = get_mlp_module_spec(use_te) # no norm
return ModuleSpec(
module=LayerScalingTransformerLayer,
submodules=TransformerLayerSubmodules(
input_layernorm=LNImpl,
self_attention=ModuleSpec(
module=SelfAttention,
params={"attn_mask_type": AttnMaskType.no_mask},
submodules=SelfAttentionSubmodules(
linear_qkv=TEColumnParallelLinear if use_te else ColumnParallelLinear,
core_attention=TEDotProductAttention if use_te else DotProductAttention,
linear_proj=TERowParallelLinear if use_te else RowParallelLinear,
q_layernorm=None,
k_layernorm=None,
),
),
self_attn_bda=get_bias_dropout_add_layer_scaling,
pre_mlp_layernorm=LNImpl,
mlp=mlp,
mlp_bda=get_bias_dropout_add_layer_scaling,
),
)
{
"COMMENT": "Mixture of our own custom prompts and some prompts from https://huggingface.co/datasets/liuhaotian/LLaVA-Pretrain/viewer and https://huggingface.co/datasets/HuggingFaceM4/M3IT",
"Captioning": {
"raw": [
"Can you briefly explain what you see in the image?",
"Describe what's happening in this image in one short sentence.",
"Write a short caption that accurately represents the content of this image.",
"Please generate a descriptive caption for the image provided.",
"How would you summarize the scene depicted in the picture in short?",
"Describe the image briefly.",
"Write a succinct description of the image, capturing its main components, the relationships between them, and any notable details.",
"Create a concise caption that accurately describes the main elements in the image provided.",
"Write a brief, yet comprehensive, description of the image.",
"Describe the image in a clear and concise manner.",
"For the given image, provide a one-sentence summary that captures the most important details.",
"Generate a short caption for the picture.",
"Write a short and informative description that highlights the primary subjects and actions occurring in the given image.",
"Provide a concise and informative caption for the image, focusing on the primary subjects.",
"Write a clear description of the image, make sure the key features are well covered.",
"Offer a succinct explanation of the picture presented."
]
},
"CaptioningPretraining": {
"raw": [
"Give a brief description of image.",
"Give a brief description of the image.",
"Provide a brief description of the given image.",
"Provide a one-sentence caption for the provided image.",
"Write a terse but informative summary of the picture.",
"Describe the image concisely.",
"Generate a clear and concise summary of the photo."
]
},
"CaptioningSFT": {
"raw": [
"Give a brief description of the image.",
"Give a short and clear explanation of the subsequent image.",
"Present a compact description of the photo's key features.",
"Provide a brief description of the given image.",
"Provide a one-sentence caption for the provided image.",
"Render a clear and concise summary of the photo.",
"Share a concise interpretation of the image provided.",
"Summarize the visual content of the image.",
"Write a terse but informative summary of the picture.",
"Describe the image concisely."
]
},
"VQAPretraining": {
"raw": [
"Question: {} Short answer:",
"Question: {} Answer:"
]
},
"VQASFT": {
"raw": [
"{}",
"{}\nAnswer the question using a single word or phrase."
],
"docvqa": [
"{}",
"{}\nAnswer this question using the text in the image directly."
]
},
"DocPretraining": {
"raw": [
"Retrieve the text from the given pdf image.",
"Extract the text from the provided document.",
"Transcribe the text displayed in the image."
],
"ocr_multi": [
"Apply grounded Optical Character Recognition (OCR) to the provided image.",
"Extract all texts and their bounding boxes from the given image using grounded OCR.",
"Extract and transcribe all visible text from the provided image, ensuring accurate spatial recognition.",
"Conduct a detailed optical character recognition analysis on this image, maintaining the text's original layout and positioning.",
"Execute a thorough text recognition procedure on this visual input, ensuring that the spatial arrangement of the text is accurately represented.",
"Perform an in-depth OCR scan of the image, capturing both the content and contextual positioning of all textual information.",
"OCR with grounding:"
],
"md": [
"Extract the text from the given image and format it in Markdown.",
"Convert the text from the provided image into Markdown format.",
"Transform the text from the given image into Markdown syntax.",
"Extract and convert the text from the image to Markdown.",
"Retrieve the text from the image and present it in Markdown format."
],
"grounded_ocr": [
"{}. Text:",
"Recognize the text in this region: {}.",
"Identify the text in this area: {}.",
"Detect the text within this section: {}."
],
"referring_grounding": [
"Region of \"{}\" is:",
"Locate the text \"{}\" in the image.",
"Identify the text \"{}\" in the image and provide the coordinates."
]
},
"CaptioningDetailed": {
"raw": [
"Create a comprehensive paragraph that captures the essence of the image while weaving a cohesive narrative around its elements.",
"Compose a paragraph that thoroughly describes the image's content, providing context and connections between different aspects of the scene.",
"Provide a detailed, paragraph-length description of the image that paints a vivid picture and tells a coherent story.",
"Write a rich and engaging paragraph that delves into the image's components, describing not only what is seen but also how the elements relate to one another.",
"Give a well-rounded, paragraph-length explanation of the image, describing the scene and its components while forming a complete and engaging narrative.",
"Produce a paragraph that not only describes the individual elements in the image but also weaves them together to form a cohesive, connected account.",
"Construct a paragraph that captures the image's details and context, offering a more in-depth and engaging story than a simple caption.",
"Compose a descriptive paragraph that brings the image to life through detailed storytelling, connecting the various visual elements into a unified narrative.",
"Create a paragraph that provides an extensive and interconnected description of the image, ensuring that the narrative is both detailed and cohesive.",
"Write a compelling and detailed paragraph that delves into the image's components, linking them together to create a unified and engaging story."
]
},
"OCR": {
"raw": [
"Can you read the text from image and output here?",
"Extract and document the text from the provided image.",
"Converting the text embedded in this image into a readable document.",
"Transcribe all the text you find.",
"Can you extract all visible text from the image here?"
],
"markdown": [
"Can you extract all visible text from the provided image?",
"Converting the text embedded in this image into a readable markdown document.",
"Can you read the text in the document as markdown?",
"Transcribe the document as markdown.",
"Extract and document the text from the provided image."
],
"table_markdown": [
"Can you extract all visible text from the provided table?",
"Can you read the text in the provided table as markdown?",
"Transcribe the table as markdown.",
"Extract and document the text from the provided table image."
],
"plain": [
"Transcribe the document as plain text.",
"Extract and document the text from the provided image.",
"Converting the text embedded in this image into a readable document.",
"Transcribe all the text you find.",
"Can you extract all visible text from the image here?"
],
"bbox_plain": [
"Transcribe the document as plain text along with bounding boxes.",
"Extract and document the text from the provided image along with bounding boxes.",
"Converting the text embedded in this image into a readable documen along with bounding boxes.",
"Can you extract all visible text with bounding boxes from the image here?"
]
},
"VQA": {
"raw": [
"Given the image, answer the following question with few words.",
"Answer the following question: ",
"What is the answer to this question?",
"Write the answer: ",
"Please answer this question: "
]
},
"Embedded": {
"raw": [
"Given the image, answer the following question with few words.",
"Answer the following question: ",
"What is the answer to this question?",
"Write the answer: ",
"Please answer this question: "
]
}
}
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import argparse
import os
import sys
import torch
# Add megatron to the path.
sys.path.append(
os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir, os.path.pardir))
)
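# Example usage (pipeline parallel 1 -> 4, as in the NVLM README):
#   python examples/multimodal/nvlm/pp_checkpoint_converter.py --input <pretrained checkpoint directory> \
#       --input-pipeline-parallel 1 --output <some output dir> --output-pipeline-parallel 4 \
#       --tensor-parallel 8
# With 80 decoder layers and an even split, output pp rank 1 holds original layers 20-39,
# renumbered locally to 0-19 (each pp rank restarts layer numbering from 0).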
def split(input_dir, base_output_dir, input_pp, output_pp, num_tp, num_layers_per_pp_rank):
"""Split pipeline parallel size = 1 checkpoint to pipeline parallel size N."""
iter = args.iteration if args.iteration else 1
for tp in range(num_tp):
path = os.path.join(input_dir, f"mp_rank_0{tp}", "model_optim_rng.pt")
sd = torch.load(path)
if num_layers_per_pp_rank is None:
num_layers = sd["args"].num_layers
assert num_layers % output_pp == 0, "specify --num-layers-per-pp-rank for an uneven split"
num_layers_per_pp_rank = [num_layers // output_pp] * output_pp
layer_lb = 0
for pp in range(output_pp):
assert num_layers_per_pp_rank[pp] > 0, "each pp rank must have at least 1 layer"
layer_ub = layer_lb + num_layers_per_pp_rank[pp]
new_sd = sd.copy()
new_sd["model"] = dict()
for k, v in sd["model"].items():
# First pp rank has vision model.
if pp == 0 and ("vision_model" in k or "vision_projection" in k):
new_sd["model"][k] = v
continue
# Only the first pp rank has the word embeddings.
if "language_model.embedding.word_embeddings" in k and pp == 0:
new_sd["model"][k] = v
# Only the last pp rank has the output layer.
if "language_model.output_layer" in k and pp == output_pp - 1:
new_sd["model"][k] = v
# Only the last pp rank has final layer norm.
if pp == output_pp - 1 and (
"language_model.decoder.final_norm" in k # Mamba model
or "language_model.decoder.final_layernorm" in k # GPT model
):
new_sd["model"][k] = v
if "language_model.decoder.layers" in k:
layer_num = int(k.split(".")[3])
if layer_lb <= layer_num and layer_num < layer_ub:
# On all pp ranks, megatron starts layer nums from 0!
new_layer_num = int(layer_num - layer_lb)
k_splitted = k.split(".")
k_splitted[3] = str(new_layer_num)
new_k = ".".join(k_splitted)
new_sd["model"][new_k] = v
output_dir = os.path.join(base_output_dir, f"iter_{iter:0>7}/mp_rank_0{tp}_00{pp}")
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, "model_optim_rng.pt")
torch.save(new_sd, output_path)
print(f"processed tp rank: {tp}/{num_tp - 1} and pp rank: {pp}/{output_pp - 1}")
layer_lb = layer_ub
# This is needed for megatron checkpoint loading.
with open(os.path.join(base_output_dir, "latest_checkpointed_iteration.txt"), "w") as f:
f.write(f"{iter}")
def combine(input_dir, base_output_dir, input_pp, output_pp, num_tp, num_layers_per_pp_rank):
"""Combine pipeline parallel size = N checkpoint to pipeline parallel size 1."""
iter = args.iteration if args.iteration else 1
for tp in range(num_tp):
new_sd = None
layer_num_offset = 0
max_layer_num = 0
for pp in range(input_pp):
path = os.path.join(input_dir, f"mp_rank_0{tp}_00{pp}", "model_optim_rng.pt")
sd = torch.load(path)
if pp == 0:
new_sd = sd.copy()
new_sd["model"] = dict()
new_sd["args"].pipeline_model_parallel_size = 1
assert new_sd is not None
for k, v in sd["model"].items():
# First pp rank has vision model.
if pp == 0 and ("vision_model" in k or "vision_projection" in k):
new_sd["model"][k] = v
continue
# Only the first pp rank has the word embeddings.
if "language_model.embedding.word_embeddings" in k and pp == 0:
new_sd["model"][k] = v
# Only the last pp rank has the output layer.
if "language_model.output_layer" in k and pp == input_pp - 1:
new_sd["model"][k] = v
# Only the last pp rank has final layer norm.
if pp == input_pp - 1 and (
"language_model.decoder.final_norm" in k # Mamba model
or "language_model.decoder.final_layernorm" in k # GPT model
):
new_sd["model"][k] = v
if "language_model.decoder.layers" in k:
layer_num = int(k.split(".")[3])
# On all pp ranks, megatron starts layer nums from 0!
new_layer_num = layer_num_offset + layer_num
if new_layer_num > max_layer_num:
max_layer_num = new_layer_num
k_splitted = k.split(".")
k_splitted[3] = str(new_layer_num)
new_k = ".".join(k_splitted)
new_sd["model"][new_k] = v
print(f"processed tp rank: {tp}/{num_tp - 1} and pp rank: {pp}/{input_pp - 1}")
layer_num_offset = max_layer_num + 1
output_dir = os.path.join(base_output_dir, f"iter_{iter:0>7}/mp_rank_0{tp}")
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, "model_optim_rng.pt")
torch.save(new_sd, output_path)
# This is needed for megatron checkpoint loading.
with open(os.path.join(base_output_dir, "latest_checkpointed_iteration.txt"), "w") as f:
f.write(f"{iter}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Change pipeline parallelism for a model",
formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.add_argument(
"--input", type=str, required=True, help="Input model directory"
)
parser.add_argument(
"--input-pipeline-parallel", type=int, required=True, help="Input model pipeline parallelism"
)
parser.add_argument(
"--output", type=str, required=True, help="Output model directory"
)
parser.add_argument(
"--output-pipeline-parallel", type=int, required=True, help="Output model pipeline parallelism"
)
parser.add_argument(
"--tensor-parallel", type=int, required=True, help="Model tensor parallel size",
)
parser.add_argument(
"--num-layers-per-pp-rank", type=int, default=None, nargs="*", help="Specify this for uneven pipeline parallel split",
)
parser.add_argument(
"--iteration", type=int, default=None, help="Specify checkpoint iteration",
)
args = parser.parse_args()
f = None
if args.input_pipeline_parallel == 1 and args.output_pipeline_parallel > 1:
f = split
elif args.input_pipeline_parallel > 1 and args.output_pipeline_parallel == 1:
f = combine
else:
raise NotImplementedError("Only pipeline parallel 1 to N and N to 1 are supported")
f(args.input, args.output, args.input_pipeline_parallel, args.output_pipeline_parallel, args.tensor_parallel, args.num_layers_per_pp_rank)
print("done.")
__module__: megatron.energon
__class__: Metadataset
splits:
train:
datasets:
- weight: 0.579 # Datasets are weighted according to their size. Weights sum up to 1.
path: <path to laion dataset>
subflavors:
augmentation: False
- weight: 0.02
path: <path to coco>
subflavors:
augmentation: False
- weight: 0.01
path: <path to vqav2 dataset>
subflavors:
augmentation: False
# Please refer to Table 4 in https://arxiv.org/pdf/2409.11402 for full list of pretrain datasets.
# Please refer to https://nvidia.github.io/Megatron-Energon/data_prep.html on preparing datasets in the Megatron Energon format.
val:
datasets:
- weight: 1.
path: <path to validation dataset>
subflavors:
augmentation: False
#!/bin/bash
# Your SBATCH commands here if using SLURM.
# Please launch this script from megatron-lm root.
# Train a multimodal model.
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export TOKENIZERS_PARALLELISM="false"
DEBUG=0
if [[ $BATCH -eq 0 ]]; then
DATETIME=`date +'%y-%m-%d-%H-%M-%S'`
MODEL_NAME="mcore-qwen20-72b-internvit-${DATETIME}"
else
MODEL_NAME="mcore-qwen20-72b-internvit"
fi
WORKSPACE="<some dir>"
SOURCE=`pwd`
OUTPUT_BASE="${WORKSPACE}/output"
OUTPUT="${OUTPUT_BASE}/${MODEL_NAME}"
FINETUNE_DIR=${OUTPUT}/checkpoints
LOGS_DIR="${OUTPUT}/logs"
TENSORBOARD_DIR="${OUTPUT}/tensorboard"
CHECKPOINT_DIR="${WORKSPACE}/combined-qwen2.0-72b-instruct-internvit-6b-448px-1.5-tp8-te"
DATA_TRAIN="${SOURCE}/examples/multimodal/nvlm/pretrain_blend.yaml"
if [[ $DEBUG -eq 1 ]]; then
MBZ=1
BZ=1
NW=0
AD=0.0
HD=0.0
LI=1
EXTRA_ARGS=""
ALLOW_NONDETERMINISTIC=1
else
MBZ=1
BZ=2048
NW=8
AD=0.1
HD=0.1
LI=5
EXTRA_ARGS=""
ALLOW_NONDETERMINISTIC=1
fi
SEQ_LEN=256 # Image embeddings sequence length.
DECODER_SEQ_LEN=512 # Language model sequence length.
MAX_POS_EMBED=512
OPTIONS=" \
--use-checkpoint-args \
--exit-duration-in-mins 230 \
--disable-bias-linear \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model Qwen/Qwen2-72B-Instruct \
--tokenizer-prompt-format qwen2p0 \
--transformer-impl transformer_engine \
--normalization RMSNorm \
--norm-epsilon 1e-06 \
--group-query-attention \
--num-query-groups 8 \
--no-masked-softmax-fusion \
--attention-softmax-in-fp32 \
--attention-dropout ${AD} \
--hidden-dropout ${HD} \
--untie-embeddings-and-output-weights \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--swiglu \
--tensor-model-parallel-size 8 \
--pipeline-model-parallel-size 1 \
--num-layers 80 \
--hidden-size 8192 \
--ffn-hidden-size 29568 \
--add-qkv-bias \
--num-attention-heads 64 \
--use-distributed-optimizer \
--use-te \
--num-workers ${NW} \
--seq-length ${SEQ_LEN} \
--decoder-seq-length ${DECODER_SEQ_LEN} \
--max-position-embeddings 32768 \
--train-samples 122880000 \
--lr-decay-samples 25600000 \
--lr-warmup-samples 83200 \
--micro-batch-size ${MBZ} \
--global-batch-size ${BZ} \
--lr 1e-4 \
--min-lr 2.5e-5 \
--lr-decay-style cosine \
--log-interval ${LI} \
--eval-iters 10 \
--eval-interval 500 \
--data-path ${DATA_TRAIN} \
--prompt-path ${SOURCE}/examples/multimodal/nvlm/nvlm_prompts.json \
--save-interval 5000 \
--save ${FINETUNE_DIR} \
--load ${FINETUNE_DIR} \
--dataloader-save ${FINETUNE_DIR}/dataloader \
--pretrained-checkpoint ${CHECKPOINT_DIR} \
--split 100,0,0 \
--clip-grad 10.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.014 \
--bf16 \
--eod-mask-loss \
--freeze-ViT \
--freeze-LM \
--patch-dim 14 \
--img-h 448 \
--img-w 448 \
--dataloader-type external \
--tensorboard-dir ${TENSORBOARD_DIR} \
--language-model-type qwen2.0_72B \
${EXTRA_ARGS} \
--allow-missing-vision-projection-checkpoint \
--vision-model-type internvit \
--disable-vision-class-token \
--log-params-norm \
--log-num-zeros-in-grad \
--ckpt-format torch \
--pixel-shuffle \
--image-tag-type nvlm
"
export NVTE_APPLY_QK_LAYER_SCALING=0
export NVTE_ALLOW_NONDETERMINISTIC_ALGO=${ALLOW_NONDETERMINISTIC}
# Interactive or batch mode
if [[ $BATCH -eq 0 ]]; then
torchrun --nproc_per_node 8 examples/multimodal/train.py ${OPTIONS}
else
run_cmd="python -u ${SOURCE}/examples/multimodal/train.py ${OPTIONS}"
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
srun -l --verbose \
--container-image <path to docker image> \
--container-mounts "<some mount>" \
--output=${LOGS_DIR}/%x_%j_$DATETIME.log \
sh -c "${run_cmd}"
set +x
fi
#!/bin/bash
# Your SBATCH commands here if using SLURM.
# Please launch this script from megatron-lm root.
# Train a multimodal model.
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export TOKENIZERS_PARALLELISM="false"
DEBUG=0
if [[ $BATCH -eq 0 ]]; then
DATETIME=`date +'%y-%m-%d-%H-%M-%S'`
MODEL_NAME="mcore-nous-yi34b-internvit-mlp-${DATETIME}"
else
MODEL_NAME="mcore-nous-yi34b-internvit-mlp"
fi
WORKSPACE="<some dir>"
SOURCE=`pwd`
OUTPUT_BASE="${WORKSPACE}/output"
OUTPUT="${OUTPUT_BASE}/${MODEL_NAME}"
FINETUNE_DIR=${OUTPUT}/checkpoints
LOGS_DIR="${OUTPUT}/logs"
TENSORBOARD_DIR="${OUTPUT}/tensorboard"
LOAD_NAME="combined-yi-34b-internvit-tp8-mcore"
CHECKPOINT_DIR="${WORKSPACE}/${LOAD_NAME}"
DATA_TRAIN="${SOURCE}/examples/multimodal/nvlm/pretrain_blend.yaml"
if [[ $DEBUG -eq 1 ]]; then
MBZ=1
BZ=1
NW=0
LI=1
AD=0.0
HD=0.0
EXTRA_ARGS=""
ALLOW_NONDETERMINISTIC=1
else
MBZ=1
BZ=2048
NW=8
LI=5
AD=0.1
HD=0.1
EXTRA_ARGS=""
ALLOW_NONDETERMINISTIC=1
fi
SEQ_LEN=256 # Image embeddings sequence length.
DECODER_SEQ_LEN=512 # Language model sequence length.
MAX_POS_EMBED=512
OPTIONS=" \
--swiglu \
--use-distributed-optimizer \
--num-workers ${NW} \
--num-layers 60 \
--hidden-size 7168 \
--normalization RMSNorm \
--num-attention-heads 56 \
--exit-duration-in-mins 230 \
--group-query-attention \
--num-query-groups 8 \
--ffn-hidden-size 20480 \
--seq-length ${SEQ_LEN} \
--decoder-seq-length ${DECODER_SEQ_LEN} \
--max-position-embeddings ${MAX_POS_EMBED} \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model NousResearch/Nous-Hermes-2-Yi-34B \
--tokenizer-prompt-format nvlm-yi-34b \
--vocab-size 64000 \
--make-vocab-size-divisible-by 1 \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 5000000 \
--disable-bias-linear \
--tensor-model-parallel-size 8 \
--language-model-type yi-34b \
--vision-model-type internvit \
--micro-batch-size ${MBZ} \
--global-batch-size ${BZ} \
--train-samples 122880000 \
--lr-decay-samples 25600000 \
--lr-warmup-samples 83200 \
--lr 1e-4 \
--min-lr 2.5e-5 \
--lr-decay-style cosine \
--clip-grad 10.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.014 \
--attention-dropout ${AD} \
--hidden-dropout ${HD} \
--untie-embeddings-and-output-weights \
--eod-mask-loss \
--bf16 \
--tensorboard-dir=${TENSORBOARD_DIR} \
--freeze-LM \
--freeze-ViT \
--img-h 448 \
--img-w 448 \
--patch-dim 14 \
--data-path ${DATA_TRAIN} \
--dataloader-type external \
--split 100,0,0 \
--prompt-path ${SOURCE}/examples/multimodal/nvlm/nvlm_prompts.json \
--log-interval ${LI} \
--save-interval 2000 \
--eval-interval 500 \
--eval-iters 10 \
--log-params-norm \
--log-num-zeros-in-grad \
${EXTRA_ARGS} \
--save ${FINETUNE_DIR} \
--load ${FINETUNE_DIR} \
--dataloader-save ${FINETUNE_DIR}/dataloader \
--pretrained-checkpoint ${CHECKPOINT_DIR} \
--allow-missing-vision-projection-checkpoint \
--disable-vision-class-token \
--use-te \
--use-checkpoint-args \
--ckpt-format torch \
--pixel-shuffle \
--image-tag-type nvlm
"
export NVTE_ALLOW_NONDETERMINISTIC_ALGO=${ALLOW_NONDETERMINISTIC}
export NVTE_APPLY_QK_LAYER_SCALING=0
# Interactive or batch mode
if [[ $BATCH -eq 0 ]]; then
torchrun --nproc_per_node 8 examples/multimodal/train.py ${OPTIONS}
else
run_cmd="python -u ${SOURCE}/examples/multimodal/train.py ${OPTIONS}"
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
srun -l --verbose \
--container-image <path to docker image> \
--container-mounts "<some mount>" \
--output=${LOGS_DIR}/%x_%j_$DATETIME.log \
sh -c "${run_cmd}"
set +x
fi
#!/bin/bash
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NVTE_APPLY_QK_LAYER_SCALING=0
export TOKENIZERS_PARALLELISM="false"
INPUT_IMAGE_PATH="placeholder"
GROUNDTRUTH_PATH="placeholder"
USE_TILING=0
USE_PIXEL_SHUFFLE_ONLY=0
while [[ $# -gt 0 ]]; do
case $1 in
--input-image-path)
INPUT_IMAGE_PATH="$2"
shift
shift
;;
-o|--output-path)
OUTPUT_PATH="$2"
shift
shift
;;
-m|--model-path)
MODEL_PATH="$2"
shift
shift
;;
--task)
TASK="$2"
shift
shift
;;
-g|--gt-path)
GROUNDTRUTH_PATH="$2"
shift
shift
;;
--use-tiling)
USE_TILING=1
shift
;;
--use-pixel-shuffle-only)
USE_PIXEL_SHUFFLE_ONLY=1
shift
;;
-*|--*)
echo "Invalid option $1"
exit 1
;;
esac
done
# Please modify these as needed.
NUM_PARTITIONS=0
START=0
END=0
SEQ_LEN=1024 # Image embeddings sequence length.
DECODER_SEQ_LEN=8192 # Language model sequence length.
MAX_POS_EMBED=8192
# Additional arguments.
EXTRA_ARGS=""
if [[ $USE_TILING -eq 1 ]]; then
EXTRA_ARGS+=" --pixel-shuffle --use-tiling --max-num-tiles 6 --use-thumbnail --use-tile-tags"
SEQ_LEN=261 # Image embeddings sequence length (256 image embeddings + 5 tile tag embeddings).
fi
if [[ $USE_PIXEL_SHUFFLE_ONLY -eq 1 ]]; then
EXTRA_ARGS+=" --pixel-shuffle"
SEQ_LEN=256
fi
for PARTITION_ID in $( eval echo {$START..$END} )
do
torchrun --nproc_per_node 8 examples/multimodal/run_text_generation.py \
--attention-softmax-in-fp32 \
--no-masked-softmax-fusion \
--swiglu \
--num-layers 80 \
--hidden-size 8192 \
--normalization RMSNorm \
--norm-epsilon 1e-06 \
--num-attention-heads 64 \
--exit-on-missing-checkpoint \
--group-query-attention \
--num-query-groups 8 \
--ffn-hidden-size 29568 \
--load ${MODEL_PATH} \
--seq-length ${SEQ_LEN} \
--decoder-seq-length ${DECODER_SEQ_LEN} \
--max-position-embeddings ${MAX_POS_EMBED} \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model Qwen/Qwen2-72B-Instruct \
--tokenizer-prompt-format qwen2p0 \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--disable-bias-linear \
--add-qkv-bias \
--tensor-model-parallel-size 8 \
--pipeline-model-parallel-size 1 \
--language-model-type qwen2.0_72B \
--vision-model-type internvit \
--micro-batch-size 1 \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--bf16 \
--freeze-LM \
--freeze-ViT \
--img-h 448 \
--img-w 448 \
--patch-dim 14 \
--use-te \
--transformer-impl transformer_engine \
--use-checkpoint-args \
--out-seq-length 16 \
--temperature 1.0 \
--seed 1234 \
--top_k 1 \
--no-load-rng \
--no-load-optim \
--num-partitions ${NUM_PARTITIONS} \
--partition-id ${PARTITION_ID} \
--output-path ${OUTPUT_PATH} \
--gt-path ${GROUNDTRUTH_PATH} \
--disable-vision-class-token \
--input-image-path ${INPUT_IMAGE_PATH} \
${EXTRA_ARGS} \
--task ${TASK} \
--image-tag-type nvlm \
--ckpt-format torch
done
#!/bin/bash
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NVTE_APPLY_QK_LAYER_SCALING=0
export TOKENIZERS_PARALLELISM="false"
INPUT_IMAGE_PATH="placeholder"
GROUNDTRUTH_PATH="placeholder"
while [[ $# -gt 0 ]]; do
case $1 in
--input-image-path)
INPUT_IMAGE_PATH="$2"
shift
shift
;;
--input-metadata-path)
INPUT_METADATA_PATH="$2"
shift
shift
;;
--num-frames)
NUM_FRAMES="$2"
shift
shift
;;
-g|--groundtruth-path)
GROUNDTRUTH_PATH="$2"
shift
shift
;;
-o|--output-path)
OUTPUT_PATH="$2"
shift
shift
;;
-m|--model-path)
MODEL_PATH="$2"
shift
shift
;;
--task)
TASK="$2"
shift
shift
;;
-g|--gt-path)
GROUNDTRUTH_PATH="$2"
shift
shift
;;
-*|--*)
echo "Invalid option $1"
exit 1
;;
esac
done
# Please modify these as needed.
NUM_PARTITIONS=0
START=0
END=0
SEQ_LEN=256
DECODER_SEQ_LEN=16384
EXTRA_ARGS=" --pixel-shuffle"
for PARTITION_ID in $( eval echo {$START..$END} )
do
torchrun --nproc_per_node 8 examples/multimodal/run_text_generation.py \
--attention-softmax-in-fp32 \
--transformer-impl transformer_engine \
--use-te \
--use-checkpoint-args \
--normalization RMSNorm \
--norm-epsilon 1e-06 \
--language-model-type=qwen2.5_7B \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--swiglu \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--tensor-model-parallel-size 4 \
--pipeline-model-parallel-size 1 \
--group-query-attention \
--num-query-groups 4 \
--num-layers 28 \
--hidden-size 3584 \
--ffn-hidden-size 18944 \
--add-qkv-bias \
--num-attention-heads 28 \
--max-position-embeddings 32768 \
--no-masked-softmax-fusion \
--load ${MODEL_PATH} \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model Qwen/Qwen2.5-7B-Instruct \
--tokenizer-prompt-format qwen2p5 \
--bf16 \
--micro-batch-size 1 \
--seq-length ${SEQ_LEN} \
--decoder-seq-length ${DECODER_SEQ_LEN} \
--out-seq-length 128 \
--temperature 1.0 \
--img-h 448 \
--img-w 448 \
--patch-dim 14 \
--seed 153 \
--top_k 1 \
--no-load-rng \
--no-load-optim \
--input-image-path ${INPUT_IMAGE_PATH} \
--num-partitions ${NUM_PARTITIONS} \
--partition-id ${PARTITION_ID} \
--output-path ${OUTPUT_PATH} \
--gt-path ${GROUNDTRUTH_PATH} \
--task ${TASK} \
${EXTRA_ARGS} \
--special-tokens "<image>" "<img>" "</img>" \
--vision-model-type internvit \
--num-frames ${NUM_FRAMES} \
--ckpt-format torch
done
#!/bin/bash
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NVTE_APPLY_QK_LAYER_SCALING=0
export TOKENIZERS_PARALLELISM="false"
INPUT_IMAGE_PATH="placeholder"
GROUNDTRUTH_PATH="placeholder"
while [[ $# -gt 0 ]]; do
case $1 in
-i|--input-image-path)
INPUT_IMAGE_PATH="$2"
shift
shift
;;
-o|--output-path)
OUTPUT_PATH="$2"
shift
shift
;;
-m|--model-path)
MODEL_PATH="$2"
shift
shift
;;
-t|--task)
TASK="$2"
shift
shift
;;
-g|--gt-path)
GROUNDTRUTH_PATH="$2"
shift
shift
;;
-*|--*)
echo "Invalid option $1"
exit 1
;;
esac
done
# Please modify these as needed.
NUM_PARTITIONS=0
START=0
END=0
SEQ_LEN=256
DECODER_SEQ_LEN=8192
EXTRA_ARGS=" --pixel-shuffle --use-tiling --max-num-tiles 12 --use-thumbnail"
for PARTITION_ID in $( eval echo {$START..$END} )
do
torchrun --nproc_per_node 8 examples/multimodal/run_text_generation.py \
--attention-softmax-in-fp32 \
--transformer-impl transformer_engine \
--use-te \
--use-checkpoint-args \
--normalization RMSNorm \
--norm-epsilon 1e-06 \
--language-model-type=qwen2.5_7B \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--swiglu \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--tensor-model-parallel-size 4 \
--pipeline-model-parallel-size 1 \
--group-query-attention \
--num-query-groups 4 \
--num-layers 28 \
--hidden-size 3584 \
--ffn-hidden-size 18944 \
--add-qkv-bias \
--num-attention-heads 28 \
--max-position-embeddings 32768 \
--no-masked-softmax-fusion \
--load ${MODEL_PATH} \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model Qwen/Qwen2.5-7B-Instruct \
--tokenizer-prompt-format qwen2p5 \
--bf16 \
--micro-batch-size 1 \
--seq-length ${SEQ_LEN} \
--decoder-seq-length ${DECODER_SEQ_LEN} \
--out-seq-length 128 \
--temperature 1.0 \
--img-h 448 \
--img-w 448 \
--patch-dim 14 \
--seed 153 \
--top_k 1 \
--no-load-rng \
--no-load-optim \
--input-image-path ${INPUT_IMAGE_PATH} \
--num-partitions ${NUM_PARTITIONS} \
--partition-id ${PARTITION_ID} \
--output-path ${OUTPUT_PATH} \
--gt-path ${GROUNDTRUTH_PATH} \
--task ${TASK} \
${EXTRA_ARGS} \
--special-tokens "<image>" "<img>" "</img>" \
--vision-model-type siglip \
--ckpt-format torch
done
#!/bin/bash
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NVTE_APPLY_QK_LAYER_SCALING=0
INPUT_IMAGE_PATH="placeholder"
GROUNDTRUTH_PATH="placeholder"
USE_TILING=0
USE_PIXEL_SHUFFLE_ONLY=0
while [[ $# -gt 0 ]]; do
case $1 in
--input-image-path)
INPUT_IMAGE_PATH="$2"
shift
shift
;;
-o|--output-path)
OUTPUT_PATH="$2"
shift
shift
;;
-m|--model-path)
MODEL_PATH="$2"
shift
shift
;;
--task)
TASK="$2"
shift
shift
;;
-g|--gt-path)
GROUNDTRUTH_PATH="$2"
shift
shift
;;
--use-tiling)
USE_TILING=1
shift
;;
--use-pixel-shuffle-only)
USE_PIXEL_SHUFFLE_ONLY=1
shift
;;
-*|--*)
echo "Invalid option $1"
exit 1
;;
esac
done
# Please modify these as needed.
NUM_PARTITIONS=0
START=0
END=0
SEQ_LEN=1024 # Image embeddings sequence length.
DECODER_SEQ_LEN=8192 # Language model sequence length.
MAX_POS_EMBED=8192
# Additional arguments.
EXTRA_ARGS=""
if [[ $USE_TILING -eq 1 ]]; then
EXTRA_ARGS+=" --pixel-shuffle --use-tiling --max-num-tiles 6 --use-thumbnail --use-tile-tags"
SEQ_LEN=261 # Image embeddings sequence length (256 image embeddings + 5 tile tag embeddings).
fi
if [[ $USE_PIXEL_SHUFFLE_ONLY -eq 1 ]]; then
EXTRA_ARGS+=" --pixel-shuffle"
SEQ_LEN=256
fi
for PARTITION_ID in $( eval echo {$START..$END} )
do
torchrun --nproc_per_node 8 examples/multimodal/run_text_generation.py \
--attention-softmax-in-fp32 \
--no-masked-softmax-fusion \
--swiglu \
--num-layers 60 \
--hidden-size 7168 \
--normalization RMSNorm \
--num-attention-heads 56 \
--exit-on-missing-checkpoint \
--group-query-attention \
--num-query-groups 8 \
--ffn-hidden-size 20480 \
--load ${MODEL_PATH} \
--seq-length ${SEQ_LEN} \
--decoder-seq-length ${DECODER_SEQ_LEN} \
--max-position-embeddings ${MAX_POS_EMBED} \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model NousResearch/Nous-Hermes-2-Yi-34B \
--tokenizer-prompt-format nvlm-yi-34b \
--vocab-size 64000 \
--make-vocab-size-divisible-by 1 \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 5000000 \
--disable-bias-linear \
--tensor-model-parallel-size 8 \
--pipeline-model-parallel-size 1 \
--language-model-type yi-34b \
--vision-model-type internvit \
--micro-batch-size 1 \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--bf16 \
--freeze-LM \
--freeze-ViT \
--img-h 448 \
--img-w 448 \
--patch-dim 14 \
--use-te \
--transformer-impl transformer_engine \
--use-checkpoint-args \
--out-seq-length 16 \
--temperature 1.0 \
--seed 1234 \
--top_k 1 \
--no-load-rng \
--no-load-optim \
--num-partitions ${NUM_PARTITIONS} \
--partition-id ${PARTITION_ID} \
--output-path ${OUTPUT_PATH} \
--gt-path ${GROUNDTRUTH_PATH} \
--disable-vision-class-token \
--input-image-path ${INPUT_IMAGE_PATH} \
${EXTRA_ARGS} \
--task ${TASK} \
--image-tag-type nvlm \
--ckpt-format torch
done
#!/bin/bash
# Your SBATCH commands here if using SLURM.
# Please launch this script from megatron-lm root.
# Train a multimodal model.
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_ALGO=^NVLS
export TOKENIZERS_PARALLELISM="false"
DEBUG=0
if [[ $BATCH -eq 0 ]]; then
DATETIME=`date +'%y-%m-%d-%H-%M-%S'`
MODEL_NAME="mcore-nous-yi34b-internvit-mlp-sft-${DATETIME}"
else
MODEL_NAME="mcore-nous-yi34b-internvit-mlp-sft"
fi
WORKSPACE="<some dir>"
SOURCE=`pwd`
OUTPUT_BASE="${WORKSPACE}/output"
OUTPUT="${OUTPUT_BASE}/${MODEL_NAME}"
FINETUNE_DIR=${OUTPUT}/checkpoints
LOGS_DIR="${OUTPUT}/logs"
TENSORBOARD_DIR="${OUTPUT}/tensorboard"
LOAD_NAME="mcore-nous-yi34b-internvit-mlp" # From pretraining
CHECKPOINT_DIR="${WORKSPACE}/output/${LOAD_NAME}/checkpoints"
DATA_TRAIN="${SOURCE}/examples/multimodal/nvlm/sft_blend.yaml"
if [[ $DEBUG -eq 1 ]]; then
MBZ=1
BZ=1
NW=0
LI=1
AD=0.0
HD=0.0
ALLOW_NONDETERMINISTIC=1
# Can run out of GPU memory in interactive mode without this.
# This is just for interactive testing purposes. Do not use for proper training.
EXTRA_ARGS=" --freeze-LM"
else
MBZ=1
BZ=128
NW=2
LI=5
AD=0.0
HD=0.0
ALLOW_NONDETERMINISTIC=1
EXTRA_ARGS=""
fi
SEQ_LEN=261 # Image embeddings sequence length (256 image embeddings + 5 tile tag embeddings).
DECODER_SEQ_LEN=3200 # Language model sequence length.
MAX_POS_EMBED=3200
OPTIONS=" \
--swiglu \
--use-distributed-optimizer \
--num-workers ${NW} \
--num-layers 60 \
--hidden-size 7168 \
--normalization RMSNorm \
--num-attention-heads 56 \
--exit-duration-in-mins 230 \
--group-query-attention \
--num-query-groups 8 \
--ffn-hidden-size 20480 \
--seq-length ${SEQ_LEN} \
--decoder-seq-length ${DECODER_SEQ_LEN} \
--max-position-embeddings ${MAX_POS_EMBED} \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model NousResearch/Nous-Hermes-2-Yi-34B \
--tokenizer-prompt-format nvlm-yi-34b \
--vocab-size 64000 \
--make-vocab-size-divisible-by 1 \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 5000000 \
--disable-bias-linear \
--tensor-model-parallel-size 8 \
--language-model-type yi-34b \
--vision-model-type internvit \
--micro-batch-size ${MBZ} \
--global-batch-size ${BZ} \
--train-samples 30000000 \
--lr-decay-samples 25600000 \
--lr-warmup-samples 83200 \
--lr 2e-6 \
--min-lr 2.5e-7 \
--lr-decay-style cosine \
--split 100,0,0 \
--clip-grad 10 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.014 \
--attention-dropout ${AD} \
--hidden-dropout ${HD} \
--untie-embeddings-and-output-weights \
--eod-mask-loss \
--bf16 \
--tensorboard-dir=${TENSORBOARD_DIR} \
--freeze-ViT \
--img-h 448 \
--img-w 448 \
--patch-dim 14 \
--data-path ${DATA_TRAIN} \
--dataloader-type external \
--dataloader-save ${FINETUNE_DIR}/dataloader \
--prompt-path ${SOURCE}/examples/multimodal/nvlm/nvlm_prompts.json \
--log-interval ${LI} \
--load ${FINETUNE_DIR} \
--save ${FINETUNE_DIR} \
--pretrained-checkpoint ${CHECKPOINT_DIR} \
--save-interval 5000 \
--eval-interval 500 \
--eval-iters 10 \
--log-params-norm \
--log-num-zeros-in-grad \
${EXTRA_ARGS} \
--disable-vision-class-token \
--use-te \
--ckpt-format torch \
--pixel-shuffle \
--use-tiling \
--max-num-tiles 6 \
--use-thumbnail \
--use-tile-tags \
--image-tag-type nvlm
"
export NVTE_ALLOW_NONDETERMINISTIC_ALGO=${ALLOW_NONDETERMINISTIC}
export NVTE_APPLY_QK_LAYER_SCALING=0
# Interactive or batch mode
if [[ $BATCH -eq 0 ]]; then
torchrun --nproc_per_node 8 examples/multimodal/train.py ${OPTIONS}
else
run_cmd="python -u ${SOURCE}/examples/multimodal/train.py ${OPTIONS}"
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
srun -l --verbose \
--container-image <path to docker image> \
--container-mounts "<some mount>" \
--output=${LOGS_DIR}/%x_%j_$DATETIME.log \
sh -c "${run_cmd}"
set +x
fi
__module__: megatron.energon
__class__: Metadataset
splits:
train:
datasets:
- weight: 0.01 # Datasets are weighted according to their size. Weights sum up to 1.
path: <path to coco>
subflavors:
augmentation: False
- weight: 0.02
path: <path to clevr-math dataset>
subflavors:
augmentation: False
# Please refer to Table 6 in https://arxiv.org/pdf/2409.11402 for full list of SFT datasets.
# Please refer to https://nvidia.github.io/Megatron-Energon/data_prep.html on preparing datasets in the Megatron Energon format.
val:
datasets:
- weight: 1.
path: <path to validation dataset>
subflavors:
augmentation: False
#!/bin/bash
# Your SBATCH commands here if using SLURM.
# Please launch this script from megatron-lm root.
# Train a multimodal model.
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_ALGO=^NVLS
export TOKENIZERS_PARALLELISM="false"
DEBUG=0
if [[ $BATCH -eq 0 ]]; then
DATETIME=`date +'%y-%m-%d-%H-%M-%S'`
MODEL_NAME="mcore-qwen20-72b-internvit-sft-${DATETIME}"
else
MODEL_NAME="mcore-qwen20-72b-internvit-sft"
fi
WORKSPACE="<some dir>"
SOURCE=`pwd`
OUTPUT_BASE="${WORKSPACE}/output"
OUTPUT="${OUTPUT_BASE}/${MODEL_NAME}"
FINETUNE_DIR="${OUTPUT}/checkpoints"
LOGS_DIR="${OUTPUT}/logs"
TENSORBOARD_DIR="${OUTPUT}/tensorboard"
# From pretraining. The pretraining checkpoint must be manually split to 4 pipeline parallel stages.
# Please refer to README.md and run examples/multimodal/nvlm/pp_checkpoint_converter.py.
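# Illustrative invocation only; the flag names below are assumptions, check the converter's --help for its actual interface:
#   python examples/multimodal/nvlm/pp_checkpoint_converter.py \
#     --input <pretraining checkpoint dir> --input-pipeline-parallel 1 \
#     --output <output checkpoint dir> --output-pipeline-parallel 4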
LOAD_NAME="mcore-qwen20-72b-internvit-pp4"
CHECKPOINT_DIR="${WORKSPACE}/output/${LOAD_NAME}/checkpoints"
DATA_TRAIN="${SOURCE}/examples/multimodal/nvlm/sft_blend.yaml"
if [[ $DEBUG -eq 1 ]]; then
MBZ=1
BZ=1
NW=0
AD=0.0
HD=0.0
LI=1
# This is just for interactive testing purposes. Do not use for proper training.
EXTRA_ARGS="--freeze-LM"
ALLOW_NONDETERMINISTIC=1
else
MBZ=1
BZ=256
NW=8
AD=0.0
HD=0.0
LI=5
EXTRA_ARGS=""
ALLOW_NONDETERMINISTIC=1
fi
SEQ_LEN=261 # Image embeddings sequence length (256 image embeddings + 5 tile tag embeddings).
DECODER_SEQ_LEN=3200 # Language model sequence length.
MAX_POS_EMBED=8192
OPTIONS=" \
--use-checkpoint-args \
--exit-duration-in-mins 230 \
--disable-bias-linear \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model Qwen/Qwen2-72B-Instruct \
--tokenizer-prompt-format qwen2p0 \
--transformer-impl transformer_engine \
--normalization RMSNorm \
--norm-epsilon 1e-06 \
--group-query-attention \
--num-query-groups 8 \
--no-masked-softmax-fusion \
--attention-softmax-in-fp32 \
--attention-dropout ${AD} \
--hidden-dropout ${HD} \
--untie-embeddings-and-output-weights \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--swiglu \
--tensor-model-parallel-size 8 \
--pipeline-model-parallel-size 4 \
--num-layers 80 \
--hidden-size 8192 \
--ffn-hidden-size 29568 \
--add-qkv-bias \
--num-attention-heads 64 \
--use-distributed-optimizer \
--use-te \
--num-workers ${NW} \
--seq-length ${SEQ_LEN} \
--decoder-seq-length ${DECODER_SEQ_LEN} \
--max-position-embeddings 32768 \
--train-samples 122880000 \
--lr-decay-samples 25600000 \
--lr-warmup-samples 83200 \
--micro-batch-size ${MBZ} \
--global-batch-size ${BZ} \
--lr 2e-6 \
--min-lr 2.5e-7 \
--lr-decay-style cosine \
--log-interval ${LI} \
--eval-iters 10 \
--eval-interval 500 \
--data-path ${DATA_TRAIN} \
--prompt-path ${SOURCE}/examples/multimodal/nvlm/nvlm_prompts.json \
--save-interval 10000 \
--save ${FINETUNE_DIR} \
--load ${FINETUNE_DIR} \
--dataloader-save ${FINETUNE_DIR}/dataloader \
--pretrained-checkpoint ${CHECKPOINT_DIR} \
--split 100,0,0 \
--clip-grad 10.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.014 \
--bf16 \
--eod-mask-loss \
--freeze-ViT \
--patch-dim 14 \
--img-h 448 \
--img-w 448 \
--dataloader-type external \
--tensorboard-dir ${TENSORBOARD_DIR} \
--language-model-type qwen2.0_72B \
${EXTRA_ARGS} \
--vision-model-type internvit \
--disable-vision-class-token \
--log-params-norm \
--log-num-zeros-in-grad \
--ckpt-format torch \
--pixel-shuffle \
--use-tiling \
--max-num-tiles 6 \
--use-thumbnail \
--use-tile-tags \
--image-tag-type nvlm
"
export NVTE_APPLY_QK_LAYER_SCALING=0
export NVTE_ALLOW_NONDETERMINISTIC_ALGO=${ALLOW_NONDETERMINISTIC}
# Interactive or batch mode
if [[ $BATCH -eq 0 ]]; then
torchrun --nproc_per_node 8 examples/multimodal/train.py ${OPTIONS}
else
run_cmd="python -u ${SOURCE}/examples/multimodal/train.py ${OPTIONS}"
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
srun -l --verbose \
--container-image <path to docker image> \
--container-mounts "<some mount>" \
--output=${LOGS_DIR}/%x_%j_$DATETIME.log \
sh -c "${run_cmd}"
set +x
fi
#!/bin/bash
# Your SBATCH commands here if using SLURM.
# Please launch this script from megatron-lm root.
# Train a multimodal model.
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_ALGO=^NVLS
export TOKENIZERS_PARALLELISM=false
USER=$SLURM_JOB_USER
# Auto-detect batch or interactive mode.
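# 'which srun' exits with status 0 when srun is available, so BATCH becomes 1 on SLURM clusters and 0 for interactive runs.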
which srun
BATCH=$((1-$?))
DEBUG=0
if [[ $BATCH -eq 0 ]]; then
DATETIME=`date +'%y-%m-%d-%H-%M-%S'`
MODEL_NAME="qwen2.5-7B-internvit-video-sft-nvlm-${DATETIME}"
else
MODEL_NAME="qwen2.5-7B-internvitp-video-sft-nvlm"
DEBUG=0
fi
WORKSPACE="<some dir>"
SOURCE=`pwd`
OUTPUT_BASE="${WORKSPACE}/output"
OUTPUT="${OUTPUT_BASE}/${MODEL_NAME}"
FINETUNE_DIR="${OUTPUT}/checkpoints"
LOGS_DIR="${OUTPUT}/logs"
TENSORBOARD_DIR="${OUTPUT}/tensorboard"
# From pretraining. The pretraining checkpoint should use tensor parallel size 4.
LOAD_NAME="mcore-qwen2p5-7b-internvit-tp4"
CHECKPOINT_DIR="${WORKSPACE}/output/${LOAD_NAME}/checkpoints"
DATA_TRAIN="${SOURCE}/examples/multimodal/nvlm/sft_blend.yaml"
if [[ $DEBUG -eq 1 ]]; then
MBZ=1
BZ=1
NW=0
AD=0.0
HD=0.0
LI=1
# This is just for interactive testing purposes. Do not use for proper training.
EXTRA_ARGS="--freeze-LM"
ALLOW_NONDETERMINISTIC=1
else
MBZ=1
BZ=256
NW=8
AD=0.0
HD=0.0
LI=5
EXTRA_ARGS=""
ALLOW_NONDETERMINISTIC=1
fi
USE_TILING=1
SEQ_LEN=1024
DECODER_SEQ_LEN=16384
MAX_POS_EMBED=32768
TRAIN_SAMPLES=6602173
WARMUP_SAMPLES=198065
if [[ $BATCH -eq 0 ]]; then
# Runs out of GPU memory in interactive mode without this.
EXTRA_ARGS+=" --freeze-LM"
fi
if [[ $USE_TILING -eq 1 ]]; then
EXTRA_ARGS+=" --pixel-shuffle --use-tiling --max-num-tiles 12 --use-thumbnail"
SEQ_LEN=256
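# 256 image embeddings per 448x448 tile: (448/14)^2 = 1024 patches, reduced 4x by pixel shuffle.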
fi
OPTIONS=" \
--swiglu \
--use-distributed-optimizer \
--num-workers ${NW} \
--num-layers 28 \
--hidden-size 3584 \
--norm-epsilon 1e-06 \
--normalization RMSNorm \
--num-attention-heads 28 \
--exit-duration-in-mins 110 \
--group-query-attention \
--num-query-groups 4 \
--ffn-hidden-size 18944 \
--add-qkv-bias \
--seq-length ${SEQ_LEN} \
--decoder-seq-length ${DECODER_SEQ_LEN} \
--max-position-embeddings ${MAX_POS_EMBED} \
--dataloader-seq-length ${DECODER_SEQ_LEN} \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model Qwen/Qwen2.5-7B-Instruct \
--tokenizer-prompt-format qwen2p5 \
--pixel-shuffle \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--disable-bias-linear \
--pipeline-model-parallel-size 1 \
--tensor-model-parallel-size 4 \
--language-model-type qwen2.5_7B \
--vision-model-type internvit \
--micro-batch-size ${MBZ} \
--global-batch-size ${BZ} \
--lr 2e-6 \
--min-lr 2.5e-7 \
--train-samples ${TRAIN_SAMPLES} \
--lr-warmup-samples ${WARMUP_SAMPLES} \
--lr-decay-style cosine \
--clip-grad 10 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.014 \
--attention-dropout ${AD} \
--hidden-dropout ${HD} \
--eod-mask-loss \
--bf16 \
--tensorboard-dir ${TENSORBOARD_DIR} \
--img-h 448 \
--img-w 448 \
--patch-dim 14 \
--data-path ${DATA_TRAIN} \
--dataloader-type external \
--split 100,0,0 \
--prompt-path ${SOURCE}/examples/multimodal/nvlm/nvlm_prompts.json \
--log-interval ${LI} \
--save-interval 500 \
--eval-interval 500 \
--eval-iters 10 \
--log-params-norm \
--log-num-zeros-in-grad \
${EXTRA_ARGS} \
--save ${FINETUNE_DIR} \
--load ${FINETUNE_DIR} \
--pretrained-checkpoint ${CHECKPOINT_DIR} \
--distributed-timeout-minutes 60 \
--allow-missing-vision-projection-checkpoint \
--dataloader-save ${FINETUNE_DIR}/dataloader \
--disable-vision-class-token \
--use-te \
--ckpt-format torch \
--num-frames 32 \
--use-checkpoint-args \
--image-tag-type internvl \
--recompute-granularity full \
--recompute-method block \
--recompute-num-layers 28 \
--recompute-vision \
"
export NVTE_ALLOW_NONDETERMINISTIC_ALGO=${ALLOW_NONDETERMINISTIC}
export NVTE_APPLY_QK_LAYER_SCALING=0
# Interactive or batch mode
if [[ $BATCH -eq 0 ]]; then
torchrun --nproc_per_node 8 examples/multimodal/train.py ${OPTIONS}
else
run_cmd="python -u ${SOURCE}/examples/multimodal/train.py ${OPTIONS}"
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
srun -l --verbose \
--container-image <path to docker image> \
--container-mounts "<some mount>" \
--output=${LOGS_DIR}/%x_%j_$DATETIME.log \
sh -c "${run_cmd}"
set +x
fi
__module__: megatron.energon
__class__: Metadataset
splits:
train:
datasets:
- weight: 1.
path: <path_to_pretraining_dataset_in_energon_format>
subflavors:
augmentation: false
val:
datasets:
- weight: 1.
path: <path_to_pretraining_dataset_in_energon_format>
subflavors:
augmentation: false
#!/bin/bash
# Pretrain a multimodal model.
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
MODEL_NAME="mcore-llava-mistral-7b-instruct-clip336-pretraining"
# Check that the user has set an output path for model checkpoints.
if [[ -z $WORKSPACE ]]; then
echo "Please set WORKSPACE for storing your model checkpoints."
exit 1
fi
SOURCE=`pwd`
OUTPUT_BASE="${WORKSPACE}/output"
OUTPUT="${OUTPUT_BASE}/${MODEL_NAME}"
FINETUNE_DIR=${OUTPUT}/checkpoints
LOGS_DIR="${OUTPUT}/logs"
TENSORBOARD_DIR="${OUTPUT}/tensorboard"
if [[ -z $LOAD_NAME ]]; then
echo "Please set LOAD_NAME for input model name."
exit 1
fi
CHECKPOINT_DIR="${WORKSPACE}/${LOAD_NAME}/checkpoints"
DATA_TRAIN="${SOURCE}/examples/multimodal/pretrain_dataset.yaml"
DEBUG=0
if [[ $DEBUG -eq 1 ]]; then
BZ=32
NW=2
HD=0.0
LI=1
EXTRA_ARGS=""
NONDETERMINISTIC_ATTN=1
else
BZ=256
NW=2
HD=0.1
LI=10
EXTRA_ARGS=""
NONDETERMINISTIC_ATTN=1
fi
OPTIONS=" \
--apply-layernorm-1p \
--attention-softmax-in-fp32 \
--use-checkpoint-args \
--use-distributed-optimizer \
--transformer-impl transformer_engine \
--use-te \
--normalization RMSNorm \
--group-query-attention \
--num-query-groups 8 \
--no-masked-softmax-fusion \
--num-workers ${NW} \
--exit-duration-in-mins 230 \
--use-flash-attn \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--swiglu \
--attention-dropout 0.0 \
--hidden-dropout ${HD} \
--tensor-model-parallel-size 4 \
--pipeline-model-parallel-size 1 \
--num-layers 32 \
--hidden-size 4096 \
--num-attention-heads 32 \
--seq-length 576 \
--decoder-seq-length 1024 \
--max-position-embeddings 4096 \
--ffn-hidden-size 14336 \
--train-iters 20000 \
--micro-batch-size 1 \
--global-batch-size ${BZ} \
--lr-decay-iters 20000 \
--lr-warmup-fraction .01 \
--lr 0.00015 \
--min-lr 1.0e-5 \
--lr-decay-style cosine \
--log-interval ${LI} \
--eval-iters 10 \
--eval-interval 1000 \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model mistralai/Mistral-7B-Instruct-v0.3 \
--tokenizer-prompt-format mistral \
--data-path ${DATA_TRAIN} \
--prompt-path ${SOURCE}/examples/multimodal/manual_prompts.json \
--save-interval 1000 \
--save ${FINETUNE_DIR} \
--load ${FINETUNE_DIR} \
--dataloader-save ${FINETUNE_DIR}/dataloader \
--pretrained-checkpoint ${CHECKPOINT_DIR} \
--split 100,0,0 \
--clip-grad 1.0 \
--weight-decay 1e-2 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.014 \
--log-params-norm \
--log-num-zeros-in-grad \
--bf16 \
--eod-mask-loss \
--freeze-LM \
--freeze-ViT \
--patch-dim 14 \
--img-h 336 \
--img-w 336 \
--dataloader-type external \
--tensorboard-dir ${TENSORBOARD_DIR} \
--language-model-type=mistral_7b \
--disable-vision-class-token \
${EXTRA_ARGS} \
--distributed-timeout-minutes 60 \
--allow-missing-vision-projection-checkpoint \
--ckpt-format torch
"
export NVTE_APPLY_QK_LAYER_SCALING=0
export NVTE_ALLOW_NONDETERMINISTIC_ALGO=${NONDETERMINISTIC_ATTN}
torchrun --nproc_per_node 8 examples/multimodal/train.py ${OPTIONS}
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
import warnings
from functools import partial
import torch
from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
from megatron.core.utils import is_torch_min_version
from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
from megatron.core.transformer.dot_product_attention import DotProductAttention
from megatron.core.transformer.enums import AttnMaskType
from megatron.core.transformer.identity_op import IdentityOp
from megatron.core.transformer.mlp import MLP, MLPSubmodules
from megatron.core.transformer.spec_utils import ModuleSpec
from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules
from examples.multimodal.layer_scaling import LayerScalingTransformerLayer, get_bias_dropout_add_layer_scaling
try:
from megatron.core.extensions.transformer_engine import (
TEColumnParallelLinear,
TEDotProductAttention,
TELayerNormColumnParallelLinear,
TENorm,
TERowParallelLinear,
)
HAVE_TE = True
except ImportError:
HAVE_TE = False
try:
import apex
from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
HAVE_APEX = True
LNImpl = FusedLayerNorm
except ImportError:
import warnings
from megatron.core.transformer.torch_norm import WrappedTorchNorm
warnings.warn('Apex is not installed. Falling back to Torch Norm')
LNImpl = WrappedTorchNorm
def get_mlp_module_spec(use_te: bool = True) -> ModuleSpec:
# Dense MLP w/ or w/o TE modules.
return ModuleSpec(
module=MLP,
submodules=MLPSubmodules(
linear_fc1=TEColumnParallelLinear if use_te else ColumnParallelLinear,
linear_fc2=TERowParallelLinear if use_te else RowParallelLinear,
),
)
def get_norm_mlp_module_spec_te() -> ModuleSpec:
return ModuleSpec(
module=MLP,
submodules=MLPSubmodules(
linear_fc1=TELayerNormColumnParallelLinear, linear_fc2=TERowParallelLinear
),
)
def get_radio_g_layer_spec(normalization) -> ModuleSpec:
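"""Layer spec for the RADIO-g vision encoder using local (non-TE) attention and MLP modules wrapped in LayerScalingTransformerLayer."""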
attn_mask_type = AttnMaskType.no_mask
if normalization == "LayerNorm":
norm = LNImpl
elif normalization == "RMSNorm":
if HAVE_TE:
norm = TENorm
else:
assert is_torch_min_version("2.4.0"), "Torch version >= 2.4.0 is required for RMSNorm"
if HAVE_APEX:
warnings.warn('Apex does not support RMSNorm. Falling back to Torch Norm')
from megatron.core.transformer.torch_norm import WrappedTorchNorm
norm = WrappedTorchNorm
else:
raise RuntimeError("unknown normalization", normalization)
mlp = get_mlp_module_spec(use_te=False) # doesn't include norm.
return ModuleSpec(
module=LayerScalingTransformerLayer,
submodules=TransformerLayerSubmodules(
input_layernorm=norm,
self_attention=ModuleSpec(
module=SelfAttention,
params={"attn_mask_type": attn_mask_type},
submodules=SelfAttentionSubmodules(
linear_qkv=ColumnParallelLinear,
core_attention=DotProductAttention,
linear_proj=RowParallelLinear,
q_layernorm=IdentityOp,
k_layernorm=IdentityOp,
),
),
self_attn_bda=get_bias_dropout_add_layer_scaling,
pre_mlp_layernorm=norm,
mlp=mlp,
mlp_bda=get_bias_dropout_add_layer_scaling,
),
)
def get_radio_g_layer_spec_te() -> ModuleSpec:
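"""Layer spec for the RADIO-g vision encoder using Transformer Engine fused attention and linear modules."""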
attn_mask_type = AttnMaskType.no_mask
mlp = get_norm_mlp_module_spec_te()
return ModuleSpec(
module=LayerScalingTransformerLayer,
submodules=TransformerLayerSubmodules(
self_attention=ModuleSpec(
module=SelfAttention,
params={"attn_mask_type": attn_mask_type},
submodules=SelfAttentionSubmodules(
linear_qkv=TELayerNormColumnParallelLinear,
core_attention=TEDotProductAttention,
linear_proj=TERowParallelLinear,
q_layernorm=IdentityOp,
k_layernorm=IdentityOp,
),
),
self_attn_bda=get_bias_dropout_add_layer_scaling,
pre_mlp_layernorm=IdentityOp,
mlp=mlp,
mlp_bda=get_bias_dropout_add_layer_scaling,
),
)
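# Minimal usage sketch (illustrative only, not part of this module): choose the TE spec when
# Transformer Engine is available and fall back to the local-module spec otherwise; the
# resulting ModuleSpec is then passed to the vision transformer builder.
#
#   spec = get_radio_g_layer_spec_te() if HAVE_TE else get_radio_g_layer_spec("LayerNorm")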