# When there are image tokens and this stage only receives vision embeddings, adjust the recv buffer seq length to match the image embeddings sequence length.
# If there are image tokens and this stage receives full embeddings, make sure we compensate for the expansion of image tokens.
# Note that this will set a recv_buffer_seq_length for the encoder stage; that length is irrelevant since the encoder's recv buffer is never allocated.
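A rough sketch of the adjustment these notes describe is shown below; every name in it (`num_image_tokens`, `img_embeddings_per_token`, `decoder_seq_len`, `vision_seq_len`, `receives_only_vision_embeddings`) is a hypothetical placeholder rather than an actual variable from the Megatron code.

```python
# Hypothetical placeholder inputs (illustrative values only).
num_image_tokens = 2                     # image placeholder tokens in the text sequence
img_embeddings_per_token = 576           # embeddings each image token expands into
decoder_seq_len = 4096                   # text sequence length before expansion
vision_seq_len = num_image_tokens * img_embeddings_per_token
receives_only_vision_embeddings = False  # True for the stage fed directly by the vision encoder

# Size the pipeline-parallel recv buffer depending on what this stage receives.
if num_image_tokens > 0 and receives_only_vision_embeddings:
    # The stage consumes the vision encoder output directly.
    recv_buffer_seq_length = vision_seq_len
elif num_image_tokens > 0:
    # Full embeddings: each image token expands into multiple image embeddings,
    # so compensate for that expansion.
    recv_buffer_seq_length = decoder_seq_len + num_image_tokens * (img_embeddings_per_token - 1)
else:
    recv_buffer_seq_length = decoder_seq_len
```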
The model card name (see the support list in `conf/`) is expected as an input to all the sample scripts.
Other arguments are specified as variables (e.g. `TP=8`) that you can either set inline before the `bash`
command or export to the current bash environment upfront.
The script will perform per-tensor FP8 fake quantization and generate a few tokens as a quick check that the quantized model still behaves correctly. The end results are stored in `/tmp/megatron_workspace/meta-llama/Llama-3.2-1B-Instruct_quant`. This is a Megatron-Core (MCore) distributed checkpoint (with additional states), which can be loaded for quantization-aware training (QAT) or exported for deployment.
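At its core, the fake-quantization step corresponds roughly to the Model Optimizer `mtq.quantize` call sketched below; `model` and `calib_dataloader` are placeholders for whatever the sample script constructs around the Megatron checkpoint.

```python
import modelopt.torch.quantization as mtq

def forward_loop(model):
    # Run a few calibration batches so per-tensor activation ranges (amax) are collected.
    for batch in calib_dataloader:  # placeholder calibration dataloader
        model(**batch)

# Insert per-tensor FP8 fake-quantization nodes into the (placeholder) model.
# Weights and activations are rounded through FP8 while compute stays in the
# original precision, which is why generating tokens works as a sanity check.
model = mtq.quantize(model, mtq.FP8_DEFAULT_CFG, forward_loop)
```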
## Export for TensorRT-LLM, vLLM, SGLang Deployment
For supported Hugging Face models, TensorRT Model Optimizer can export the quantized model to a Hugging Face-like (HF-like) checkpoint for TensorRT-LLM, vLLM, or SGLang deployment.
> **NOTE:** The HF-like export only supports pipeline parallelism (`PP`); all other parallelism
> degrees must be set to 1. The exported checkpoint is sharded with safetensors. Although it is
> HF-like, this format currently cannot be loaded by `from_pretrained()`.
The exported checkpoint is stored in `/tmp/megatron_workspace/meta-llama/Llama-3.1-8B-Instruct_export`, which can be provided as an input to most of the `LLM` APIs.
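For example, a minimal vLLM offline-inference snippet might look like the following; the `quantization="modelopt"` argument assumes vLLM's Model Optimizer FP8 backend and may need adjusting for your deployment stack.

```python
from vllm import LLM, SamplingParams

# Point vLLM at the exported HF-like checkpoint directory.
llm = LLM(
    model="/tmp/megatron_workspace/meta-llama/Llama-3.1-8B-Instruct_export",
    quantization="modelopt",  # assumption: vLLM's Model Optimizer (FP8) quantization backend
)

# Generate a few tokens as a quick functional check of the deployed checkpoint.
outputs = llm.generate(
    ["The capital of France is"],
    SamplingParams(temperature=0.0, max_tokens=32),
)
print(outputs[0].outputs[0].text)
```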