Commit 688448db authored by silencealiang

Update code

parent a02a5490
Pipeline #2503 passed
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import argparse
import os

import torch


def convert(output_path, tensor_parallel_size, use_te, version):
    device = "cuda"

    model = torch.hub.load('NVlabs/RADIO', 'radio_model', version=version, progress=True)
    state_dict = model.state_dict()

    new_state_dicts = [{"model": dict()} for _ in range(tensor_parallel_size)]

    # Indices from mapping pytorch multihead attention to megatron.
    kv_channels = 80
    hidden_dim = 1280
    num_heads = 16
    indices = []
    for i in range(num_heads):
        lb = i * kv_channels
        ub = (i + 1) * kv_channels
        indices.append(torch.arange(lb, ub, dtype=torch.int))
        indices.append(torch.arange(hidden_dim + lb, hidden_dim + ub, dtype=torch.int))
        indices.append(torch.arange(2 * hidden_dim + lb, 2 * hidden_dim + ub, dtype=torch.int))

    indices = torch.cat(indices)

    for name, tensor in state_dict.items():
        # Map parameter names to ones used in megatron.
        new_name = ""
        new_tensor = tensor
        if new_tensor.dtype == torch.float16:
            new_tensor = new_tensor.to(torch.float32)

        # This is used for chunking some tensors to target tensor parallel size.
        chunk_dim = None

        if "summary_idxs" in name:
            continue
        elif "patch_generator" in name:
            if "embedder" in name:
                new_name = "embedder.weight"
                chunk_dim = 0
            elif "cls_token" in name:
                new_name = "class_token"
            elif "pos_embed" in name:
                new_name = "position_embeddings"
        elif "input_conditioner" in name:
            continue
        elif "blocks" in name:
            layer_idx = name.split(".")[2]
            base = f"decoder.layers.{layer_idx}"

            if "attn.qkv.weight" in name:
                new_name = f"{base}.self_attention.linear_qkv.weight"
                new_tensor = new_tensor[indices]
                chunk_dim = 0
            elif "attn.qkv.bias" in name:
                new_name = f"{base}.self_attention.linear_qkv.bias"
                new_tensor = new_tensor[indices]
                chunk_dim = 0
            elif "attn.proj.weight" in name:
                new_name = f"{base}.self_attention.linear_proj.weight"
                chunk_dim = 1
            elif "attn.proj.bias" in name:
                new_name = f"{base}.self_attention.linear_proj.bias"
            elif "norm1.weight" in name:
                new_name = f"{base}.input_layernorm.weight"
                if use_te:
                    new_name = f"{base}.self_attention.linear_qkv.layer_norm_weight"
            elif "norm1.bias" in name:
                new_name = f"{base}.input_layernorm.bias"
                if use_te:
                    new_name = f"{base}.self_attention.linear_qkv.layer_norm_bias"
            elif "mlp.fc1.weight" in name:
                new_name = f"{base}.mlp.linear_fc1.weight"
                chunk_dim = 0
            elif "mlp.fc1.bias" in name:
                new_name = f"{base}.mlp.linear_fc1.bias"
                chunk_dim = 0
            elif "mlp.fc2.weight" in name:
                new_name = f"{base}.mlp.linear_fc2.weight"
                chunk_dim = 1
            elif "mlp.fc2.bias" in name:
                new_name = f"{base}.mlp.linear_fc2.bias"
            elif "norm2.weight" in name:
                new_name = f"{base}.pre_mlp_layernorm.weight"
                if use_te:
                    new_name = f"{base}.mlp.linear_fc1.layer_norm_weight"
            elif "norm2.bias" in name:
                new_name = f"{base}.pre_mlp_layernorm.bias"
                if use_te:
                    new_name = f"{base}.mlp.linear_fc1.layer_norm_bias"

        assert new_name != "", f"unexpected layer name {name}"

        if chunk_dim is None:
            new_tensors = [new_tensor for _ in range(tensor_parallel_size)]
        else:
            new_tensors = torch.chunk(new_tensor, tensor_parallel_size, dim=chunk_dim)

        for i in range(tensor_parallel_size):
            # chunk() creates a view of a bigger tensor. clone() is used here to avoid excessive storage.
            new_state_dicts[i]["model"][new_name] = new_tensors[i].clone()

            # TE sets _extra_state (for FP8 purposes), so set an empty one here for compatibility.
            extra_state_layers = ("linear_qkv", "linear_proj", "linear_fc1", "linear_fc2")
            is_extra_state_layer = any([l in new_name for l in extra_state_layers])
            if use_te and is_extra_state_layer:
                layer = new_name.split(".")[-2]
                if layer in extra_state_layers:
                    extra_state_name = (
                        new_name[: new_name.rfind(".") + 1] + "_extra_state"
                    )  # Replace the weight name.
                    new_state_dicts[i]["model"][extra_state_name] = None

    for i in range(tensor_parallel_size):
        output_dir_tp = os.path.join(output_path, "iter_0000001", f"mp_rank_0{i}")
        os.makedirs(output_dir_tp)
        output_path_tp = os.path.join(output_dir_tp, "model_optim_rng.pt")
        torch.save(new_state_dicts[i], output_path_tp)

    with open(os.path.join(output_path, "latest_checkpointed_iteration.txt"), "w") as f:
        f.write("1")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="""
Convert RADIO weights to megatron format.

Example usage:
python radio_converter.py --output /some/output/folder --tensor-parallel-size 4
""",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    parser.add_argument(
        "--output", type=str, required=True, help="output directory for megatron state dict file(s)"
    )
    parser.add_argument(
        "--tensor-parallel-size", type=int, default=1, help="model tensor parallel size"
    )
    parser.add_argument("--use-te", action="store_true", help="Use Transformer Engine")
    parser.add_argument("--version", type=str, default="radio_v2.5-h", help="Version of radio to load for conversion")

    args = parser.parse_args()

    convert(args.output, args.tensor_parallel_size, args.use_te, args.version)

    print("done.")
@@ -10,7 +10,9 @@ def add_multimodal_extra_args(parser):
    group.add_argument('--freeze-LM', action='store_true', default=False)
    group.add_argument('--freeze-ViT', action='store_true', default=False)
    group.add_argument('--language-model-type', type=str, required=True)
    group.add_argument('--language-huggingface-model-name-or-path', type=str)
    group.add_argument('--vision-model-type', type=str, default="clip")
    group.add_argument('--vision-huggingface-model-name-or-path', type=str)
    group.add_argument("--disable-vision-class-token", action="store_true", default=False)
    group.add_argument(
        "--allow-missing-vision-projection-checkpoint", action="store_true", default=False
@@ -49,7 +51,7 @@ def add_multimodal_extra_args(parser):
    group.add_argument(
        "--tokenizer-prompt-format",
        type=str,
        choices=["mistral", "llama3", "llama3p1", "chatml", "nvlm-yi-34b", "qwen2p0", "qwen2p5"],
        required=True,
        help="Prompt format to use with the tokenizer.",
    )
@@ -74,6 +76,14 @@ def add_multimodal_extra_args(parser):
    group.add_argument(
        "--recompute-vision", action="store_true", default=False, help="Enable activation checkpointing in the vision model"
    )
    group.add_argument(
        "--use-loss-scaling", action="store_true", default=False, help="Scale loss based on conversation turn length (in tokens)."
    )
    group.add_argument(
        "--use-area-weighted-aspect-ratio", action="store_true", default=False,
        help=(
            "When --use-tiling is True, find the aspect ratio to use based on the original "
            "image aspect ratio and the area covered by the tiles.")
    )
    return parser
@@ -11,11 +11,10 @@ Additionally, InternViT introduces some unique features like Layer Scaling.
Those code changes are gathered here.
"""
from functools import partial

import torch

from megatron.core.utils import divide
from megatron.core.extensions.transformer_engine import (
    TEColumnParallelLinear,
    TEDotProductAttention,
@@ -92,21 +91,28 @@ class InternViTRMSNorm(MegatronModule):
        return output

    def _gather_var(self, input_, max_dim):
        """Compute statistic across the non-dummy heads."""
        world_size = get_tensor_model_parallel_world_size()

        # Size and dimension.
        last_dim = input_.dim() - 1
        rank = get_tensor_model_parallel_rank()

        num_attention_heads_per_partition = divide(self.config.num_attention_heads, world_size)
        valid_ranks = 24 // num_attention_heads_per_partition
        residual_heads = 25 % num_attention_heads_per_partition
        if residual_heads == 0:
            residual_heads = num_attention_heads_per_partition
        max_dim = max_dim * residual_heads

        if rank < valid_ranks:  # Ranks without any dummy attention heads.
            var = input_.sum(-1, keepdim=True)
        elif rank == valid_ranks:  # The only rank which may contain 'residual_heads' dummy attention heads.
            var = input_[..., :max_dim].sum(-1, keepdim=True)
        else:
            var = input_.sum(-1, keepdim=True) * 0.0  # All heads in these ranks are dummy heads: zero out.

        tensor_list = [torch.empty_like(var) for _ in range(world_size)]
        tensor_list[rank] = var
...
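As a sanity check on the generalized logic above, here is a small standalone sketch (not part of the commit). InternViT uses 25 real attention heads which, by assumption here, are padded to 32 with dummy heads so the count divides the tensor-parallel size; the new code derives, for any TP size, how many ranks hold only real heads (valid_ranks) and how many real heads land on the boundary rank (residual_heads):

# Assumption: config.num_attention_heads is the padded head count (e.g. 32).
def head_partition(num_attention_heads_padded, world_size):
    heads_per_partition = num_attention_heads_padded // world_size  # what divide() computes
    valid_ranks = 24 // heads_per_partition       # ranks that hold only real heads
    residual_heads = 25 % heads_per_partition     # real heads on the first partially-dummy rank
    if residual_heads == 0:
        residual_heads = heads_per_partition
    return heads_per_partition, valid_ranks, residual_heads

print(head_partition(32, 8))  # (4, 6, 1): ranks 0-5 are all real, rank 6 holds 1 real head, rank 7 is all dummy
print(head_partition(32, 4))  # (8, 3, 1): ranks 0-2 are all real, rank 3 holds 1 real head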
File mode changed from 100644 to 100755
@@ -101,6 +101,7 @@ OPTIONS=" \
--init-method-std 0.014 \
--attention-dropout ${AD} \
--hidden-dropout ${HD} \
--untie-embeddings-and-output-weights \
--eod-mask-loss \
--bf16 \
--tensorboard-dir=${TENSORBOARD_DIR} \
...
#!/bin/bash
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NVTE_APPLY_QK_LAYER_SCALING=0
export TOKENIZERS_PARALLELISM="false"
INPUT_IMAGE_PATH="placeholder"
GROUNDTRUTH_PATH="placeholder"
while [[ $# -gt 0 ]]; do
    case $1 in
        --input-image-path)
            INPUT_IMAGE_PATH="$2"
            shift
            shift
            ;;
        --input-metadata-path)
            INPUT_METADATA_PATH="$2"
            shift
            shift
            ;;
        --num-frames)
            NUM_FRAMES="$2"
            shift
            shift
            ;;
        -g|--groundtruth-path|--gt-path)
            GROUNDTRUTH_PATH="$2"
            shift
            shift
            ;;
        -o|--output-path)
            OUTPUT_PATH="$2"
            shift
            shift
            ;;
        -m|--model-path)
            MODEL_PATH="$2"
            shift
            shift
            ;;
        --task)
            TASK="$2"
            shift
            shift
            ;;
        -*|--*)
            echo "Invalid option $1"
            exit 1
            ;;
    esac
done
# Please modify these as needed.
NUM_PARTITIONS=0
START=0
END=0
SEQ_LEN=256
DECODER_SEQ_LEN=16384
EXTRA_ARGS=" --pixel-shuffle"
for PARTITION_ID in $( eval echo {$START..$END} )
do
torchrun --nproc_per_node 8 examples/multimodal/run_text_generation.py \
--attention-softmax-in-fp32 \
--transformer-impl transformer_engine \
--use-te \
--use-checkpoint-args \
--normalization RMSNorm \
--norm-epsilon 1e-06 \
--language-model-type=qwen2.5_7B \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--swiglu \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--tensor-model-parallel-size 4 \
--pipeline-model-parallel-size 1 \
--group-query-attention \
--num-query-groups 4 \
--num-layers 28 \
--hidden-size 3584 \
--ffn-hidden-size 18944 \
--add-qkv-bias \
--num-attention-heads 28 \
--max-position-embeddings 32768 \
--no-masked-softmax-fusion \
--load ${MODEL_PATH} \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model Qwen/Qwen2.5-7B-Instruct \
--tokenizer-prompt-format qwen2p5 \
--bf16 \
--micro-batch-size 1 \
--seq-length ${SEQ_LEN} \
--decoder-seq-length ${DECODER_SEQ_LEN} \
--out-seq-length 128 \
--temperature 1.0 \
--img-h 448 \
--img-w 448 \
--patch-dim 14 \
--seed 153 \
--top_k 1 \
--no-load-rng \
--no-load-optim \
--input-image-path ${INPUT_IMAGE_PATH} \
--num-partitions ${NUM_PARTITIONS} \
--partition-id ${PARTITION_ID} \
--output-path ${OUTPUT_PATH} \
--gt-path ${GROUNDTRUTH_PATH} \
--task ${TASK} \
${EXTRA_ARGS} \
--special-tokens "<image>" "<img>" "</img>" \
--vision-model-type internvit \
--num-frames ${NUM_FRAMES} \
--ckpt-format torch
done
@@ -107,6 +107,7 @@ OPTIONS=" \
--init-method-std 0.014 \
--attention-dropout ${AD} \
--hidden-dropout ${HD} \
--untie-embeddings-and-output-weights \
--eod-mask-loss \
--bf16 \
--tensorboard-dir=${TENSORBOARD_DIR} \
...
File mode changed from 100644 to 100755
#!/bin/bash
# Your SBATCH commands here if using SLURM.
# Please launch this script from megatron-lm root.
# Train a multimodal model.
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_ALGO=^NVLS
export TOKENIZERS_PARALLELISM=false
USER=$SLURM_JOB_USER
# Auto-detect batch or interactive mode.
which srun
BATCH=$((1-$?))
DEBUG=0
if [[ $BATCH -eq 0 ]]; then
    DATETIME=`date +'%y-%m-%d-%H-%M-%S'`
    MODEL_NAME="qwen2.5-7B-internvit-video-sft-nvlm-${DATETIME}"
else
    MODEL_NAME="qwen2.5-7B-internvit-video-sft-nvlm"
    DEBUG=0
fi
WORKSPACE="<some dir>"
SOURCE=`pwd`
OUTPUT_BASE="${WORKSPACE}/output"
OUTPUT="${OUTPUT_BASE}/${MODEL_NAME}"
FINETUNE_DIR="${OUTPUT}/checkpoints"
LOGS_DIR="${OUTPUT}/logs"
TENSORBOARD_DIR="${OUTPUT}/tensorboard"
# From pretraining. The pretraining checkpoint should have tensor parallel size 4.
LOAD_NAME="mcore-qwen2p5-7b-internvit-tp4"
CHECKPOINT_DIR="${WORKSPACE}/output/${LOAD_NAME}/checkpoints"
DATA_TRAIN="${SOURCE}/examples/multimodal/nvlm/sft_blend.yaml"
if [[ $DEBUG -eq 1 ]]; then
    MBZ=1
    BZ=1
    NW=0
    AD=0.0
    HD=0.0
    LI=1
    # This is just for interactive testing purposes. Do not use for proper training.
    EXTRA_ARGS="--freeze-LM"
    ALLOW_NONDETERMINISTIC=1
else
    MBZ=1
    BZ=256
    NW=8
    AD=0.0
    HD=0.0
    LI=5
    EXTRA_ARGS=""
    ALLOW_NONDETERMINISTIC=1
fi
USE_TILING=1
SEQ_LEN=1024
DECODER_SEQ_LEN=16384
MAX_POS_EMBED=32768
TRAIN_SAMPLES=6602173
WARMUP_SAMPLES=198065
if [[ $BATCH -eq 0 ]]; then
    # Runs out of GPU memory in interactive mode without this.
    EXTRA_ARGS+=" --freeze-LM"
fi
if [[ $USE_TILING -eq 1 ]]; then
    EXTRA_ARGS+=" --pixel-shuffle --use-tiling --max-num-tiles 12 --use-thumbnail"
    SEQ_LEN=256
fi
OPTIONS=" \
--swiglu \
--use-distributed-optimizer \
--num-workers ${NW} \
--num-layers 28 \
--hidden-size 3584 \
--norm-epsilon 1e-06 \
--normalization RMSNorm \
--num-attention-heads 28 \
--exit-duration-in-mins 110 \
--group-query-attention \
--num-query-groups 4 \
--ffn-hidden-size 18944 \
--add-qkv-bias \
--seq-length ${SEQ_LEN} \
--decoder-seq-length ${DECODER_SEQ_LEN} \
--max-position-embeddings ${MAX_POS_EMBED} \
--dataloader-seq-length ${DECODER_SEQ_LEN} \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model Qwen/Qwen2.5-7B-Instruct \
--tokenizer-prompt-format qwen2p5 \
--pixel-shuffle \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--disable-bias-linear \
--pipeline-model-parallel-size 1 \
--tensor-model-parallel-size 4 \
--language-model-type qwen2.5_7B \
--vision-model-type internvit \
--micro-batch-size ${MBZ} \
--global-batch-size ${BZ} \
--lr 2e-6 \
--min-lr 2.5e-7 \
--train-samples ${TRAIN_SAMPLES} \
--lr-warmup-samples ${WARMUP_SAMPLES} \
--lr-decay-style cosine \
--clip-grad 10 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.014 \
--attention-dropout ${AD} \
--hidden-dropout ${HD} \
--eod-mask-loss \
--bf16 \
--tensorboard-dir ${TENSORBOARD_DIR} \
--img-h 448 \
--img-w 448 \
--patch-dim 14 \
--data-path ${DATA_TRAIN} \
--dataloader-type external \
--split 100,0,0 \
--prompt-path ${SOURCE}/examples/multimodal/nvlm/nvlm_prompts.json \
--log-interval ${LI} \
--save-interval 500 \
--eval-interval 500 \
--eval-iters 10 \
--log-params-norm \
--log-num-zeros-in-grad \
${EXTRA_ARGS} \
--save ${FINETUNE_DIR} \
--load ${FINETUNE_DIR} \
--pretrained-checkpoint ${CHECKPOINT_DIR} \
--distributed-timeout-minutes 60 \
--allow-missing-vision-projection-checkpoint \
--dataloader-save ${FINETUNE_DIR}/dataloader \
--disable-vision-class-token \
--use-te \
--ckpt-format torch \
--num-frames 32 \
--use-checkpoint-args \
--image-tag-type internvl \
--recompute-granularity full \
--recompute-method block \
--recompute-num-layers 28 \
--recompute-vision \
"
export NVTE_ALLOW_NONDETERMINISTIC_ALGO=${ALLOW_NONDETERMINISTIC}
export NVTE_APPLY_QK_LAYER_SCALING=0
# Interactive or batch mode
if [[ $BATCH -eq 0 ]]; then
    torchrun --nproc_per_node 8 examples/multimodal/train.py ${OPTIONS}
else
    run_cmd="python -u ${SOURCE}/examples/multimodal/train.py ${OPTIONS}"

    DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`

    srun -l --verbose \
        --container-image <path to docker image> \
        --container-mounts "<some mount>" \
        --output=${LOGS_DIR}/%x_%j_$DATETIME.log \
        sh -c "${run_cmd}"

    set +x
fi
@@ -25,6 +25,18 @@ from megatron.core.models.vision.clip_vit_model import get_num_image_embeddings
from megatron.inference.text_generation.api import generate_and_post_process
from megatron.inference.text_generation.forward_step import ForwardStep
from megatron.inference.text_generation.communication import broadcast_int_list
from megatron.core.inference.sampling_params import SamplingParams
from megatron.core.inference.engines.mcore_engine import MCoreEngine
from megatron.core.inference.inference_request import InferenceRequest, VLMInferenceRequest
from megatron.core.inference.text_generation_controllers.vlm_text_generation_controller import (
    VLMTextGenerationController,
)
from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import (
    InferenceWrapperConfig,
)
from megatron.core.inference.model_inference_wrappers.multimodal.vlm_inference_wrapper import (
    VLMInferenceWrapper,
)
from megatron.training import get_args, get_model, get_tokenizer, print_rank_0
from megatron.training.checkpointing import load_checkpoint
from megatron.training.initialize import initialize_megatron
@@ -60,6 +72,8 @@ def add_text_generation_args(parser):
            "OCRBench",
            "MathVista",
            "AI2D",
            "InfoVQA",
            "SPDocVQA",
        ],
        help="Generation task to run",
    )
@@ -68,6 +82,8 @@ def add_text_generation_args(parser):
    )
    group.add_argument("--config-path", type=str, help="Evaluation config file to use.")
    group.add_argument("--use-mcore-inference", action="store_true", default=False, help="Use the MCore inference API")

    # Add common multimodal arguments needed for e.g. building the model.
    parser = add_multimodal_extra_args(parser)
@@ -153,15 +169,61 @@ def generate_samples(model, config: EvaluationConfig, print_output):
        args.use_tile_tags,
    )

    if args.use_mcore_inference:
        inference_wrapper_config = InferenceWrapperConfig(
            hidden_size=args.hidden_size,
            inference_batch_times_seqlen_threshold=args.inference_batch_times_seqlen_threshold,
            fp32_residual_connection=args.fp32_residual_connection,
            params_dtype=args.params_dtype,
            padded_vocab_size=args.padded_vocab_size,
        )

        inference_wrapped_model = VLMInferenceWrapper(model, inference_wrapper_config)

        tokenizer = get_tokenizer()

        controller = VLMTextGenerationController(
            inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer
        )

        inference_engine = MCoreEngine(
            controller, max_batch_size=1, random_seed=args.seed
        )

        sampling_params = SamplingParams(
            temperature=config.temperature,
            top_k=config.top_k,
            top_p=config.top_p,
            num_tokens_to_generate=config.out_seq_length,
        )

    for idx, (imgs, num_tiles, sample_id, question, answers, metadata) in enumerate(dataloader):
        imgs = imgs.to("cuda")
        num_tiles = num_tiles.to("cuda")

        conv = get_conversation(config.task, question)

        if not args.use_mcore_inference:
            forward_step = partial(VLMForwardStep, num_img_embeddings_per_tile, imgs, num_tiles, args.decoder_seq_length)

        if is_first_rank():
            if args.use_mcore_inference:
                inference_request = VLMInferenceRequest(
                    request_id=inference_engine.get_new_request_id(),
                    prompt=conv,
                    prompt_tokens=controller.tokenize_prompt(conv),
                    inference_parameters=sampling_params,
                    num_img_embeddings_per_tile=num_img_embeddings_per_tile,
                    imgs=imgs,
                    num_tiles=num_tiles,
                    decoder_seq_length=args.decoder_seq_length,
                )
                results: List[InferenceRequest] = inference_engine.generate(
                    inference_requests=[inference_request]
                )

                resp_sentences = [
                    tokenizer.detokenize(result.prompt_tokens) + result.generated_text
                    for result in results
                ]
            else:
                resp_sentences, _, _, _ = generate_and_post_process(
                    model,
                    forward_step=forward_step,
@@ -192,6 +254,8 @@ def generate_samples(model, config: EvaluationConfig, print_output):
                "OCRBench",
                "MathVista",
                "AI2D",
                "InfoVQA",
                "SPDocVQA",
            ):
                output_name = "answer"
            elif config.task in ("MMMU"):
@@ -220,6 +284,8 @@ def generate_samples(model, config: EvaluationConfig, print_output):
                "OCRBench",
                "MathVista",
                "AI2D",
                "InfoVQA",
                "SPDocVQA",
            ):
                if isinstance(answers, str):
                    answers = [answers]
@@ -238,6 +304,21 @@ def generate_samples(model, config: EvaluationConfig, print_output):
                yield output
                idx += 1
        else:
            if args.use_mcore_inference:
                inference_request = VLMInferenceRequest(
                    request_id=inference_engine.get_new_request_id(),
                    prompt=conv,
                    prompt_tokens=controller.tokenize_prompt(conv),
                    inference_parameters=sampling_params,
                    num_img_embeddings_per_tile=num_img_embeddings_per_tile,
                    imgs=imgs,
                    num_tiles=num_tiles,
                    decoder_seq_length=args.decoder_seq_length,
                )
                inference_engine.generate(
                    inference_requests=[inference_request]
                )
            else:
                generate_and_post_process(
                    model, forward_step=forward_step, detokenize_segments=False, data_parallel=True
@@ -310,7 +391,6 @@ def generate_and_write_samples(model, config, print_output=True):
    if is_first_rank():
        output_file.close()


class VLMForwardStep(ForwardStep):
    """Inference forward step for a multimodal model."""
@@ -411,7 +491,7 @@ def get_conversation(task, question):
                "content": f"{IMAGE_TOKEN}\nProvide a one-sentence caption for provided image.",
            },
        ]
    elif task in ("TextVQA", "VQAv2", "ChartQA", "InfoVQA", "SPDocVQA"):
        conversation = [
            {"role": "system", "content": "Answer the questions."},
            {
@@ -443,7 +523,7 @@ def get_conversation(task, question):
        conversation = [
            {"role": "system", "content": "Answer the questions."},
            {"role": "user", "content": f"{IMAGE_TOKEN}\n{q}"},
        ]

    return conversation
@@ -451,7 +531,7 @@ def get_conversation(task, question):
def get_prompt_and_generated(prompt_and_generation, prompt_format):
    """Strip prompt and other unnecessary text from generation."""
    if prompt_format in ("llama3", "llama3p1"):
        splitted = prompt_and_generation.split("<|start_header_id|>assistant<|end_header_id|>\n\n")
        prompt = splitted[0]
        generated = splitted[1]
...
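Condensed from the hunks above, the new --use-mcore-inference path amounts to the following sequence (a sketch assembled from the added code, not a drop-in replacement; all argument values come from args/config at runtime):

# Built once, before the dataloader loop.
inference_wrapper_config = InferenceWrapperConfig(
    hidden_size=args.hidden_size,
    inference_batch_times_seqlen_threshold=args.inference_batch_times_seqlen_threshold,
    fp32_residual_connection=args.fp32_residual_connection,
    params_dtype=args.params_dtype,
    padded_vocab_size=args.padded_vocab_size,
)
tokenizer = get_tokenizer()
controller = VLMTextGenerationController(
    inference_wrapped_model=VLMInferenceWrapper(model, inference_wrapper_config),
    tokenizer=tokenizer,
)
inference_engine = MCoreEngine(controller, max_batch_size=1, random_seed=args.seed)
sampling_params = SamplingParams(
    temperature=config.temperature,
    top_k=config.top_k,
    top_p=config.top_p,
    num_tokens_to_generate=config.out_seq_length,
)

# Per sample: wrap the prompt plus image tensors in a VLMInferenceRequest and generate.
request = VLMInferenceRequest(
    request_id=inference_engine.get_new_request_id(),
    prompt=conv,
    prompt_tokens=controller.tokenize_prompt(conv),
    inference_parameters=sampling_params,
    num_img_embeddings_per_tile=num_img_embeddings_per_tile,
    imgs=imgs,
    num_tiles=num_tiles,
    decoder_seq_length=args.decoder_seq_length,
)
results = inference_engine.generate(inference_requests=[request])
resp_sentences = [tokenizer.detokenize(r.prompt_tokens) + r.generated_text for r in results]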
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
"""Pretrain or SFT multimodal."""
import math
import os
import sys
from functools import partial
@@ -17,6 +18,7 @@ from multimodal_args import add_multimodal_extra_args
from megatron.core import mpu, tensor_parallel
from megatron.core.enums import ModelType
from megatron.core.models.multimodal import context_parallel
from megatron.core.models.multimodal.llava_model import IGNORE_INDEX, LLaVAModel
from megatron.core.packed_seq_params import PackedSeqParams
from megatron.core.parallel_state import (
@@ -25,10 +27,10 @@ from megatron.core.parallel_state import (
    is_pipeline_last_stage,
)
from megatron.training import get_args, get_timers, get_tokenizer, pretrain
from megatron.training.utils import is_last_rank, get_batch_on_this_cp_rank


def get_batch(data_iterator, image_token_index, img_seq_len):
    """Generate a batch

    Note: attn_mask_type in layer_specs.py sets the attention mask. Attention mask is None here.
@@ -66,9 +68,17 @@ def get_batch(data_iterator):
    cu_lengths = tensor_parallel.broadcast_data(["cu_lengths"], data, torch.int32)["cu_lengths"]
    max_lengths = tensor_parallel.broadcast_data(["max_lengths"], data, torch.int32)["max_lengths"]

    # No image input (text-only sample) if the dataloader returned a size 1 image.
    if imgs.shape == torch.Size([1, 1]):
        # FSDP can hang with text-only samples. A workaround is to run a valid dummy image through the vision
        # model and then add image embeddings with a zero multiplier.
        if args.use_torch_fsdp2:
            imgs = torch.zeros((1, 3, args.img_h, args.img_w), dtype=torch.float32, device=data_text.device)
            num_tiles = torch.tensor([], dtype=torch.int, device=data_text.device)
        else:
            # A similar workaround is not needed without FSDP, and we can use an empty image.
            # FIXME: text-only data can still cause a hang in the special case where
            # the vision model is on its own pipeline rank and --freeze-ViT is enabled.
            imgs = torch.tensor([], dtype=torch.float32, device=data_text.device)
            num_tiles = torch.tensor([], dtype=torch.int, device=data_text.device)
@@ -109,6 +119,24 @@ def get_batch(data_iterator):
    loss_mask, position_ids = get_ltor_masks_and_position_ids(tokens, labels, tokenizer.pad)
    torch.cuda.nvtx.range_pop()

    # If context parallel is enabled, must shard inputs to CP ranks.
    if args.context_parallel_size > 1 or args.sequence_parallel:
        assert tokens.shape[0] == 1, "micro-batch-size > 1 not supported yet with CP"

        num_image_tokens = torch.sum(tokens == image_token_index).item()
        num_image_embeddings = num_image_tokens * img_seq_len - num_image_tokens
        seq_len = text_length + num_image_embeddings

        # CP expects the sequence length to be divisible by the CP size, so apply padding.
        mp_padding_needed = context_parallel.get_padding(
            seq_len, args.context_parallel_size,
            args.tensor_model_parallel_size, args.sequence_parallel,
        )
        tokens, position_ids, labels, loss_mask = [torch.nn.functional.pad(item, (0, mp_padding_needed)) for item in (tokens, position_ids, labels, loss_mask)]

        # Get PackedSeqParams that indicate the amount of padding for TransformerEngine.
        packed_seq_params = context_parallel.get_packed_seq_params(tokens, num_image_embeddings, mp_padding_needed, args.context_parallel_size, True)

    return (
        tokens,
        labels,
@@ -137,7 +165,82 @@ def get_ltor_masks_and_position_ids(input_ids, target, pad_token):
    return loss_mask, position_ids


def get_mask_start_and_end_idx(arr):
    """
    Return a list of (start, end) index tuples for the contiguous non-zero sub-arrays of arr.

    For instance, if arr = [0, 1, 0, 0, 1, 1] then
    get_mask_start_and_end_idx(arr) = [(1, 1), (4, 5)]
    such that arr[1:1+1] = [1] and arr[4:5+1] = [1, 1].
    """
    mask = (arr != 0)

    mask_int = mask.int()
    diff = mask_int[1:] - mask_int[:-1]
    start_indices = (diff == 1).nonzero(as_tuple=False).flatten() + 1
    end_indices = (diff == -1).nonzero(as_tuple=False).flatten()

    if len(mask) == 0:
        return []
    if mask[0]:
        start_indices = torch.cat((torch.tensor([0], device=arr.device), start_indices))
    if mask[-1]:
        end_indices = torch.cat((end_indices, torch.tensor([len(arr) - 1], device=arr.device)))

    sequences = list(zip(start_indices.tolist(), end_indices.tolist()))
    return sequences


def scaled_loss_func(loss_mask, output_tensor):
    """
    Scaled loss function.

    Scale the loss for each conversation turn using the formula:
        1 / sum_j[ sqrt(length(loss_turn_j)) ] * sum_i[ sum(loss_turn_i) / sqrt(length(loss_turn_i)) ]
    where the loss mask is used to infer the start / end of the conversation turns.
    """
    losses = output_tensor.float()

    loss_list = []
    num_valid_labels_list = []
    for idx in range(losses.shape[0]):
        loss_this_sample = losses[idx]
        turn_start_end_list = get_mask_start_and_end_idx(loss_mask[idx])
        for turn_start, turn_end in turn_start_end_list:
            # Compute the loss for each turn.
            loss_this_turn = loss_this_sample[turn_start:turn_end + 1].sum()
            assert (1 - loss_mask)[idx][turn_start:turn_end + 1].sum() < 1.0
            num_valid_labels_this_turn = turn_end - turn_start + 1
            loss_this_turn = loss_this_turn / num_valid_labels_this_turn
            loss_list.append(loss_this_turn)
            # Record the number of valid labels for each turn.
            num_valid_labels_list.append(num_valid_labels_this_turn)

    base_num = sum([math.sqrt(each) for each in num_valid_labels_list])
    for idx in range(len(loss_list)):
        # Normalize the loss for each turn.
        loss_list[idx] = loss_list[idx] * math.sqrt(num_valid_labels_list[idx]) / base_num

    total_loss = torch.stack(loss_list).sum()
    total_tokens = torch.ones_like(total_loss)

    loss = torch.cat([total_loss.view(1), total_tokens.view(1)])

    reporting_loss = loss.clone().detach()
    torch.distributed.all_reduce(reporting_loss, group=mpu.get_data_parallel_group())

    local_num_tokens = loss[1].clone().detach().to(torch.int)

    return (
        total_loss,
        local_num_tokens,
        {'lm loss': (reporting_loss[0], reporting_loss[1])},
    )
def loss_func(loss_mask, output_tensor):
    args = get_args()

    losses = output_tensor.float()
    loss_mask = loss_mask.contiguous().view(-1).float()
@@ -146,12 +249,20 @@ def loss_func(loss_mask, output_tensor):
    total_loss = torch.sum(losses.view(-1) * loss_mask)
    loss = torch.cat([total_loss.view(1), total_tokens.view(1)])

    if args.context_parallel_size > 1:
        torch.distributed.all_reduce(loss, group=mpu.get_context_parallel_group())

    reporting_loss = loss.clone().detach()
    torch.distributed.all_reduce(reporting_loss, group=mpu.get_data_parallel_group())

    local_num_tokens = loss[1].clone().detach().to(torch.int)

    # We multiply by context parallel size because later there will be a divide by CP(+DP) size.
    return (
        loss[0] * args.context_parallel_size,
        local_num_tokens,
        {'lm loss': (reporting_loss[0], reporting_loss[1])}
    )
def forward_step(data_iterator, model: LLaVAModel):
@@ -178,7 +289,7 @@ def forward_step(data_iterator, model: LLaVAModel):
        images,
        num_image_tiles,
        packed_seq_params,
    ) = get_batch(data_iterator, model.module.module.image_token_index, model.module.module.img_seq_len)
    timers('batch-generator').stop()

    output_tensor, loss_mask = model(
@@ -191,8 +302,13 @@ def forward_step(data_iterator, model: LLaVAModel):
        num_image_tiles=num_image_tiles,
        packed_seq_params=packed_seq_params,
    )

    args = get_args()
    if args.use_loss_scaling:
        loss_function = partial(scaled_loss_func, loss_mask)
    else:
        loss_function = partial(loss_func, loss_mask)

    return output_tensor, loss_function


def llava_embedding_ranks(pp_ranks):
...
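To make the turn-level scaling above concrete, here is a small standalone check (not part of the commit; the mask and losses are made-up toy values) of how get_mask_start_and_end_idx splits a loss mask into turns and how the sqrt-length weights from the docstring combine:

import math
import torch

# Toy loss mask with two turns: one of length 2, one of length 4.
loss_mask = torch.tensor([0, 1, 1, 0, 1, 1, 1, 1])
per_token_loss = torch.ones(8)  # pretend every token has loss 1.0

# get_mask_start_and_end_idx(loss_mask) -> [(1, 2), (4, 7)]
turns = [(1, 2), (4, 7)]
lengths = [end - start + 1 for start, end in turns]  # [2, 4]
turn_means = [float(per_token_loss[s:e + 1].sum()) / l for (s, e), l in zip(turns, lengths)]  # [1.0, 1.0]

base = sum(math.sqrt(l) for l in lengths)  # sqrt(2) + sqrt(4)
total = sum(mean * math.sqrt(l) / base for mean, l in zip(turn_means, lengths))
print(total)  # 1.0, because every turn's mean loss is 1.0
# Each turn contributes its mean loss with weight sqrt(turn length) / sum_j sqrt(turn length_j),
# so a long turn no longer outweighs a short one in direct proportion to its token count.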
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755