升级megatron_v0.10

3bec6514 · xingjinliang · 3aca1415 · 3bec6514 · 3bec6514 · 3bec6514
Commit 3bec6514 authored Dec 09, 2024 by xingjinliang
20 changed files
--- a/examples/multimodal/config.py
+++ b/examples/multimodal/config.py
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+from dataclasses import dataclass
+
+import torch
+
+from megatron.training.activations import fast_gelu, quick_gelu, squared_relu
+
+
+def get_language_model_config(config):
+    """Apply the language-model preset selected by ``config.language_model_type``.
+
+    The preset is written onto ``config`` in place and the same object is returned.
+
+    Raises:
+        ValueError: if ``config.language_model_type`` is not a known preset.
+    """
+    model_type = config.language_model_type
+
+    if model_type == "2b":
+        overrides = {
+            "add_bias_linear": False,
+            "bias_activation_fusion": False,
+            "gated_linear_unit": True,
+            "apply_query_key_layer_scaling": True,
+            "layernorm_zero_centered_gamma": True,
+            "bias_dropout_fusion": False,
+            "rotary_percent": 0.5,
+            "apply_rope_fusion": False,
+            "attention_softmax_in_fp32": True,
+        }
+    elif model_type == "8b":
+        overrides = {
+            "add_bias_linear": False,
+            "bias_activation_fusion": False,
+            "gated_linear_unit": False,
+            "apply_query_key_layer_scaling": True,
+            "layernorm_zero_centered_gamma": True,
+            "bias_dropout_fusion": False,
+            "rotary_percent": 0.5,
+            "attention_dropout": 0.0,
+            "apply_rope_fusion": False,
+            "activation_func": squared_relu,
+            "ffn_hidden_size": 16384,
+            "masked_softmax_fusion": True,
+            "attention_softmax_in_fp32": True,
+            "num_query_groups": 32,
+            "kv_channels": 128,
+            "rotary_interleaved": False,
+        }
+    elif model_type in ("llama3_8b", "mistral_7b", "yi-34b", "qwen2.0_72B"):
+        # These presets share every setting except the FFN width; only Qwen adds a QKV bias.
+        ffn_hidden_sizes = {
+            "llama3_8b": 14336,
+            "mistral_7b": 14336,
+            "yi-34b": 20480,
+            "qwen2.0_72B": 29568,
+        }
+        overrides = {
+            "activation_func": torch.nn.functional.silu,
+            "add_bias_linear": False,
+        }
+        if model_type == "qwen2.0_72B":
+            overrides["add_qkv_bias"] = True
+        overrides.update(
+            bias_activation_fusion=False,
+            gated_linear_unit=True,
+            apply_query_key_layer_scaling=False,
+            # Zero centered gamma not supported for RMSNorm
+            layernorm_zero_centered_gamma=False,
+            bias_dropout_fusion=False,
+            apply_rope_fusion=False,
+            attention_softmax_in_fp32=True,
+            ffn_hidden_size=ffn_hidden_sizes[model_type],
+        )
+    else:
+        raise ValueError(f"unknown language model type {config.language_model_type}")
+
+    for name, value in overrides.items():
+        setattr(config, name, value)
+
+    return config
+
+
+def get_vision_model_config(config, apply_query_key_layer_scaling):
+    """Apply the vision-encoder preset selected by ``config.vision_model_type``.
+
+    The preset is written onto ``config`` in place and the same object is returned.
+    ``apply_query_key_layer_scaling`` is copied onto the config unchanged.
+
+    Raises:
+        ValueError: if ``config.vision_model_type`` is not a known preset.
+    """
+    model_type = config.vision_model_type
+
+    # Settings that differ between the supported encoders.
+    if model_type == "clip":
+        specific = {
+            "num_layers": 24,
+            "num_attention_heads": 16,
+            "num_query_groups": 16,
+            "kv_channels": 64,
+            "add_qkv_bias": True,
+            "hidden_size": 1024,
+            "ffn_hidden_size": 4096,
+            "activation_func": quick_gelu,
+            "normalization": 'LayerNorm',
+        }
+    elif model_type == "siglip":
+        specific = {
+            "num_layers": 27,
+            "num_attention_heads": 16,
+            "num_query_groups": 16,
+            "kv_channels": 72,
+            "add_qkv_bias": True,
+            "hidden_size": 1152,
+            "ffn_hidden_size": 4304,
+            "activation_func": fast_gelu,
+            "normalization": 'LayerNorm',
+            "qk_layernorm": False,
+            "layernorm_epsilon": 1e-6,
+        }
+    elif model_type == "internvit":
+        specific = {
+            "num_layers": 45,
+            "num_attention_heads": 32,  # Padded for TP=8.
+            "num_query_groups": 32,  # Padded for TP=8.
+            "kv_channels": 128,
+            "add_qkv_bias": False,
+            "hidden_size": 3200,
+            "ffn_hidden_size": 12800,
+            "activation_func": torch.nn.functional.gelu,
+            "normalization": 'RMSNorm',
+            "layernorm_epsilon": 1e-6,
+        }
+    else:
+        raise ValueError(f"unknown vision model type {config.vision_model_type}")
+
+    # Settings that every supported encoder shares.
+    shared = {
+        "add_bias_linear": True,
+        "hidden_dropout": 0.0,
+        "attention_dropout": 0.0,
+        "gated_linear_unit": False,
+        "layernorm_zero_centered_gamma": False,
+        "apply_query_key_layer_scaling": apply_query_key_layer_scaling,
+        "bias_activation_fusion": False,
+        "bias_dropout_fusion": False,
+        "attention_softmax_in_fp32": True,
+        "apply_rope_fusion": False,
+    }
+
+    for name, value in {**shared, **specific}.items():
+        setattr(config, name, value)
+
+    return config
+
+
+def get_vision_projection_config(config, hidden_size):
+    """Configure the vision projection for the language model named by ``config.language_model_type``.
+
+    The settings are written onto ``config`` in place and the same object is returned.
+
+    Args:
+        config: configuration object; must expose ``language_model_type``.
+        hidden_size: stored as ``config.hidden_size``.
+
+    Raises:
+        ValueError: if ``config.language_model_type`` is not a known preset.
+    """
+    config.gated_linear_unit = False
+    config.bias_activation_fusion = False
+    config.add_bias_linear = False
+    config.hidden_size = hidden_size  # Used as the vision projection output size, i.e., the input to the language model.
+    # All presets must live in ONE if/elif chain: a separate `if` for "8b" would let a "2b"
+    # config fall through to the final `else` and raise even though it was just configured.
+    if config.language_model_type == "2b":
+        config.ffn_hidden_size = 5440
+        config.activation_func = torch.nn.functional.gelu
+    elif config.language_model_type == "8b":
+        config.ffn_hidden_size = 16384
+        config.activation_func = squared_relu
+    elif config.language_model_type == "llama3_8b":
+        config.ffn_hidden_size = 14336
+        config.activation_func = torch.nn.functional.gelu
+    elif config.language_model_type == "mistral_7b":
+        config.ffn_hidden_size = 14336
+        config.activation_func = torch.nn.functional.gelu
+        config.normalization = None
+    elif config.language_model_type == "yi-34b":
+        config.ffn_hidden_size = 20480
+        config.normalization = "LayerNorm"
+        config.activation_func = torch.nn.functional.gelu
+    elif config.language_model_type == "qwen2.0_72B":
+        config.ffn_hidden_size = 29568
+        config.normalization = "LayerNorm"
+        config.activation_func = torch.nn.functional.gelu
+    else:
+        raise ValueError(f"unknown language model type {config.language_model_type}")
+
+    return config
+
+
+@dataclass
+class EvaluationConfig:
+    """Evaluation related configuration.
+
+    Only ``task`` is required; every other field has a default.
+    """
+    # Name of the evaluation task (required, no default).
+    task: str
+
+    # Sampling settings -- presumably forwarded to text generation; confirm against the caller.
+    temperature: float = 1.0
+    top_p: float = 0.0
+    top_k: int = 0
+
+    # NOTE(review): looks like the number of tokens to generate -- confirm.
+    out_seq_length: int = 32
+
+    output_path: str = ""
+
+    # NOTE(review): "gt" presumably stands for ground truth -- confirm.
+    input_image_path: str = ""
+    gt_path: str = ""
+
+    # Partitioning of the evaluation data; how these are consumed is not visible here.
+    num_partitions: int = 1
+    partition_id: int = 0
+    num_samples_per_partition: int = 0
--- a/examples/multimodal/convert_llava_pretrain_to_wds.py
+++ b/examples/multimodal/convert_llava_pretrain_to_wds.py
+import json
+import os
+import webdataset as wds
+
+from tqdm import tqdm
+
+# Converts the LLaVA-Pretrain image/conversation pairs into WebDataset tar shards.
+# Set this to the directory that holds the JSON index and the referenced images.
+llava_pretrain_dir = '<path_to_LLaVA-Pretrain>'
+
+# Paths to the dataset files
+json_file = os.path.join(llava_pretrain_dir, 'blip_laion_cc_sbu_558k.json')
+output = os.path.join(llava_pretrain_dir, 'wds')
+
+# exist_ok avoids the check-then-create race and makes re-runs idempotent.
+os.makedirs(output, exist_ok=True)
+
+# Load data. JSON is UTF-8 by specification, so do not depend on the locale encoding.
+with open(json_file, 'r', encoding='utf-8') as f:
+    data = json.load(f)
+
+# Shards are named pretrain-<n>.tar and hold at most 10000 samples each.
+with wds.ShardWriter(os.path.join(output, 'pretrain-%d.tar'), maxcount=10000) as shard_writer:
+    for entry in tqdm(data):
+        with open(os.path.join(llava_pretrain_dir, entry['image']), "rb") as img_file:
+            image_data = img_file.read()
+        sample = {
+            "__key__": entry['id'],
+            "jpg": image_data,
+            "json": json.dumps(entry['conversations']).encode("utf-8"),
+        }
+        shard_writer.write(sample)
+
+print("Dataset successfully converted to wds")
--- a/examples/multimodal/dataloader_provider.py
+++ b/examples/multimodal/dataloader_provider.py
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+import os
+
+import torch
+from dataset_helpers import TaskEncoder, print_error_handler
+
+from megatron.core import parallel_state
+from megatron.energon import (
+    LimitDataset,
+    RepeatDataset,
+    WorkerConfig,
+    get_loader,
+    get_savable_loader,
+    get_train_dataset,
+    get_val_datasets,
+)
+from megatron.core.num_microbatches_calculator import get_num_microbatches
+from megatron.core.parallel_state import get_tensor_model_parallel_rank, get_pipeline_model_parallel_world_size, get_pipeline_model_parallel_rank
+from megatron.training import get_args
+from megatron.training.checkpointing import get_checkpoint_name
+
+
+def datasets_provider(worker_config=None):
+    """Build the multimodal train and validation datasets.
+
+    Returns:
+        A ``(train_dataset, validation_datasets, None)`` tuple; no test dataset is built.
+        Each validation dataset is repeated and then limited to
+        ``args.eval_iters * get_num_microbatches()`` items.
+    """
+    args = get_args()
+
+    # When a list of data paths is given, only its first entry is used.
+    if type(args.data_path) is list:
+        dname = args.data_path[0]
+    else:
+        dname = args.data_path
+
+    train_dataset = get_train_dataset(
+        dname,
+        batch_size=args.micro_batch_size,
+        task_encoder=TaskEncoder(),
+        worker_config=worker_config,
+        max_samples_per_sequence=None,
+        shuffle_buffer_size=None,
+        packing_buffer_size=args.packing_buffer_size,
+        handler=print_error_handler,
+        image_decode="pil",
+    )
+
+    val_datasets = get_val_datasets(
+        dname,
+        batch_size=args.micro_batch_size,
+        task_encoder=TaskEncoder(),
+        worker_config=worker_config,
+        packing_buffer_size=args.packing_buffer_size,
+        handler=print_error_handler,
+        image_decode="pil",
+    )
+
+    # Drop the source dataset from each pair and cap the length of what remains.
+    limited_val_datasets = []
+    for val_ds, _src_ds in val_datasets:
+        # Repeat the inner dataset in case it is shorter than the limit.
+        repeated = RepeatDataset(val_ds, worker_config=worker_config)
+        limited_val_datasets.append(
+            LimitDataset(
+                repeated,
+                length=args.eval_iters * get_num_microbatches(),
+                worker_config=worker_config,
+                reset_after_epoch=True,
+            )
+        )
+
+    return train_dataset, limited_val_datasets, None
+
+
+def is_first_or_last_stage(pp_size, encoder_pipeline_model_parallel_size):
+    """Tell whether the current pipeline-parallel rank should run the dataloader.
+
+    Raises:
+        NotImplementedError: if pipeline parallelism is used and
+            ``encoder_pipeline_model_parallel_size`` is neither 0 nor 1.
+    """
+    # No pipeline parallelism: the only stage qualifies.
+    if pp_size == 1:
+        return True
+
+    pp_rank = get_pipeline_model_parallel_rank()
+
+    if encoder_pipeline_model_parallel_size == 0:
+        # No separate pipeline stage for the vision model: first and last pipeline stage.
+        return pp_rank in (0, pp_size - 1)
+
+    if encoder_pipeline_model_parallel_size == 1:
+        # Separate pipeline stage for the vision model: first vision stage, first LM stage, last LM stage.
+        return pp_rank in (0, 1, pp_size - 1)
+
+    raise NotImplementedError("encoder-pipeline-model-parallel-size > 1 is not supported yet")
+
+
+def is_dataloader_rank(encoder_pipeline_model_parallel_size):
+    """Tell whether this tensor- and pipeline-parallel rank should own a dataloader."""
+    # Only the first tensor-parallel rank loads data (it is broadcast to the others).
+    on_first_tp_rank = get_tensor_model_parallel_rank() == 0
+    pp_size = get_pipeline_model_parallel_world_size()
+
+    if not on_first_tp_rank:
+        return False
+
+    return is_first_or_last_stage(pp_size, encoder_pipeline_model_parallel_size)
+
+
+def train_valid_test_dataloaders_provider(train_val_test_num_samples):
+    """Build multimodal train, validation and test dataloaders.
+
+    Args:
+        train_val_test_num_samples: not read by this function.
+
+    Returns:
+        ``(None, None, None)`` on ranks that do not own a dataloader. Otherwise a tuple of
+        the wrapped train dataloader, a list of wrapped validation dataloaders, and an
+        ``EnergonDataloader`` wrapping ``None`` in place of a test dataloader.
+    """
+    args = get_args()
+
+    # Dataloader is only on specific ranks.
+    if not is_dataloader_rank(args.encoder_pipeline_model_parallel_size):
+        return None, None, None
+
+    # Worker debugging is disabled: no debug path and log level 0.
+    worker_debug_path = None
+    worker_log_level = 0
+
+    rank = parallel_state.get_data_parallel_rank()
+    world_size = parallel_state.get_data_parallel_world_size()
+    data_parallel_group = parallel_state.get_data_parallel_group()
+
+    worker_config = WorkerConfig(
+        rank=rank,
+        world_size=world_size,
+        num_workers=args.num_workers,
+        data_parallel_group=data_parallel_group,
+        worker_debug_path=worker_debug_path,
+        worker_log_level=worker_log_level,
+    )
+    train_ds, valid_ds1, test_ds = datasets_provider(worker_config)
+
+    train_dataloader = get_savable_loader(train_ds, worker_config=worker_config)
+    # Restore the train dataloader state only when resuming (args.load is set) and a
+    # dataloader save location is configured.
+    if args.load is not None:
+        if getattr(args, "dataloader_save", None):
+            dp_rank = parallel_state.get_data_parallel_rank()
+            data_save_name = get_checkpoint_name(
+                args.dataloader_save,
+                args.iteration,
+                pipeline_rank=0,    # Only the first pipeline parallel rank stores the dataloader checkpoint.
+                basename=f"train_dataloader_dprank{dp_rank:03d}.pt",
+            )
+            if os.path.exists(data_save_name):
+                # Best effort: a failed restore is reported and training continues without it.
+                try:
+                    # NOTE(review): torch.load unpickles the file; only load state from trusted locations.
+                    dataset_state_dict = torch.load(data_save_name, map_location="cpu")
+                    train_dataloader.restore_state_rank(dataset_state_dict["dataloader_state_dict"])
+                    print(f"restored dataset state from {data_save_name}")
+                except Exception as e:
+                    print("loading dataset state failed. Skipping. " + str(e))
+            else:
+                print(f"dataset state {data_save_name} does not exist")
+
+    valid_dataloader = [
+        EnergonDataloader(get_loader(valid_ds, worker_config=worker_config))
+        for valid_ds in valid_ds1
+    ]
+    # No test dataloader is built; the wrapper below is constructed around None.
+    test_dataloader = None
+
+    return EnergonDataloader(train_dataloader), valid_dataloader, EnergonDataloader(test_dataloader)
+
+
+class EnergonDataloader:
+    """Adapter that exposes a Megatron Energon dataloader as an endless iterator.
+
+    The wrapped dataloader is restarted by ``cyclic_iter`` each time it is exhausted.
+    """
+
+    def __init__(self, dataloader):
+        self._dataloader = dataloader
+        self._iter = iter(cyclic_iter(dataloader))
+
+    def __iter__(self):
+        return iter(self._iter)
+
+    def __next__(self):
+        return next(self._iter)
+
+    def save_state(self):
+        """Return the wrapped dataloader's ``save_state_rank()`` result."""
+        return self._dataloader.save_state_rank()
+
+
+def cyclic_iter(iter):
+    """Yield the items of ``iter`` forever, re-iterating it after each full pass."""
+    while True:
+        yield from iter
--- a/examples/multimodal/dataset_helpers.py
+++ b/examples/multimodal/dataset_helpers.py
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+import bisect
+import dataclasses
+import json
+import sys
+import traceback
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Tuple, Union
+
+from image_processing import get_visual_transform
+import numpy as np
+import torch
+
+from megatron.core.models.multimodal.llava_model import IGNORE_INDEX, IMAGE_TOKEN
+from megatron.core.models.vision.clip_vit_model import get_num_image_embeddings
+from megatron.energon import (
+    Batch,
+    CaptioningSample,
+    DefaultTaskEncoder,
+    OCRSample,
+    Sample,
+    SimilarityInterleavedSample,
+    VQASample,
+    MultiChoiceVQASample
+)
+from megatron.energon.task_encoder.base import stateless
+from megatron.training import get_args, get_tokenizer
+
+
+@dataclass
+class ImageTaskSample(Sample):
+    """A single encoded sample: image tiles plus tokenized text and labels."""
+    __key__: str
+    __restore_key__: Tuple[Union[str, int, tuple], ...]
+    __subflavor__: Dict
+    __subflavors__: Dict
+    # Image tiles, each (c, h, w)
+    imgs: List[torch.Tensor]
+    # Number of tiles per image
+    num_tiles: List[int]
+    tokens: torch.Tensor
+    total_len: int  # Total token count in the sample, including text and image tokens
+    labels: torch.Tensor = None
+
+
+@dataclass
+class ImageTaskSamplePacked(Sample):
+    """Dataclass to store a single packed sample (not a batch).
+
+    Notation used in the field comments:
+        P = Number of sub-samples in the packed sample
+        seq_len = Total sequence length
+        num_imgs = Number of images across all samples in the packed sample
+    """
+
+    __key__: str    # Sample name
+    __restore_key__: Tuple[Union[str, int, tuple], ...]
+    __subflavor__: Dict     # Sample metadata. Deprecated.
+    __subflavors__: Dict    # Sample metadata.
+    tokens: torch.Tensor  # Input tokens packed into a single tensor (seq_len,)
+    labels: torch.Tensor  # Target tokens packed into a single tensor (seq_len,)
+    imgs: List[torch.Tensor]    # Input images
+    num_tiles: List[int]  # Number of tiles for each image of each sample (num_imgs)
+    max_length: int    # Maximum length across sub-samples.
+    cu_lengths: List[int]  # Cumulative length of each sub-sample in this packed sample incl. text and image tokens (P,)
+
+# Typing for the resulting batch data after encode_batch()
+@dataclass
+class ImageTaskBatchPacked(Batch):
+    """Dataclass to store a batch of packed samples.
+
+    Notation used in the field comments:
+        N = Batch size
+        P = Number of samples in the packed sample
+        seq_len = Maximum sequence length
+        num_imgs = Number of images across all samples in the packed sample
+    """
+
+    __key__: List[str]  # Sample names
+    __restore_key__: Tuple[Union[str, int, tuple], ...]
+    __subflavor__: Dict     # Sample metadata. Deprecated.
+    __subflavors__: List[Dict]  # Sample metadata, one entry per sample.
+    tokens: torch.Tensor  # Input tokens packed and padded (N, seq_len)
+    labels: torch.Tensor  # Target tokens packed and padded (N, seq_len)
+    imgs: torch.Tensor  # All image tiles stacked into a single tensor (num_tiles, C, H, W)
+    num_tiles: List[List[int]]  # Number of tiles per image (N, num_imgs)
+    max_lengths: List[int]  # Maximum length across sub-samples (N,)
+    cu_lengths: List[List[int]]  # Cumulative length of each sub-sample in each packed sample of the batch (N, P)
+
+# Based on https://github.com/hiyouga/LLaMA-Factory/blob/641d0dab08d96a93c34657742213d8994d9ed476/src/llamafactory/data/processors/processor_utils.py#L19
+# Copyright (c) 2024 LLaMA-Factory. Apache license 2.0.
+def search_for_fit(numbers: List[int], capacity: int) -> int:
+    """Return the index of the largest entry of ``numbers`` that is <= ``capacity``.
+
+    ``numbers`` must be sorted in ascending order. Returns -1 when no entry fits.
+    """
+    insertion_point = bisect.bisect_right(numbers, capacity)
+    if insertion_point:
+        return insertion_point - 1
+    return -1
+
+
+# Based on https://github.com/hiyouga/LLaMA-Factory/blob/641d0dab08d96a93c34657742213d8994d9ed476/src/llamafactory/data/processors/processor_utils.py#L27
+# Copyright (c) 2024 LLaMA-Factory. Apache license 2.0.
+def greedy_knapsack(item_sizes: List[int], samples: List, max_capacity: int) -> List:
+    """Group samples into knapsacks of bounded total size, largest-fit first.
+
+    Each knapsack is filled by repeatedly taking the largest remaining sample that still
+    fits into the remaining capacity. Used if sequence packing is enabled.
+
+    Args:
+        item_sizes: size of each sample, parallel to ``samples``.
+        samples: the samples to pack.
+        max_capacity: maximum total size of one knapsack.
+
+    Returns:
+        A list of knapsacks, each a list of samples. Empty input gives an empty list.
+
+    Raises:
+        ValueError: if a single sample is larger than ``max_capacity``.
+    """
+    assert len(item_sizes) == len(samples), "sample lengths and samples must have the same length."
+
+    if len(item_sizes) == 0:
+        return []
+
+    # Order sizes and samples together by size so the fit search can bisect.
+    pairs = sorted(zip(item_sizes, samples), key=lambda pair: pair[0])
+    sizes_left = [size for size, _ in pairs]
+    samples_left = [sample for _, sample in pairs]
+
+    # The largest sample must fit on its own.
+    if sizes_left[-1] > max_capacity:
+        raise ValueError(f"knapsack: A sample is larger {sizes_left[-1]} than the max_sequence_length {max_capacity}.")
+
+    knapsacks = []
+    while sizes_left:
+        packed = []
+        room = max_capacity
+
+        fit = search_for_fit(sizes_left, room)
+        while fit != -1:
+            room -= sizes_left[fit]
+            sizes_left.pop(fit)
+            packed.append(samples_left.pop(fit))
+            fit = search_for_fit(sizes_left, room)
+
+        knapsacks.append(packed)
+
+    return knapsacks
+
+
+class TaskEncoder(DefaultTaskEncoder[OCRSample, OCRSample, ImageTaskBatchPacked, dict]):
+    """A simple task encoder for VLMs."""
+
+    def __init__(
+        self
+    ):
+        """Read the tokenizer, prompts and sequence-length settings from the training args.
+
+        Raises:
+            AssertionError: if the dataloader sequence length is smaller than the packing
+                sequence length, or if packing is enabled without a positive packing
+                sequence length.
+        """
+        super().__init__()
+
+        self.args = get_args()
+
+        self.tokenizer = get_tokenizer()
+        # Prompt templates are loaded from the JSON file at args.prompt_path.
+        with open(self.args.prompt_path, "r") as f:
+            self.manual_prompts = json.load(f)
+        self.dataloader_seq_length = self.args.dataloader_seq_length  # Always return samples of this length.
+        self.packing_seq_length = self.args.packing_seq_length     # Packing sequence length, if packing is enabled.
+        # Packing is on only for a positive packing buffer size.
+        self.is_packing_enabled = self.args.packing_buffer_size is not None and self.args.packing_buffer_size > 0
+
+        # Checked only when both lengths are set (truthy).
+        if self.dataloader_seq_length and self.packing_seq_length:
+            assert self.dataloader_seq_length >= self.packing_seq_length, "dataloader sequence length must be greater than or equal to the packing sequence length"
+
+        if self.is_packing_enabled:
+            assert self.packing_seq_length > 0, "packing sequence length must be set"
+
+        # Arguments are passed positionally.
+        # NOTE(review): confirm the meaning of the literal 1 against get_num_image_embeddings' signature.
+        self.num_image_embeddings_per_tile = get_num_image_embeddings(
+            self.args.img_h,
+            self.args.img_w,
+            self.args.patch_dim,
+            self.args.vision_model_type,
+            self.args.disable_vision_class_token,
+            1,
+            self.args.pixel_shuffle,
+            self.args.use_tile_tags,
+        )
+
+        # NOTE(review): presumably a text-to-token cache; no use is visible in this part of the class.
+        self.txt_to_token_dict = {}
+
+        self.img_h, self.img_w = self.args.img_h, self.args.img_w
+
+    def _get_total_seq_length(self, input_ids, num_tiles):
+        """Return the expected sequence length for the given text tokens and image tiles.
+
+        Every tile adds ``num_image_embeddings_per_tile`` positions, and one position is
+        subtracted per image (presumably the image placeholder token already counted in
+        ``input_ids`` -- confirm against the model).
+        """
+        image_embeddings_len = sum(num_tiles) * self.num_image_embeddings_per_tile
+        return len(input_ids) + image_embeddings_len - len(num_tiles)
+
+    def _truncate_for_packing(self, input_ids, target, num_tiles):
+        """Truncate tokens and labels so text plus image embeddings fit the packing sequence length.
+
+        Args:
+            input_ids: text token ids of the sample.
+            target: labels aligned with ``input_ids``; compared element-wise with IGNORE_INDEX.
+            num_tiles: number of tiles for each image of the sample.
+
+        Returns:
+            The ``(input_ids, target)`` pair, each cut to the text-token budget.
+
+        Raises:
+            ValueError: if the image embeddings leave no room for text tokens, or if every
+                label left after truncation is IGNORE_INDEX.
+        """
+        total_num_images = len(num_tiles)
+        total_num_tiles = sum(num_tiles)
+        total_img_embeddings_len = total_num_tiles * self.num_image_embeddings_per_tile
+        max_text_tokens = self.packing_seq_length - total_img_embeddings_len + total_num_images
+
+        # A non-positive budget must not reach the slices below: a negative stop index trims
+        # tokens from the END instead of truncating to the budget, which would silently return
+        # a sample that is still longer than the packing sequence length.
+        if max_text_tokens <= 0:
+            raise ValueError(
+                f"image embeddings ({total_img_embeddings_len}) leave no room for text tokens "
+                f"within the packing sequence length {self.packing_seq_length}"
+            )
+
+        input_ids = input_ids[:max_text_tokens]
+        target = target[:max_text_tokens]
+
+        # If truncate causes all labels to be ignored, then skip the sample
+        if (target == IGNORE_INDEX).all():
+            raise ValueError(f"all targets will be ignored after truncation: {input_ids}")
+
+        return input_ids, target
+
+    @stateless(restore_seeds=True)
+    def encode_sample(self, sample: Union[CaptioningSample, OCRSample, VQASample, SimilarityInterleavedSample]):
+        """Dispatch ``sample`` to the encoder for its type and yield the encoded result.
+
+        OCR samples are routed by substrings of the sample key, VQA samples by the key or the
+        ``is_llava_training`` subflavor.
+
+        Raises:
+            NotImplementedError: if the sample type is not handled.
+        """
+        if isinstance(sample, OCRSample):
+            if "pdfa" in sample.__key__:
+                task_type = 'encode_pdf'
+            elif "multi" in sample.__key__:
+                task_type = '_encode_ocr'
+            else:
+                task_type = 'encode_ocr_ref'
+            yield self.combined_ocr_encoder(sample, task_type=task_type)
+        elif isinstance(sample, CaptioningSample):
+            yield self.encode_captioning(sample)
+        elif isinstance(sample, VQASample):
+            is_llava_training = sample.__subflavors__.get("is_llava_training", False)
+
+            if "llava" in sample.__key__ or is_llava_training:
+                encoded = self.encode_llava_pretrain(sample)
+            else:
+                encoded = self.encode_any_single_turn_vqa(sample)
+            yield encoded
+        elif isinstance(sample, SimilarityInterleavedSample):
+            yield self.encode_llava_sft(sample)
+        elif isinstance(sample, MultiChoiceVQASample):
+            yield self.encode_any_single_turn_vqa(sample)
+        else:
+            raise NotImplementedError("Sample format not supported", sample)
+
+    def encode_captioning(self, sample: CaptioningSample):
+        """Encode a CaptioningSample as a one-turn conversation with a random captioning prompt.
+
+        When the ``SplitByLine`` subflavor is set, one line of the caption is picked at random.
+        Tokens and labels are truncated when packing is enabled.
+        """
+        args = self.args
+        subflavors = sample.__subflavors__
+        augment = subflavors.get("augmentation")
+
+        imgs = get_visual_transform(
+            sample.image,
+            self.img_h,
+            self.img_w,
+            args.use_tiling,
+            args.max_num_tiles,
+            args.use_thumbnail,
+            augment,
+            args.vision_model_type,
+        )
+        num_tiles = [len(imgs)]
+
+        # Pick one of the manual captioning prompts at random and put the image in front of it.
+        prompt_list = self.manual_prompts["CaptioningPretraining"]["raw"]
+        chosen_prompt = prompt_list[np.random.randint(len(prompt_list))]
+        cur_prompt = "<image>\n" + chosen_prompt + "\n"
+
+        caption = sample.caption.strip()
+        if subflavors.get("SplitByLine"):
+            caption = np.random.choice(caption.split('\n'))
+
+        # Note: no system message.
+        conv = [
+            {"role": "user", "content": cur_prompt},
+            {"role": "assistant", "content": caption},
+        ]
+
+        input_ids, target = self.tokenizer.tokenize_conversation(conv, True, False)
+        if self.is_packing_enabled:
+            input_ids, target = self._truncate_for_packing(input_ids, target, num_tiles)
+
+        return ImageTaskSample(
+            __key__=sample.__key__,
+            __restore_key__=sample.__restore_key__,
+            __subflavor__=None,
+            __subflavors__=subflavors,
+            imgs=imgs,
+            num_tiles=num_tiles,
+            tokens=torch.tensor(input_ids),
+            labels=torch.tensor(target),
+            total_len=self._get_total_seq_length(input_ids, num_tiles),
+        )
+
+    def encode_llava_pretrain(self, sample: VQASample):
+        """Encode a LLaVA-style pretraining sample.
+
+        The user turn holds only the image; ``sample.answers`` is the assistant turn.
+        Tokens and labels are truncated when packing is enabled.
+        """
+        args = self.args
+        subflavors = sample.__subflavors__
+        augment = subflavors.get("augmentation", False)
+
+        imgs = get_visual_transform(
+            sample.image,
+            self.img_h,
+            self.img_w,
+            args.use_tiling,
+            args.max_num_tiles,
+            args.use_thumbnail,
+            augment,
+            args.vision_model_type,
+        )
+        num_tiles = [len(imgs)]
+
+        # LLAVA training: the text prompt is replaced by just the image. No system message.
+        user_turn = {"role": "user", "content": "<image>\n"}
+        assistant_turn = {"role": "assistant", "content": sample.answers}
+        conv = [user_turn, assistant_turn]
+
+        input_ids, target = self.tokenizer.tokenize_conversation(conv, True, False)
+        if self.is_packing_enabled:
+            input_ids, target = self._truncate_for_packing(input_ids, target, num_tiles)
+
+        return ImageTaskSample(
+            __key__=sample.__key__,
+            __restore_key__=sample.__restore_key__,
+            __subflavor__=None,
+            __subflavors__=subflavors,
+            imgs=imgs,
+            num_tiles=num_tiles,
+            tokens=torch.tensor(input_ids),
+            labels=torch.tensor(target),
+            total_len=self._get_total_seq_length(input_ids, num_tiles),
+        )
+
+    def encode_llava_sft(self, sample: SimilarityInterleavedSample):
+        """Encode SFT sample.
+
+        Builds a conversation from ``sample.texts`` (a system prompt followed by the
+        human/gpt turns) together with the tiles of the first entry of ``sample.images``,
+        read as video frames or as a single image. Samples without image or video are
+        encoded as text only and get a "no-image-" prefix on their key.
+
+        Raises:
+            RuntimeError: if a turn's "from" field is neither "human" nor "gpt".
+        """
+        augment = sample.__subflavors__['augmentation'] if 'augmentation' in sample.__subflavors__ else False
+        has_video = sample.__subflavors__['has_video'] if 'has_video' in sample.__subflavors__ else False
+        has_image = sample.__subflavors__['has_image'] if 'has_image' in sample.__subflavors__ else False
+        # A non-empty images attribute counts as having an image even without the subflavor.
+        has_image = has_image or (hasattr(sample, "images") and len(sample.images) > 0)
+
+        if has_video:
+            # Grab the selected frames of the video as a tensor with shape
+            # fhwc: (num_frames, height, width, num_channels).
+            video_fhwc = sample.images[0].permute(0, 2, 3, 1)
+            # Evenly spaced frame indices from the first to the last frame.
+            selected_frames = torch.linspace(
+                0, video_fhwc.shape[0] - 1, self.args.num_frames).long()
+            video_frame_fhwc = video_fhwc[selected_frames]
+            imgs = []
+            for video_frame_hwc in video_frame_fhwc:
+                imgs += get_visual_transform(
+                    video_frame_hwc, self.img_h, self.img_w,
+                    self.args.use_tiling, self.args.max_num_tiles,
+                    self.args.use_thumbnail, augment, self.args.vision_model_type)
+            # The tiles of all selected frames are counted as one entry.
+            num_tiles = [len(imgs)]
+        elif has_image:
+            # Only the first image of the sample is used.
+            imgs = get_visual_transform(
+                sample.images[0], self.img_h, self.img_w, self.args.use_tiling, self.args.max_num_tiles, self.args.use_thumbnail, augment,
+                self.args.vision_model_type,
+            )
+            num_tiles = [len(imgs)]
+        else:
+            # NOTE(review): imgs and num_tiles are the same list object here; neither is mutated below.
+            imgs = num_tiles = []
+            # The key of the incoming sample is modified in place.
+            sample.__key__ = "{}-{}".format("no-image", sample.__key__)
+
+        conversation = []
+        # Note: Some tokenizers may ignore the system prompt.
+        conversation.append({"role": "system", "content": "Answer the questions."})
+
+        has_image_token = False
+
+        for text in sample.texts:
+            if IMAGE_TOKEN in text["value"]:
+                has_image_token = True
+
+            # Map the dataset's speaker names to chat roles.
+            if text["from"] == "human":
+                role = "user"
+            elif text["from"] == "gpt":
+                role = "assistant"
+            else:
+                raise RuntimeError(f"unexpected role {text['from']} in {sample.texts}")
+
+            turn = {"role": role, "content": text["value"]}
+            conversation.append(turn)
+
+        # If the sample contains an image but none of the user messages has an image token,
+        # then add it to the first user message.
+        if len(imgs) > 0 and not has_image_token:
+            for turn in conversation:
+                if turn["role"] == "user":
+                    turn["content"] = f"{IMAGE_TOKEN}\n" + turn["content"]
+                    break
+
+        input_ids, target = self.tokenizer.tokenize_conversation(conversation, True, False)
+
+        if self.is_packing_enabled:
+            input_ids, target = self._truncate_for_packing(input_ids, target, num_tiles)
+
+        return ImageTaskSample(
+            __key__=sample.__key__,
+            __restore_key__=sample.__restore_key__,
+            __subflavor__=None,
+            __subflavors__=sample.__subflavors__,
+            imgs=imgs,
+            num_tiles=num_tiles,
+            tokens=torch.tensor(input_ids),
+            labels=torch.tensor(target),
+            total_len=self._get_total_seq_length(input_ids, num_tiles),
+        )
+
+    def encode_any_single_turn_vqa(self, sample):
+        """Build a single-turn ImageTaskSample from a MultiChoiceVQA or VQA sample.
+
+        The image (or evenly spaced video frames when the 'has_video' subflavor
+        is set) goes through get_visual_transform, a prompt/answer pair is
+        derived from the sample type, and the resulting system/user/assistant
+        conversation is tokenized.
+
+        Args:
+            sample: A MultiChoiceVQASample or VQASample.
+
+        Returns:
+            ImageTaskSample with the image tiles, tile count, tokens, labels and
+            total sequence length.
+
+        Raises:
+            NotImplementedError: If the sample is of any other type.
+        """
+        subflavors = sample.__subflavors__
+        augment = subflavors.get('augmentation', False)
+        has_video = subflavors.get('has_video', False)
+
+        # Arguments shared by every get_visual_transform call below.
+        transform_args = (
+            self.img_h, self.img_w, self.args.use_tiling, self.args.max_num_tiles,
+            self.args.use_thumbnail, augment, self.args.vision_model_type,
+        )
+
+        if has_video:
+            # Reorder the video to fhwc (num_frames, height, width, num_channels)
+            # and keep self.args.num_frames evenly spaced frames.
+            frames_fhwc = sample.image.permute(0, 2, 3, 1)
+            frame_ids = torch.linspace(
+                0, frames_fhwc.shape[0] - 1, self.args.num_frames).long()
+            imgs = []
+            for frame_hwc in frames_fhwc[frame_ids]:
+                imgs.extend(get_visual_transform(frame_hwc, *transform_args))
+        else:
+            imgs = get_visual_transform(sample.image, *transform_args)
+
+        num_tiles = [len(imgs)]
+
+        if isinstance(sample, MultiChoiceVQASample):
+            cur_prompt = format_multichoice_question(sample.context, sample.choices)
+            cur_answer = format_multichoice_answer(sample.correct_choice_idx)
+        elif isinstance(sample, VQASample):
+            # Choose the pool of prompt templates for this sample.
+            if 'docvqa' in sample.__key__:
+                prompt_list = self.manual_prompts["VQASFT"]["docvqa"]
+            elif subflavors.get("VQASFT"):
+                prompt_list = self.manual_prompts["VQASFT"]["raw"]
+            else:
+                prompt_list = ["{}"]
+
+            template = prompt_list[np.random.randint(len(prompt_list))]
+            cur_prompt = template.format(sample.context)
+
+            answers = sample.answers
+            if isinstance(answers, list):
+                # Draw one answer, weighted by the normalized answer weights.
+                weights = np.array(sample.answer_weights).astype(np.float32)
+                weights = weights / np.sum(weights)
+                picked = np.random.choice(weights.shape[0], 1, p=weights)[0]
+                cur_answer = answers[picked]
+            else:
+                cur_answer = answers
+        else:
+            raise NotImplementedError("Unsupported data type provided", sample)
+
+        # The user turn must carry the image placeholder.
+        if "<image>" not in cur_prompt:
+            cur_prompt = "<image>\n" + cur_prompt
+
+        conversation = [
+            {"role": "system", "content": "Answer the questions."},
+            {"role": "user", "content": cur_prompt},
+            {"role": "assistant", "content": str(cur_answer)},
+        ]
+        input_ids, target = self.tokenizer.tokenize_conversation(conversation, True, False)
+
+        if self.is_packing_enabled:
+            input_ids, target = self._truncate_for_packing(input_ids, target, num_tiles)
+
+        total_len = self._get_total_seq_length(input_ids, num_tiles)
+        return ImageTaskSample(
+            __key__=sample.__key__,
+            __restore_key__=sample.__restore_key__,
+            __subflavor__=None,
+            __subflavors__=sample.__subflavors__,
+            imgs=imgs,
+            num_tiles=num_tiles,
+            tokens=torch.tensor(input_ids),
+            labels=torch.tensor(target),
+            total_len=total_len,
+        )
+
+    def combined_ocr_encoder(self, sample, task_type):
+        """Encode an OCR sample for one of the supported OCR task types.
+
+        Args:
+            sample: OCR sample providing `image`, `__key__`, `__restore_key__`
+                and `__subflavors__`, plus whatever the selected prompt
+                builder reads.
+            task_type: One of "encode_pdf", "encode_ocr_ref" or "_encode_ocr";
+                selects the prompt builder.
+
+        Returns:
+            ImageTaskSample with the image tiles, tile count, tokens, labels and
+            total sequence length.
+
+        Raises:
+            ValueError: If task_type is not one of the supported values.
+        """
+        augment = sample.__subflavors__.get('augmentation', False)
+
+        if task_type == "encode_pdf":
+            sample, cur_prompt, cur_answer = self.encode_pdf_prompt(sample)
+        elif task_type == "encode_ocr_ref":
+            sample, cur_prompt, cur_answer = self.encode_ocr_ref_prompt(sample)
+        elif task_type == "_encode_ocr":
+            sample, cur_prompt, cur_answer = self.encode_ocr_prompt(sample)
+        else:
+            # Fail early and clearly instead of hitting an unbound `cur_prompt` later.
+            raise ValueError(f"Unsupported OCR task type: {task_type!r}")
+
+        imgs = get_visual_transform(
+            sample.image, self.img_h, self.img_w, self.args.use_tiling, self.args.max_num_tiles,
+            self.args.use_thumbnail, augment, self.args.vision_model_type,
+        )
+        num_tiles = [len(imgs)]
+
+        conversation = [
+            {"role": "system", "content": "Answer the questions."},
+            {"role": "user", "content": cur_prompt},
+            {"role": "assistant", "content": str(cur_answer)},
+        ]
+
+        input_ids, target = self.tokenizer.tokenize_conversation(conversation, True, False)
+
+        if self.is_packing_enabled:
+            input_ids, target = self._truncate_for_packing(input_ids, target, num_tiles)
+
+        return ImageTaskSample(
+            __key__=sample.__key__,
+            __restore_key__=sample.__restore_key__,
+            __subflavor__=None,
+            __subflavors__=sample.__subflavors__,
+            imgs=imgs,
+            num_tiles=num_tiles,
+            tokens=torch.tensor(input_ids),
+            labels=torch.tensor(target),
+            total_len=self._get_total_seq_length(input_ids, num_tiles),
+        )
+
+    def encode_pdf_prompt(self, sample: OCRSample) -> "tuple[OCRSample, str, str]":
+        """Build the prompt and answer for a document-pretraining OCR sample.
+
+        A prompt is drawn at random from the "DocPretraining"/"raw" manual
+        prompts. The answer is the sample text with any "<image>" tags removed
+        and surrounding whitespace stripped; when the "SplitByLine" subflavor is
+        set, a single random line of that text is used instead.
+
+        Note: `sample.text` is updated in place (the "<image>" tags are removed).
+
+        Returns:
+            Tuple of (sample, prompt, answer).
+        """
+        prompt_list = self.manual_prompts["DocPretraining"]["raw"]
+        prompt_idx = np.random.randint(len(prompt_list))
+        cur_prompt = prompt_list[prompt_idx]
+        if "<image>" not in cur_prompt:
+            cur_prompt = "<image>\n" + cur_prompt
+
+        # Make sure there is no extra <image> tag.
+        sample.text = sample.text.replace("<image>", "")
+
+        caption = sample.text.strip()
+
+        if sample.__subflavors__.get("SplitByLine"):
+            caption = np.random.choice(caption.split('\n'))
+
+        return sample, cur_prompt, caption
+
+    def encode_ocr_ref_prompt(self, sample: OCRSample) -> "tuple[OCRSample, str, str]":
+        """Build a referring-grounding or grounded-OCR prompt/answer pair.
+
+        The sample text (with "<image>" tags removed) and its region
+        (`sample.words_boxes`, 4 values for a box or 8 for a quad) are turned
+        into one of two randomly chosen tasks: given the text, answer with the
+        region, or given the region, answer with the text.
+
+        Returns:
+            Tuple of (sample, prompt, answer).
+
+        Raises:
+            ValueError: If the region does not have 4 or 8 coordinates.
+        """
+        # Make sure there is no extra <image> tag
+        ref = sample.text.replace("<image>", "")
+        coords = sample.words_boxes
+
+        if len(coords) == 4:
+            region = f"<box>({coords[0]},{coords[1]}),({coords[2]},{coords[3]})</box>"
+        elif len(coords) == 8:
+            region = f"<quad>({coords[0]},{coords[1]}),({coords[2]},{coords[3]}),({coords[4]},{coords[5]}),({coords[6]},{coords[7]})</quad>"
+        else:
+            # Previously any other length raised IndexError or was silently truncated.
+            raise ValueError(f"Expected 4 or 8 region coordinates, got {len(coords)}")
+
+        # Randomly choose between two tasks
+        task_idx = np.random.randint(2)
+        if task_idx == 0:
+            # Referring Grounding
+            prompt_list = self.manual_prompts["DocPretraining"]["referring_grounding"]
+            prompt_content = ref
+            answer = region
+        else:
+            # Grounded OCR
+            prompt_list = self.manual_prompts["DocPretraining"]["grounded_ocr"]
+            prompt_content = region
+            answer = ref
+
+        prompt_idx = np.random.randint(len(prompt_list))
+        cur_prompt = prompt_list[prompt_idx].format(prompt_content)
+        if "<image>" not in cur_prompt:
+            cur_prompt = "<image>\n" + cur_prompt
+
+        return sample, cur_prompt, answer
+
+    def bbox_coord_to_label(self, text, bbox):
+        """Return `text` and its region as a "<ref>...</ref>" plus box/quad label string.
+
+        `bbox` must hold 4 values (rendered as a <box>) or 8 values (rendered as
+        a <quad>). Any "<image>" tag in `text` is removed first.
+        """
+        assert len(bbox) in (4, 8)
+
+        # Make sure there is no extra <image> tag
+        text = text.replace("<image>", "")
+
+        if len(bbox) != 4:
+            return f"<ref>{text}</ref><quad>({bbox[0]},{bbox[1]}),({bbox[2]},{bbox[3]}),({bbox[4]},{bbox[5]}),({bbox[6]},{bbox[7]})</quad>"
+
+        return f"<ref>{text}</ref><box>({bbox[0]},{bbox[1]}),({bbox[2]},{bbox[3]})</box>"
+
+    def encode_ocr_prompt(self, sample: OCRSample) -> "tuple[OCRSample, str, str]":
+        """Build the prompt and answer for a (multi-)word OCR sample.
+
+        `sample.words_boxes` is either one flat region (first element is an
+        int), labelled with `sample.text`, or a list of regions (first element
+        is a list), each labelled with the matching entry of
+        `sample.words_text`. The prompt is drawn at random from the
+        "DocPretraining"/"ocr_multi" manual prompts.
+
+        Returns:
+            Tuple of (sample, prompt, answer).
+
+        Raises:
+            ValueError: If `sample.words_boxes` has neither of the two
+                supported layouts.
+        """
+        boxes = sample.words_boxes
+        if isinstance(boxes[0], int):
+            answer = self.bbox_coord_to_label(sample.text, boxes)
+        elif isinstance(boxes[0], list):
+            answer = "".join(
+                self.bbox_coord_to_label(sample.words_text[i], bbox)
+                for i, bbox in enumerate(boxes)
+            )
+        else:
+            # Previously this fell through and failed later on the unbound `answer`.
+            raise ValueError(
+                f"Unsupported words_boxes element type: {type(boxes[0]).__name__}"
+            )
+
+        prompt_list = self.manual_prompts["DocPretraining"]["ocr_multi"]
+        prompt_idx = np.random.randint(len(prompt_list))
+        cur_prompt = prompt_list[prompt_idx]
+
+        if "<image>" not in cur_prompt:
+            cur_prompt = "<image>\n" + cur_prompt
+
+        return sample, cur_prompt, answer
+
+    def batch(self, samples: List[Union[ImageTaskSample, ImageTaskSamplePacked]]) -> ImageTaskBatchPacked:
+        """Collate encoded samples into a single padded ImageTaskBatchPacked.
+
+        Tokens are padded or truncated to the target sequence length and labels
+        to one position more. The images of all samples are stacked together.
+        """
+        # Stack images to [num_tiles, c, h, w]. If there are no images (text-only), then use a dummy image.
+        all_imgs = []
+        for s in samples:
+            all_imgs.extend(s.imgs)
+        if all_imgs:
+            imgs = torch.stack(all_imgs)
+        else:
+            imgs = torch.tensor([[0]], dtype=torch.float32)
+
+        # Without a user-defined dataloader sequence length, use the longest sample.
+        max_seq_len = self.dataloader_seq_length or max(len(s.tokens) for s in samples)
+
+        batch_size = len(samples)
+        pad_id = self.tokenizer.pad
+        tokens = np.full((batch_size, max_seq_len), pad_id, dtype=np.int64)
+        # Labels get one extra slot to accommodate the later shift to the left by one.
+        labels = np.full((batch_size, max_seq_len + 1), pad_id, dtype=np.int64)
+
+        for row, s in enumerate(samples):
+            # Anything beyond the target sequence length is truncated.
+            n_tokens = min(max_seq_len, len(s.tokens))
+            n_labels = min(max_seq_len + 1, len(s.labels))
+
+            tokens[row, :n_tokens] = s.tokens[:n_tokens]
+            labels[row, :n_labels] = s.labels[:n_labels]
+
+        tile_counts = [n for s in samples for n in s.num_tiles]
+        num_tiles = torch.tensor(tile_counts, dtype=torch.int32)
+        if len(num_tiles) == 0:
+            num_tiles = torch.tensor([[0]], dtype=torch.int32)
+
+        if self.is_packing_enabled:
+            # Cumulative sample lengths are needed for packing.
+            cu_lengths = torch.stack([s.cu_lengths for s in samples])
+            max_lengths = torch.tensor([s.max_length for s in samples], dtype=torch.int32)
+        else:
+            # Dummy values when packing is off.
+            cu_lengths = torch.tensor([[0]], dtype=torch.int32)
+            max_lengths = torch.tensor([[0]], dtype=torch.int32)
+
+        return ImageTaskBatchPacked(
+            __key__=[s.__key__ for s in samples],
+            __restore_key__=[s.__restore_key__ for s in samples],
+            __subflavor__=None,
+            __subflavors__=samples[0].__subflavors__,
+            tokens=tokens,
+            labels=labels,
+            imgs=imgs,
+            num_tiles=num_tiles,
+            cu_lengths=cu_lengths,
+            max_lengths=max_lengths,
+        )
+
+    def encode_batch(self, batch: ImageTaskBatchPacked) -> dict:
+        """Return the batch dataclass as a plain dict without its "__subflavors__" entry."""
+        as_dict = dataclasses.asdict(batch)
+        as_dict.pop("__subflavors__")
+        return as_dict
+
+    def select_samples_to_pack(self, samples: List[ImageTaskSample]) -> List[List[ImageTaskSample]]:
+        """Group samples into the sets that will each be packed into one sequence.
+
+        The grouping is delegated to greedy_knapsack, using every sample's
+        `total_len` and the configured packing sequence length.
+
+        NOTE: Energon dataloader calls this method internally if packing is used.
+        Please see https://nvidia.github.io/Megatron-Energon/packing.html
+        """
+        sample_lengths = [s.total_len for s in samples]
+        return greedy_knapsack(sample_lengths, samples, self.packing_seq_length)
+
+    @stateless
+    def pack_selected_samples(self, samples: List[ImageTaskSample]) -> List[ImageTaskSamplePacked]:
+        """
+        Pack a list of ImageTaskSample into a single ImageTaskSamplePacked.
+
+        Tokens and labels are concatenated in order, images are collected into
+        one list, and the cumulative lengths plus the longest sample length are
+        recorded.
+
+        NOTE: Energon dataloader calls this method internally if packing is used.
+        Please see https://nvidia.github.io/Megatron-Energon/packing.html
+
+        Args:
+            samples: List of ImageTaskSample instances to pack into one sample.
+
+        Returns:
+            ImageTaskSamplePacked instance.
+
+        Raises:
+            ValueError: If the samples together exceed the packing sequence length.
+        """
+        packing_seq_len = self.packing_seq_length
+
+        token_chunks, label_chunks, packed_imgs = [], [], []
+        cu_lengths = [0]
+        max_length = 0
+
+        for sample in samples:
+            sample_len = sample.total_len
+            max_length = max(max_length, sample_len)
+
+            # This should not happen: select_samples_to_pack is expected to have
+            # already ensured that the samples fit.
+            running_total = cu_lengths[-1] + sample_len
+            if running_total > packing_seq_len:
+                raise ValueError(f"Packed sample exceeds the maximum sequence length of {packing_seq_len}: {samples}")
+
+            token_chunks.append(sample.tokens)
+            label_chunks.append(sample.labels)
+            packed_imgs.extend(sample.imgs)
+            cu_lengths.append(running_total)
+
+        return ImageTaskSamplePacked(
+            __key__=",".join([s.__key__ for s in samples]),
+            __restore_key__=(),  # Will be set by energon based on `samples`
+            __subflavor__=None,
+            __subflavors__=samples[0].__subflavors__,
+            tokens=torch.cat(token_chunks, dim=0),
+            labels=torch.cat(label_chunks, dim=0),
+            imgs=packed_imgs,
+            cu_lengths=torch.tensor(cu_lengths, dtype=torch.int32),
+            max_length=max_length,
+            num_tiles=[n for s in samples for n in s.num_tiles],
+        )
+
+
+def print_error_handler(exc: Exception, key: Optional[str]):
+    """Report a dataloader exception for sample `key` on stderr, followed by the traceback."""
+    message = f"The following exception occurred in the dataloader for sample {key} and is skipped"
+    print(message, file=sys.stderr)
+    traceback.print_exc()
+
+
+def format_multichoice_question(question, multichoice_options):
+    """Return the question followed by lettered options (A, B, ...) and an answering instruction."""
+    option_lines = []
+    for position, option in enumerate(multichoice_options):
+        letter = chr(ord('A') + position)
+        option_lines.append("{}. {}\n".format(letter, option))
+
+    options_block = "".join(option_lines)
+    options_block = f"{options_block}Answer with the option's letter from the given choices directly."
+
+    return "{}\n{}".format(question, options_block)
+
+
+def format_multichoice_answer(idx):
+    """Return the option letter for a zero-based choice index (0 -> 'A', 1 -> 'B', ...)."""
+    first_letter_code = ord('A')
+    return chr(first_letter_code + idx)
--- a/examples/multimodal/evaluate_ai2d.py
+++ b/examples/multimodal/evaluate_ai2d.py
+import argparse
+import json
+
+from evaluate_mmmu import get_input_output_paths
+from evaluate_vqav2 import compute_vqa_accuracy
+
+
+def merge_input_files(input_path):
+    """Merge the AI2D result files into one JSON file the evaluator can read.
+
+    Each input line is a JSON record; only its sample id (stored as
+    "question_id"), "answer" and "gt_answer" are kept. Returns the path of the
+    merged file.
+    """
+    input_file_paths, output_file_path = get_input_output_paths(input_path, task="AI2D")
+
+    results = []
+    for path in input_file_paths:
+        with open(path, "r") as lines:
+            for record in map(json.loads, lines):
+                entry = {
+                    "question_id": record["sample_id"],
+                    "answer": record["answer"],
+                    "gt_answer": record["gt_answer"],
+                }
+                results.append(entry)
+
+    with open(output_file_path, "w") as merged:
+        json.dump(results, merged)
+
+    return output_file_path
+
+
+def ai2d_eval(input_path):
+    """Merge the AI2D result files and return the accuracy computed on them."""
+    merged_path = merge_input_files(input_path)
+    return compute_vqa_accuracy(merged_path, task="AI2D")
+
+
+if __name__ == "__main__":
+    # Command-line entry point: evaluate the results found at --input-path.
+    arg_parser = argparse.ArgumentParser()
+    arg_parser.add_argument('--input-path', type=str, help="Path to input file(s)")
+    cli_args = arg_parser.parse_args()
+
+    avg_acc = ai2d_eval(cli_args.input_path)
+    print(f"===== AI2D Accuracy {avg_acc:.2f}% =====")
--- a/examples/multimodal/evaluate_chartqa.py
+++ b/examples/multimodal/evaluate_chartqa.py
+import argparse
+import json
+
+from evaluate_mmmu import get_input_output_paths
+from evaluate_vqav2 import compute_vqa_accuracy
+
+
+def merge_input_files(input_path):
+    """Merge the ChartQA result files into one JSON file the evaluator can read.
+
+    Every input line is a JSON record; it is kept whole, with its "sample_id"
+    copied to "question_id". Returns the path of the merged file.
+    """
+    input_file_paths, output_file_path = get_input_output_paths(input_path, task="ChartQA")
+
+    results = []
+    for path in input_file_paths:
+        with open(path, "r") as lines:
+            for record in map(json.loads, lines):
+                record["question_id"] = record["sample_id"]
+                results.append(record)
+
+    with open(output_file_path, "w") as merged:
+        json.dump(results, merged)
+
+    return output_file_path
+
+
+def chartqa_eval(input_path):
+    """Merge the ChartQA result files and return the accuracy computed on them."""
+    merged_path = merge_input_files(input_path)
+    accuracy = compute_vqa_accuracy(merged_path, task="ChartQA")
+    return accuracy
+
+
+if __name__ == "__main__":
+    # Command-line entry point: evaluate the results found at --input-path.
+    arg_parser = argparse.ArgumentParser()
+    arg_parser.add_argument('--input-path', type=str, help="Path to input file(s)")
+    cli_args = arg_parser.parse_args()
+
+    avg_acc = chartqa_eval(cli_args.input_path)
+    print(f"ChartQA accuracy: {avg_acc:.2f}")
--- a/examples/multimodal/evaluate_coco.py
+++ b/examples/multimodal/evaluate_coco.py
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+import argparse
+import json
+
+from evaluate_mmmu import get_input_output_paths
+from pycocoevalcap.eval import COCOEvalCap
+from pycocotools.coco import COCO
+
+
+def convert_to_coco_format(input_path):
+    """Write the captioning results as a COCO-style result file and return its path.
+
+    Each input line is a JSON record. Its caption is lower-cased after
+    trailing periods are removed, and stored under the record's sample id as
+    "image_id".
+    """
+    input_file_paths, output_file_path = get_input_output_paths(input_path, task="captioning")
+
+    captions = []
+    for path in input_file_paths:
+        with open(path, "r") as lines:
+            for record in map(json.loads, lines):
+                image_id = record['sample_id']
+                text = record['caption'].rstrip('.').lower()
+                captions.append({"image_id": image_id, "caption": text})
+
+    with open(output_file_path, "w") as merged:
+        json.dump(captions, merged, indent=4)
+
+    return output_file_path
+
+
+def coco_captioning_eval(input_path, groundtruth_file):
+    """Score the captions at `input_path` against `groundtruth_file` and print every metric."""
+    ground_truth = COCO(groundtruth_file)
+    predictions = ground_truth.loadRes(convert_to_coco_format(input_path))
+
+    evaluator = COCOEvalCap(ground_truth, predictions)
+
+    # Evaluate on the input subset of images.
+    evaluator.params["image_id"] = predictions.getImgIds()
+
+    evaluator.evaluate()
+
+    print("========== COCO captioning scores ==========")
+    for metric in evaluator.eval:
+        score = evaluator.eval[metric]
+        print(f"{metric} {score * 100:.3f}")
+
+
+if __name__ == "__main__":
+    # Command-line entry point: both paths are mandatory.
+    arg_parser = argparse.ArgumentParser()
+    arg_parser.add_argument("--input-path", type=str, required=True, help="Path to input file(s)")
+    arg_parser.add_argument(
+        "--groundtruth-path", type=str, required=True, help="Path to groundtruth file"
+    )
+    cli_args = arg_parser.parse_args()
+
+    coco_captioning_eval(cli_args.input_path, cli_args.groundtruth_path)
--- a/examples/multimodal/evaluate_mathvista.py
+++ b/examples/multimodal/evaluate_mathvista.py
+import argparse
+import json
+import re
+
+from evaluate_mmmu import get_input_output_paths
+from MMMU.mmmu.utils.eval_utils import parse_multi_choice_response
+from open_flamingo.eval.vqa_metric import VQAEval
+
+
+def merge_input_files(input_path):
+    """Merge the MathVista result files into one JSON file the evaluator can read.
+
+    Every line of every input file is parsed as JSON and collected unchanged.
+    Returns the path of the merged file.
+    """
+    input_file_paths, output_file_path = get_input_output_paths(input_path, task="MathVista")
+
+    results = []
+    for path in input_file_paths:
+        with open(path, "r") as lines:
+            results.extend(json.loads(line) for line in lines)
+
+    with open(output_file_path, "w") as merged:
+        json.dump(results, merged)
+
+    return output_file_path
+
+
+def extra_processing(text):
+    """Normalize a numeric answer string before comparison.
+
+    If `text` is a plain decimal number (digits, a dot, digits), its
+    fractional part is dropped when it is all zeros and otherwise cut to at
+    most two digits. Afterwards every "%" is removed and one trailing "." is
+    stripped.
+
+    Args:
+        text: Answer string; may be empty.
+
+    Returns:
+        The normalized string.
+    """
+    # Max decimal point capped to 2 decimal point
+    decimal = re.match(r'^\d+\.\d+$', text)
+
+    if decimal is not None:
+        number = decimal.group(0)
+        whole, fraction = number.split(".")
+
+        # if decimal values are all 0, trim them
+        if sum(int(d) for d in fraction) == 0:
+            text = whole
+        else:
+            text = number[: len(whole) + 3]
+
+    # remove % and trailing .
+    text = text.replace("%", "")
+    # endswith() is safe on an empty string, unlike text[-1].
+    if text.endswith("."):
+        text = text[:-1]
+
+    return text
+
+
+def extract_answer(text):
+    """Pull the number out of an "answer is <number>" phrase, if there is one.
+
+    Text without any letters, or containing "e+", is returned unchanged, and so
+    is text that has no "answer is <number>" phrase.
+    """
+    has_letters = len(re.findall(r'[a-zA-Z]+', text)) > 0
+    if not has_letters or "e+" in text:
+        return text
+
+    phrases = re.findall(r'answer is -*\d+\.*\d*', text)
+    if len(phrases) == 0:
+        return text
+
+    text = phrases[0]
+    numbers = re.findall(r'-*\d+\.*\d*', text)
+    if len(numbers) > 0:
+        text = numbers[0]
+
+    return text
+
+
+def compute_mathvista_accuracy(result_file):
+    """Compute the MathVista accuracy, in percent, of a merged result file.
+
+    Multi-choice predictions are parsed with parse_multi_choice_response. All
+    other predictions and their ground truths go through the VQAEval
+    punctuation and digit/article processing plus extra_processing
+    (predictions additionally through extract_answer). A sample counts as
+    correct when prediction and ground truth are then equal.
+
+    Args:
+        result_file: Path to the merged JSON file (a list of result records).
+
+    Returns:
+        Percentage of correct samples.
+    """
+    # Use a context manager so the file handle is closed right after loading.
+    with open(result_file) as results_fp:
+        merged_results = json.load(results_fp)
+
+    vqa = VQAEval(vqa=None, vqaRes=None)
+    acc = 0
+    for res in merged_results:
+        pred_ans = res["answer"]
+        is_multi_choice = res["question_type"] == "multi_choice"
+
+        if is_multi_choice:
+            pred_ans = parse_multi_choice_response(pred_ans, res["all_choices"], res["index2ans"])
+        else:
+            pred_ans = vqa.processPunctuation(pred_ans)
+            pred_ans = vqa.processDigitArticle(pred_ans)
+            # Extra processing and extraction.
+            pred_ans = extra_processing(pred_ans)
+            pred_ans = extract_answer(pred_ans)
+
+        gt_ans = res["gt_answer"]
+        if isinstance(gt_ans, list):
+            assert len(gt_ans) == 1, f"Expected 1 groundtruth, got {gt_ans}"
+            gt_ans = gt_ans[0]
+
+        if not is_multi_choice:
+            gt_ans = vqa.processPunctuation(gt_ans)
+            gt_ans = vqa.processDigitArticle(gt_ans)
+
+            gt_ans = extra_processing(gt_ans)
+
+        if pred_ans == gt_ans:
+            acc += 1
+    acc = acc / len(merged_results) * 100
+    return acc
+
+
+def mathvista_eval(input_path):
+    """Merge the MathVista result files and return the accuracy computed on them."""
+    merged_path = merge_input_files(input_path)
+    return compute_mathvista_accuracy(merged_path)
+
+
+if __name__ == "__main__":
+    # Command-line entry point: evaluate the results found at --input-path.
+    arg_parser = argparse.ArgumentParser()
+    arg_parser.add_argument('--input-path', type=str, help="Path to input file(s)")
+    cli_args = arg_parser.parse_args()
+
+    acc = mathvista_eval(cli_args.input_path)
+    print(f"===== MathVista accuracy: {acc} =====")
--- a/examples/multimodal/evaluate_mmmu.py
+++ b/examples/multimodal/evaluate_mmmu.py
+import argparse
+import glob
+import json
+import os
+import re
+import subprocess
+
+from run_text_generation import get_output_path
+from config import EvaluationConfig
+
+
+def get_input_output_paths(input_path, task):
+    """Get all input files and an output path for a merged file.
+
+    Args:
+        input_path: Either the path of one existing result file, or the output
+            path prefix that was used for generation (then all partitions and
+            dp ranks for `task` are collected with a glob pattern).
+        task: Task name, used to build the glob pattern and the merged name.
+
+    Returns:
+        Tuple of (list of input file paths, path of the merged output file).
+    """
+    # Single input file.
+    if os.path.exists(input_path):
+        input_file_paths = [input_path]
+        # Swap the extension rather than str.replace(".jsonl", ...): with a
+        # path that has no ".jsonl" the old code returned the input path itself
+        # and the merge step then overwrote the input file.
+        root, _ = os.path.splitext(input_path)
+        output_file_path = root + "-merged.json"
+    # Select multiple partitions and dp ranks.
+    else:
+        cfg = EvaluationConfig(task=task, output_path=input_path, partition_id="*")
+        pattern = get_output_path(cfg, dp_rank="*")
+        input_file_paths = glob.glob(pattern)
+
+        output_file_path = input_path + f"-{task}-merged.json"
+
+    return input_file_paths, output_file_path
+
+
+def convert_to_mmmu_format(input_path):
+    """Write the MMMU results as a sample_id -> prediction JSON file and return its path.
+
+    Predictions of "multiple-choice" questions are first parsed with the MMMU
+    repo's parse_multi_choice_response.
+    """
+    input_file_paths, output_file_path = get_input_output_paths(input_path, "MMMU")
+
+    # MMMU eval script expects just a sample_id to prediction mapping.
+    predictions = {}
+
+    for path in input_file_paths:
+        with open(path, "r") as lines:
+            for record in map(json.loads, lines):
+                sample_id = record["sample_id"]
+                prediction = record["prediction"]
+
+                if record["question_type"] == "multiple-choice":
+                    # Imported lazily, only once a multiple-choice record shows up.
+                    from MMMU.mmmu.utils.eval_utils import parse_multi_choice_response
+
+                    prediction = parse_multi_choice_response(
+                        prediction, record["all_choices"], record["index2ans"]
+                    )
+
+                predictions[sample_id] = prediction
+
+    with open(output_file_path, "w") as merged:
+        json.dump(predictions, merged)
+
+    return output_file_path
+
+
+def mmmu_eval(input_path, groundtruth_path):
+    """Run the MMMU evaluation script and return the overall accuracy in percent.
+
+    The results are converted to the MMMU format, the MMMU repo's
+    main_eval_only.py script is launched as a subprocess, and the overall
+    accuracy is parsed from its standard output.
+
+    Args:
+        input_path: Path to the input file(s), see get_input_output_paths.
+        groundtruth_path: Path to the MMMU groundtruth (answer) file.
+
+    Returns:
+        Overall accuracy multiplied by 100.
+
+    Raises:
+        RuntimeError: If the overall accuracy is not found in the script output.
+    """
+    result_file = convert_to_mmmu_format(input_path)
+
+    # The MMMU repo has a script for running the actual evaluation but no API. So launching the script here.
+    output = subprocess.run(
+        [
+            "python",
+            "examples/multimodal/MMMU/mmmu/main_eval_only.py",
+            "--output_path",
+            result_file,
+            "--answer_path",
+            groundtruth_path,
+        ],
+        capture_output=True,
+        text=True,
+    )
+
+    print(output.stderr)
+    print(output.stdout)
+
+    # Raw string so "\d" is a regex escape, and "\." so only a literal
+    # decimal point is accepted.
+    m = re.search(r"'Overall': \{'num': \d+, 'acc': (\d\.\d+)\}", output.stdout)
+    if m is None:
+        raise RuntimeError(
+            "Could not find the overall accuracy in the MMMU evaluation output "
+            f"(exit code {output.returncode})."
+        )
+
+    return float(m.group(1)) * 100.0
+
+
+def main():
+    """Parse the command line, run the MMMU evaluation and print the average accuracy."""
+    arg_parser = argparse.ArgumentParser()
+    arg_parser.add_argument("--input-path", type=str, required=True, help="Path to input file(s)")
+    arg_parser.add_argument(
+        "--groundtruth-path",
+        type=str,
+        # Using the validation groundtruth file from the MMMU repo by default.
+        # This assumes you have cloned the MMMU github repo here.
+        default="examples/multimodal/MMMU/mmmu/answer_dict_val.json",
+        help="Path to groundtruth file. Defaults to the validation file in the MMMU repo.",
+    )
+    cli_args = arg_parser.parse_args()
+
+    avg_acc = mmmu_eval(cli_args.input_path, cli_args.groundtruth_path)
+    print(f"MMMU average accuracy: {avg_acc:.2f}")
+
+
+# Run the evaluation only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
--- a/examples/multimodal/evaluate_ocrbench.py
+++ b/examples/multimodal/evaluate_ocrbench.py
+import argparse
+import json
+
+from evaluate_mmmu import get_input_output_paths
+
+
+def merge_input_files(input_path):
+    """Merge the OCRBench result files into one JSON file the evaluator can read.
+
+    Every line of every input file is parsed as JSON and collected unchanged.
+    Returns the path of the merged file.
+    """
+    input_file_paths, output_file_path = get_input_output_paths(input_path, task="OCRBench")
+
+    results = []
+    for path in input_file_paths:
+        with open(path, "r") as lines:
+            results.extend(json.loads(line) for line in lines)
+
+    with open(output_file_path, "w") as merged:
+        json.dump(results, merged)
+
+    return output_file_path
+
+
+def _normalize_ocr_text(text, remove_spaces):
+    """Normalize a prediction or answer for OCRBench substring matching.
+
+    With `remove_spaces` (used for the HME100k dataset) the text is stripped
+    and all newlines and spaces are removed; otherwise it is lower-cased,
+    stripped, and newlines are turned into spaces.
+    """
+    if remove_spaces:
+        return text.strip().replace("\n", " ").replace(" ", "")
+    return text.lower().strip().replace("\n", " ")
+
+
+def compute_ocrbench_score(result_file):
+    """Compute OCRBench score.
+
+    A sample scores one point in its "data_type" category when any of its
+    ground-truth answers is contained in the (normalized) prediction.
+
+    Args:
+        result_file: Path to the merged JSON file (a list of result records
+            with "answer", "gt_answer", "dataset_name" and "data_type").
+
+    Returns:
+        Tuple of (formatted result log, final score).
+    """
+    # Use a context manager so the file handle is closed right after loading.
+    with open(result_file) as results_fp:
+        merged_results = json.load(results_fp)
+
+    # OCRBench score calculation is adopted from https://github.com/Yuliang-Liu/MultimodalOCR/blob/1b7713f44c91f30f64efb6d3e494c416861ef15f/example.py#L1
+    # MIT License. Copyright (c) 2023 Yuliang Liu
+    score = {
+        "Regular Text Recognition": 0,
+        "Irregular Text Recognition": 0,
+        "Artistic Text Recognition": 0,
+        "Handwriting Recognition": 0,
+        "Digit String Recognition": 0,
+        "Non-Semantic Text Recognition": 0,
+        "Scene Text-centric VQA": 0,
+        "Doc-oriented VQA": 0,
+        "Key Information Extraction": 0,
+        "Handwritten Mathematical Expression Recognition": 0,
+    }
+
+    for res in merged_results:
+        answers = res["gt_answer"]
+        ocr_type = res["data_type"]
+        remove_spaces = res["dataset_name"] == "HME100k"
+
+        predict = _normalize_ocr_text(res["answer"], remove_spaces)
+        if not isinstance(answers, list):
+            answers = [answers]
+
+        # Count each sample at most once, even when several ground-truth
+        # answers match; the per-category totals assume one point per sample.
+        if any(_normalize_ocr_text(answer, remove_spaces) in predict for answer in answers):
+            score[ocr_type] += 1
+
+    recognition_score = (
+        score['Regular Text Recognition']
+        + score['Irregular Text Recognition']
+        + score['Artistic Text Recognition']
+        + score['Handwriting Recognition']
+        + score['Digit String Recognition']
+        + score['Non-Semantic Text Recognition']
+    )
+    final_score = (
+        recognition_score
+        + score['Scene Text-centric VQA']
+        + score['Doc-oriented VQA']
+        + score['Key Information Extraction']
+        + score['Handwritten Mathematical Expression Recognition']
+    )
+    result_log = f"""###########################OCRBench##############################
+Text Recognition(Total 300): {recognition_score}
+------------------Details of Recognition Score-------------------
+Regular Text Recognition(Total 50): {score['Regular Text Recognition']}
+Irregular Text Recognition(Total 50): {score['Irregular Text Recognition']}
+Artistic Text Recognition(Total 50): {score['Artistic Text Recognition']}
+Handwriting Recognition(Total 50): {score['Handwriting Recognition']}
+Digit String Recognition(Total 50): {score['Digit String Recognition']}
+Non-Semantic Text Recognition(Total 50): {score['Non-Semantic Text Recognition']}
+----------------------------------------------------------------
+Scene Text-centric VQA(Total 200): {score['Scene Text-centric VQA']}
+----------------------------------------------------------------
+Doc-oriented VQA(Total 200): {score['Doc-oriented VQA']}
+----------------------------------------------------------------
+Key Information Extraction(Total 200): {score['Key Information Extraction']}
+----------------------------------------------------------------
+Handwritten Mathematical Expression Recognition(Total 100): {score['Handwritten Mathematical Expression Recognition']}
+----------------------Final Score-------------------------------
+Final Score(Total 1000): {final_score}"""
+
+    return result_log, final_score
+
+
+def ocrbench_eval(input_path):
+    """Merge the OCRBench result files and score the merged file.
+
+    Returns the `(result_log, score)` pair produced by `compute_ocrbench_score`.
+    """
+    merged_path = merge_input_files(input_path)
+    report, total = compute_ocrbench_score(merged_path)
+    return report, total
+
+
+if __name__ == "__main__":
+    # Command-line entry point: score the given result file(s) and print the report.
+    arg_parser = argparse.ArgumentParser()
+    arg_parser.add_argument('--input-path', type=str, help="Path to input file(s)")
+    cli_args = arg_parser.parse_args()
+
+    # Only the formatted report is printed; the numeric score is discarded.
+    report, _ = ocrbench_eval(cli_args.input_path)
+
+    print(report)
--- a/examples/multimodal/evaluate_textvqa.py
+++ b/examples/multimodal/evaluate_textvqa.py
+import argparse
+import json
+
+from evaluate_mmmu import get_input_output_paths
+from evaluate_vqav2 import compute_vqa_accuracy
+
+
+def merge_input_files(input_path):
+    """Collect TextVQA generation outputs into a single JSON file for the evaluator.
+
+    Every input file is read as JSON lines. Each record is reduced to its
+    `question_id` (taken from `sample_id`), `answer` and `gt_answer` fields.
+
+    Returns:
+        Path of the merged JSON file that was written.
+    """
+    source_paths, merged_path = get_input_output_paths(input_path, task="TextVQA")
+
+    merged = []
+    for source_path in source_paths:
+        with open(source_path, "r") as source:
+            records = (json.loads(raw_line) for raw_line in source)
+            merged.extend(
+                {
+                    "question_id": record["sample_id"],
+                    "answer": record["answer"],
+                    "gt_answer": record["gt_answer"],
+                }
+                for record in records
+            )
+
+    # Records keep the order in which files and lines were read; no sorting is applied.
+    with open(merged_path, "w") as sink:
+        json.dump(merged, sink)
+
+    return merged_path
+
+
+def textvqa_eval(input_path):
+    """Merge the TextVQA result files and return the average accuracy."""
+    merged_path = merge_input_files(input_path)
+    return compute_vqa_accuracy(merged_path, task="TextVQA")
+
+
+if __name__ == "__main__":
+    # Command-line entry point: evaluate the given result file(s) and print the accuracy.
+    arg_parser = argparse.ArgumentParser()
+    arg_parser.add_argument('--input-path', type=str, help="Path to input file(s)")
+    cli_args = arg_parser.parse_args()
+
+    avg_acc = textvqa_eval(cli_args.input_path)
+
+    print(f"===== TextVQA Accuracy {avg_acc:.2f}% =====")
--- a/examples/multimodal/evaluate_vqav2.py
+++ b/examples/multimodal/evaluate_vqav2.py
+import argparse
+import json
+
+from evaluate_mmmu import get_input_output_paths
+from open_flamingo.eval.vqa_metric import VQAEval
+
+
+def merge_input_files(input_path):
+    """Collect VQAv2 generation outputs into a single JSON file for the evaluator.
+
+    Every input file is read as JSON lines. Each record is kept in full and
+    additionally exposes its `sample_id` value under the `question_id` key.
+
+    Returns:
+        Path of the merged JSON file that was written.
+    """
+    source_paths, merged_path = get_input_output_paths(input_path, task="VQAv2")
+
+    merged = []
+    for source_path in source_paths:
+        with open(source_path, "r") as source:
+            for raw_line in source:
+                record = json.loads(raw_line)
+                merged.append({**record, "question_id": record["sample_id"]})
+
+    with open(merged_path, "w") as sink:
+        json.dump(merged, sink)
+
+    return merged_path
+
+
+def is_number(n: str):
+    """Return True when `float(n)` succeeds, False when it raises ValueError."""
+    try:
+        float(n)
+    except ValueError:
+        return False
+    else:
+        return True
+
+
+def compute_vqa_accuracy(result_file, task):
+    """Compute the average accuracy (in percent) over a merged results file.
+
+    Args:
+        result_file: Path to a JSON file holding a list of records, each with an
+            "answer" (the prediction) and a "gt_answer" (list of ground truths).
+        task: One of "ChartQA", "VQAv2", "TextVQA" or "AI2D"; selects the metric.
+
+    Returns:
+        Mean per-sample accuracy multiplied by 100.
+
+    Raises:
+        NotImplementedError: If `task` is not one of the supported tasks.
+        ZeroDivisionError: If the results file contains no records.
+    """
+    # Close the results file deterministically instead of leaking the handle.
+    with open(result_file) as f:
+        merged_results = json.load(f)
+
+    vqa = VQAEval(vqa=None, vqaRes=None)
+    all_acc = []
+    for res in merged_results:
+        # Predictions and ground truths go through the same normalization.
+        pred = res["answer"]
+        pred = vqa.processPunctuation(pred)
+        pred = vqa.processDigitArticle(pred)
+
+        gt = res["gt_answer"]
+        gt = [vqa.processPunctuation(ans) for ans in gt]
+        gt = [vqa.processDigitArticle(ans) for ans in gt]
+
+        # ChartQA uses relaxed accuracy:
+        # "We consider an answer to be correct if it is within 5% of the gold answer.
+        #  For non-numeric answers, we still need an exact match to consider an answer to be correct."
+        if task == "ChartQA":
+            acc = 0.0
+            assert len(gt) == 1, "expected exactly one groundtruth answer."
+            gt = gt[0]
+
+            if is_number(pred) and is_number(gt):
+                pred = float(pred)
+                gt = float(gt)
+                # Sort the bounds so the 5% band is also valid for a negative
+                # gold answer, where gt * 0.95 is greater than gt * 1.05.
+                lower, upper = sorted((gt * 0.95, gt * 1.05))
+                if lower <= pred <= upper:
+                    acc = 1.0
+            elif pred == gt:
+                acc = 1.0
+
+            all_acc.append(acc)
+        elif task in ("VQAv2", "TextVQA"):
+            # Full credit once at least three ground-truth answers match the prediction.
+            num_match = sum([pred == ans for ans in gt])
+            acc = min(1.0, num_match / 3.0)
+            all_acc.append(acc)
+        elif task == "AI2D":
+            assert len(gt) == 1, f"Expected exactly 1 GT, got {gt}"
+            acc = pred == gt[0]
+            all_acc.append(acc)
+        else:
+            raise NotImplementedError(f"unknown task {task}")
+
+    acc_avg = sum(all_acc) / len(all_acc) * 100
+
+    return acc_avg
+
+
+def vqav2_eval(input_path):
+    """Merge the VQAv2 result files and return the average accuracy."""
+    merged_path = merge_input_files(input_path)
+    return compute_vqa_accuracy(merged_path, task="VQAv2")
+
+
+if __name__ == "__main__":
+    # Command-line entry point: evaluate the given result file(s) and print the accuracy.
+    arg_parser = argparse.ArgumentParser()
+    arg_parser.add_argument('--input-path', type=str, help="Path to input file(s)")
+    cli_args = arg_parser.parse_args()
+
+    avg_acc = vqav2_eval(cli_args.input_path)
+
+    print(f"===== VQAv2 Accuracy {avg_acc:.2f}% =====")
--- a/examples/multimodal/evaluation_datasets.py
+++ b/examples/multimodal/evaluation_datasets.py
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+"""Evaluation datasets."""
+import glob
+import itertools
+import json
+import os
+import re
+from collections import defaultdict
+
+import numpy as np
+import torch
+from image_processing import get_visual_transform
+from PIL import Image
+
+from megatron.training import print_rank_0
+
+
+def _get_partition_bounds(
+    total_num_samples, num_samples_per_partition, num_partitions, partition_id
+):
+    """Return the `(lower, upper)` slice bounds of partition `partition_id`.
+
+    With a non-zero `num_samples_per_partition` every partition has that fixed
+    size. With zero, `total_num_samples` is divided into `num_partitions`
+    ranges whose edges come from `np.linspace`, truncated to int.
+    """
+    if num_samples_per_partition != 0:
+        lower = num_samples_per_partition * partition_id
+        upper = num_samples_per_partition * (partition_id + 1)
+        return lower, upper
+
+    edges = np.linspace(0, total_num_samples, num_partitions + 1)
+    return int(edges[partition_id]), int(edges[partition_id + 1])
+
+
+class VQADataset(torch.utils.data.Dataset):
+    """VQA evaluation dataset.
+
+    Reads annotations from a JSON file (optionally wrapped in a top-level
+    "data" key) and serves samples as
+    `(tiles, tile_count, sample_id, question, answer, metadata)`.
+    `keys` maps the logical names "image_id", "question", "answer" and,
+    optionally, "sample_id" to the field names used by the annotation file.
+    """
+
+    def __init__(
+        self,
+        input_image_path,
+        gt_path,
+        num_samples_per_partition,
+        num_partitions,
+        partition_id,
+        keys,
+        img_h,
+        img_w,
+        use_tiling,
+        max_num_tiles,
+        use_thumbnail,
+        vision_model_type,
+    ):
+        # Close the annotation file deterministically instead of leaking the handle.
+        with open(gt_path, encoding='utf-8') as gt_file:
+            samples = json.load(gt_file)
+        if "data" in samples:
+            samples = samples["data"]
+
+        # Optionally, process only a subset of the input files.
+        if num_partitions > 0:
+            lb, ub = _get_partition_bounds(
+                len(samples), num_samples_per_partition, num_partitions, partition_id
+            )
+            samples = samples[lb:ub]
+
+        self._keys = keys
+        self._samples = samples
+        self._input_image_path = input_image_path
+        self._img_h = img_h
+        self._img_w = img_w
+        self._use_tiling = use_tiling
+        self._max_num_tiles = max_num_tiles
+        self._use_thumbnail = use_thumbnail
+        self._vision_model_type = vision_model_type
+
+    def __len__(self):
+        return len(self._samples)
+
+    def __getitem__(self, idx):
+        sample = self._samples[idx]
+
+        img_file = "{}/{}".format(self._input_image_path, sample[self._keys["image_id"]])
+        if not os.path.exists(img_file):
+            # The annotation may omit the file extension: try ".jpg", then ".png".
+            # Build both candidates from the extension-less path so that a ".jpg"
+            # elsewhere in the path (e.g. in a directory name) is never rewritten.
+            base_path = img_file
+            img_file = base_path + ".jpg"
+
+            if not os.path.exists(img_file):
+                img_file = base_path + ".png"
+
+        img = Image.open(img_file)
+        imgs = get_visual_transform(
+            img,
+            self._img_h,
+            self._img_w,
+            self._use_tiling,
+            self._max_num_tiles,
+            self._use_thumbnail,
+            augment=False,
+            vision_model_type=self._vision_model_type,
+        )
+        tile_count = torch.tensor([len(imgs)], dtype=torch.int)
+
+        # Fall back to the positional index when no sample id field is configured.
+        sample_id = idx
+        if "sample_id" in self._keys:
+            sample_id = sample[self._keys["sample_id"]]
+
+        metadata = ""  # Not used.
+
+        return (
+            torch.stack(imgs),
+            tile_count,
+            sample_id,
+            sample[self._keys["question"]],
+            sample[self._keys["answer"]],
+            metadata,
+        )
+
+
+class CaptioningDataset(torch.utils.data.Dataset):
+    """Captioning evaluation dataset.
+
+    Iterates over the (sorted) files found directly under `input_image_path`
+    and pairs each image with the captions listed for its id under the
+    "annotations" key of the ground-truth JSON file.
+    """
+
+    def __init__(
+        self,
+        input_image_path,
+        gt_path,
+        num_samples_per_partition,
+        num_partitions,
+        partition_id,
+        img_h,
+        img_w,
+        use_tiling,
+        max_num_tiles,
+        use_thumbnail,
+        vision_model_type,
+    ):
+        image_files = sorted(glob.glob(input_image_path + "/*"))
+
+        # Optionally, process only a subset of the input files.
+        if num_partitions > 0:
+            lb, ub = _get_partition_bounds(
+                len(image_files), num_samples_per_partition, num_partitions, partition_id
+            )
+            image_files = image_files[lb:ub]
+
+        # Close the annotation file deterministically instead of leaking the handle.
+        with open(gt_path) as gt_file:
+            gts = json.load(gt_file)
+
+        # Group all captions by image id.
+        answers = defaultdict(list)
+        for gt in gts["annotations"]:
+            answers[gt["image_id"]].append(gt['caption'])
+
+        self._image_files = image_files
+        self._answers = answers
+        self._img_h = img_h
+        self._img_w = img_w
+        self._use_tiling = use_tiling
+        self._max_num_tiles = max_num_tiles
+        self._use_thumbnail = use_thumbnail
+        self._vision_model_type = vision_model_type
+
+    def __len__(self):
+        return len(self._image_files)
+
+    def __getitem__(self, idx):
+        img_file = self._image_files[idx]
+        # The image id is the integer after the last "_" of the file name. Parse
+        # only the base name so underscores in directory names cannot interfere.
+        image_id = int(os.path.basename(img_file).split("_")[-1].split(".")[0])
+
+        img = Image.open(img_file)
+        imgs = get_visual_transform(
+            img,
+            self._img_h,
+            self._img_w,
+            self._use_tiling,
+            self._max_num_tiles,
+            self._use_thumbnail,
+            augment=False,
+            vision_model_type=self._vision_model_type,
+        )
+
+        tile_count = torch.tensor([len(imgs)], dtype=torch.int)
+
+        question = ""  # Fixed for all samples.
+        metadata = ""  # Not used.
+
+        return torch.stack(imgs), tile_count, image_id, question, self._answers[image_id], metadata
+
+
+class MMMUDataset(torch.utils.data.Dataset):
+    """MMMU evaluation dataset.
+
+    Loads the validation split of every MMMU subject, from a local copy under
+    `input_image_path` when that path exists and otherwise from the "MMMU/MMMU"
+    HuggingFace dataset. Samples are served as
+    `(tiles, tile_count, id, prompt, answer, metadata)`.
+    """
+
+    def __init__(
+        self,
+        input_image_path,
+        num_samples_per_partition,
+        num_partitions,
+        partition_id,
+        img_h,
+        img_w,
+        use_tiling,
+        max_num_tiles,
+        use_thumbnail,
+        single_image,
+        vision_model_type,
+    ):
+        import datasets
+        from MMMU.mmmu.utils.data_utils import CAT_SHORT2LONG, load_yaml
+
+        # The following downloads the MMMU dataset from HuggingFace and uses the API from the MMMU github repo to run MMMU evaluation.
+        all_mmmu_datasets = []
+
+        # Use .get() so an unset variable reaches the assert below and its
+        # explanatory message instead of failing earlier with a bare KeyError.
+        hf_datasets_cache = os.environ.get("HF_DATASETS_CACHE", "")
+        assert hf_datasets_cache != "", "Please set the environment variable HF_DATASETS_CACHE."
+
+        for subject in CAT_SHORT2LONG.values():
+            # Use a local copy of the dataset if exists (can be faster) or the HF one.
+            if os.path.exists(input_image_path):
+                subject_dataset = datasets.load_dataset(
+                    os.path.join(input_image_path, subject),
+                    split=datasets.Split.VALIDATION,
+                    cache_dir=hf_datasets_cache,
+                    verification_mode="no_checks",
+                )
+            else:
+                subject_dataset = datasets.load_dataset(
+                    "MMMU/MMMU",
+                    subject,
+                    split=datasets.Split.VALIDATION,
+                    cache_dir=hf_datasets_cache,
+                )
+
+            all_mmmu_datasets.append(subject_dataset)
+
+        dataset = datasets.concatenate_datasets(all_mmmu_datasets)
+
+        # Materialize as a plain list, keeping only samples whose id starts with "val".
+        dataset = [s for s in dataset if s['id'].startswith("val")]
+
+        # Optionally, process only a subset of the input files.
+        if num_partitions > 0:
+            lb, ub = _get_partition_bounds(
+                len(dataset), num_samples_per_partition, num_partitions, partition_id
+            )
+            dataset = dataset[lb:ub]
+
+        # Using the LLaVA config from the MMMU repo.
+        config = load_yaml("examples/multimodal/MMMU/mmmu/configs/llava1.5.yaml")
+        # Single-element list values are unwrapped to their only element.
+        for k, v in config.items():
+            if isinstance(v, list):
+                assert len(v) == 1, "only one value supported."
+                config[k] = v[0]
+
+        self._config = config
+
+        self._dataset = dataset
+
+        self._img_h = img_h
+        self._img_w = img_w
+        self._use_tiling = use_tiling
+        self._max_num_tiles = max_num_tiles
+        self._use_thumbnail = use_thumbnail
+        self._single_image = single_image
+        self._vision_model_type = vision_model_type
+
+    def __len__(self):
+        return len(self._dataset)
+
+    def __getitem__(self, idx):
+        from MMMU.mmmu.utils.data_utils import construct_prompt, process_single_sample
+
+        sample = self._dataset[idx]
+
+        # Use the single image approach from the MMMU repo.
+        if self._single_image:
+            sample = process_single_sample(sample)
+            sample = construct_prompt(sample, self._config)
+
+            img = sample["image"]
+            sample_imgs = get_visual_transform(
+                img,
+                self._img_h,
+                self._img_w,
+                self._use_tiling,
+                self._max_num_tiles,
+                self._use_thumbnail,
+                augment=False,
+                vision_model_type=self._vision_model_type,
+            )
+            sample_num_tiles = [len(sample_imgs)]
+        else:
+            sample = construct_prompt(sample, self._config)
+
+            sample_imgs = []
+            sample_num_tiles = []
+
+            # Image tags are looked up in prompt order, e.g. "<image 1>".
+            img_indices = re.findall(r"<image (\d+)", sample["final_input_prompt"])
+            # If there are multiple input images, we need to avoid the number of image embeddings getting too large.
+            adjusted_max_num_tiles = max(1, self._max_num_tiles // len(img_indices))
+
+            for img_idx in img_indices:
+                img_key = f"image_{img_idx}"
+                img_str = f"<image {img_idx}>"
+
+                img = sample[img_key]
+                assert img is not None, f"{img_str} is in prompt but not in sample images"
+
+                # Note: Only replace the current image tag.
+                sample["final_input_prompt"] = sample["final_input_prompt"].replace(
+                    img_str, "<image>", 1
+                )
+
+                imgs = get_visual_transform(
+                    img,
+                    self._img_h,
+                    self._img_w,
+                    self._use_tiling,
+                    adjusted_max_num_tiles,
+                    self._use_thumbnail,
+                    augment=False,
+                    vision_model_type=self._vision_model_type,
+                )  # List of tiles.
+
+                sample_imgs.extend(imgs)
+                sample_num_tiles.append(len(imgs))
+
+            # Sanity check.
+            for i in range(1, 8):
+                assert (
+                    f"<image {i}>" not in sample["final_input_prompt"]
+                ), "prompt contains unhandled image tags"
+
+        # MMMU specific metadata.
+        metadata = {"question_type": sample["question_type"]}
+        if sample["question_type"] == "multiple-choice":
+            metadata["index2ans"] = sample["index2ans"]
+            metadata["all_choices"] = sample["all_choices"]
+
+        prompt = sample['final_input_prompt']
+        if self._single_image:
+            # Drop all numbered image tags and prepend a single generic one.
+            for i in range(8):
+                prompt = prompt.replace(f"<image {i}>", "")
+            prompt = f"<image>\n{prompt}"
+
+        tile_count = torch.tensor(sample_num_tiles, dtype=torch.int)
+
+        return (
+            torch.stack(sample_imgs),
+            tile_count,
+            sample["id"],
+            prompt,
+            sample["answer"],
+            metadata,
+        )
+
+
+class VideoMMMEDataset(torch.utils.data.Dataset):
+    """Video MME evaluation dataset.
+
+    Ground-truth entries are kept only when their video file
+    (`<input_image_path>/<YouTube id>.mp4`) exists; the remaining entries are
+    sorted by video path before optional partitioning.
+    """
+
+    def __init__(
+        self,
+        input_image_path,
+        gt_path,
+        num_samples_per_partition,
+        num_partitions,
+        partition_id,
+        img_h,
+        img_w,
+        use_tiling,
+        max_num_tiles,
+        use_thumbnail,
+        num_frames,
+        vision_model_type,
+    ):
+        # Close the annotation file deterministically instead of leaking the handle.
+        with open(gt_path) as gt_file:
+            ground_truth_original = json.load(gt_file)
+        ground_truth = []
+        for gt in ground_truth_original:
+            # Reduce the YouTube URL to the bare video id, which is the file name.
+            video_path = gt["url"]
+            video_path = video_path.replace("https://www.youtube.com/watch?v=", "")
+            video_path = video_path.replace("https://m.youtube.com/watch?v=", "")
+            video_path = os.path.join(input_image_path, video_path + ".mp4")
+            if not os.path.exists(video_path):
+                continue
+            gt["video_path"] = video_path
+            ground_truth.append(gt)
+
+        ground_truth = sorted(ground_truth, key=lambda gt: gt["video_path"])
+        print_rank_0(f"Found {len(ground_truth)} videos to process.")
+
+        if num_partitions > 0:
+            start_idx, end_idx = _get_partition_bounds(
+                len(ground_truth), num_samples_per_partition, num_partitions, partition_id
+            )
+            ground_truth = ground_truth[start_idx:end_idx]
+
+        self._ground_truth = ground_truth
+        self._img_h = img_h
+        self._img_w = img_w
+        self._use_tiling = use_tiling
+        self._max_num_tiles = max_num_tiles
+        self._use_thumbnail = use_thumbnail
+        self._num_frames = num_frames
+        self._vision_model_type = vision_model_type
+
+    def __len__(self):
+        return len(self._ground_truth)
+
+    def __getitem__(self, idx):
+        from torchvision.io import read_video
+
+        gt = self._ground_truth[idx]
+
+        video, _, _ = read_video(gt["video_path"], start_pts=0, end_pts=None, pts_unit='sec')
+        video = video.numpy()
+        # Pick `num_frames` frame indices evenly spaced over the whole video.
+        selected_frames = torch.linspace(0, video.shape[0] - 1, self._num_frames).long()
+        video_frames = video[selected_frames]
+        if self._num_frames == 1:
+            video_frames = video_frames[None]
+
+        # Transform every frame and flatten the per-frame tile lists into one list.
+        imgs = list(
+            itertools.chain.from_iterable(
+                get_visual_transform(
+                    img,
+                    self._img_h,
+                    self._img_w,
+                    self._use_tiling,
+                    self._max_num_tiles,
+                    self._use_thumbnail,
+                    augment=False,
+                    vision_model_type=self._vision_model_type,
+                )
+                for img in video_frames
+            )
+        )
+
+        # NOTE(review): `question_dict` is overwritten on every iteration, so only
+        # the last question of the video is returned - confirm this is intended.
+        for question in gt["questions"]:
+            # Very hacky, but we essentially re-create gt holding only the
+            # question of interest. This is to make this generation script
+            # compatible with the Video MME evaluation script.
+            question_dict = {
+                "video_id": gt["video_id"],
+                "duration_category": gt["duration_category"],
+                "video_category": gt["video_category"],
+                "video_subcategory": gt["video_subcategory"],
+                "url": gt["url"],
+                "questions": [question],
+            }
+
+        num_tiles = torch.tensor([len(imgs)], dtype=torch.int)
+
+        answer = ""
+        metadata = ""
+
+        return (
+            torch.stack(imgs),
+            num_tiles,
+            question["question_id"],
+            question_dict,
+            answer,
+            metadata,
+        )
+
+
+class OCRBenchDataset(torch.utils.data.Dataset):
+    """OCRBench evaluation dataset.
+
+    Reads a JSON list of ground-truth entries and serves samples as
+    `(tiles, tile_count, idx, question, answers, metadata)`, where `metadata`
+    carries the entry's "dataset_name" and "type" fields.
+    """
+
+    def __init__(
+        self,
+        input_image_path,
+        gt_path,
+        num_samples_per_partition,
+        num_partitions,
+        partition_id,
+        img_h,
+        img_w,
+        use_tiling,
+        max_num_tiles,
+        use_thumbnail,
+        vision_model_type,
+    ):
+        # Close the annotation file deterministically instead of leaking the handle.
+        with open(gt_path, encoding='utf-8') as gt_file:
+            gt = json.load(gt_file)
+
+        # Optionally, process only a subset of the entries.
+        if num_partitions > 0:
+            start_idx, end_idx = _get_partition_bounds(
+                len(gt), num_samples_per_partition, num_partitions, partition_id
+            )
+            gt = gt[start_idx:end_idx]
+
+        self._input_image_path = input_image_path
+        self._gt = gt
+        self._img_h = img_h
+        self._img_w = img_w
+        self._use_tiling = use_tiling
+        self._max_num_tiles = max_num_tiles
+        self._use_thumbnail = use_thumbnail
+        self._vision_model_type = vision_model_type
+
+    def __len__(self):
+        return len(self._gt)
+
+    def __getitem__(self, idx):
+        img_path = os.path.join(self._input_image_path, self._gt[idx]['image_path'])
+
+        img = Image.open(img_path)
+        imgs = get_visual_transform(
+            img,
+            self._img_h,
+            self._img_w,
+            self._use_tiling,
+            self._max_num_tiles,
+            self._use_thumbnail,
+            augment=False,
+            vision_model_type=self._vision_model_type,
+        )
+
+        tile_count = torch.tensor([len(imgs)], dtype=torch.int)
+
+        metadata = {
+            "dataset_name": self._gt[idx]["dataset_name"],
+            "data_type": self._gt[idx]["type"],
+        }
+
+        # The positional index within this (possibly partitioned) list is the sample id.
+        return (
+            torch.stack(imgs),
+            tile_count,
+            idx,
+            self._gt[idx]["question"],
+            self._gt[idx]["answers"],
+            metadata,
+        )
+
+
+class MathVistaDataset(torch.utils.data.Dataset):
+    """MathVista evaluation dataset.
+
+    Loads the dataset from `input_image_path` when that path exists, otherwise
+    the "testmini" split of "AI4Math/MathVista" from HuggingFace. Samples are
+    served as `(tiles, tile_count, question_id, question, answer, metadata)`.
+    """
+
+    def __init__(
+        self,
+        input_image_path,
+        num_samples_per_partition,
+        num_partitions,
+        partition_id,
+        img_h,
+        img_w,
+        use_tiling,
+        max_num_tiles,
+        use_thumbnail,
+        vision_model_type,
+    ):
+        import datasets
+
+        # Use .get() so an unset variable reaches the assert below and its
+        # explanatory message instead of failing earlier with a bare KeyError.
+        hf_datasets_cache = os.environ.get("HF_DATASETS_CACHE", "")
+        assert hf_datasets_cache != "", "Please set the environment variable HF_DATASETS_CACHE."
+
+        # NOTE(review): the local branch passes no `split`, unlike the hub branch -
+        # confirm that both return an object with the same column layout.
+        if os.path.exists(input_image_path):
+            dataset = datasets.load_dataset(
+                input_image_path, cache_dir=hf_datasets_cache, verification_mode="no_checks"
+            )
+        else:
+            dataset = datasets.load_dataset(
+                "AI4Math/MathVista", split="testmini", cache_dir=hf_datasets_cache
+            )
+
+        if num_partitions > 0:
+            start_idx, end_idx = _get_partition_bounds(
+                len(dataset), num_samples_per_partition, num_partitions, partition_id
+            )
+            dataset = dataset[start_idx:end_idx]
+
+        self._dataset = dataset
+        self._img_h = img_h
+        self._img_w = img_w
+        self._use_tiling = use_tiling
+        self._max_num_tiles = max_num_tiles
+        self._use_thumbnail = use_thumbnail
+        self._vision_model_type = vision_model_type
+
+    def __len__(self):
+        # The dataset is accessed column-wise, so the length is that of the "pid" column.
+        return len(self._dataset["pid"])
+
+    def __getitem__(self, idx):
+        # Already a PIL object.
+        img = self._dataset['decoded_image'][idx]
+
+        imgs = get_visual_transform(
+            img,
+            self._img_h,
+            self._img_w,
+            self._use_tiling,
+            self._max_num_tiles,
+            self._use_thumbnail,
+            augment=False,
+            vision_model_type=self._vision_model_type,
+        )
+
+        tile_count = torch.tensor([len(imgs)], dtype=torch.int)
+
+        question_id = self._dataset["pid"][idx]
+        question = self._dataset["question"][idx]
+        question_type = self._dataset["question_type"][idx]  # free_form or multi_choice
+        query = self._dataset["query"][idx]
+        choices = self._dataset["choices"][idx]
+        answer = self._dataset["answer"][idx]
+
+        if question_type == 'multi_choice':
+            # Label the choices "A", "B", ... and append them to the question.
+            start_chr = 'A'
+            choices_str = ''
+            index2ans = {}
+            all_choices = []
+            for choice in choices:
+                all_choices.append(start_chr)
+                index2ans[start_chr] = choice
+                choices_str += f"{start_chr}. {choice}\n"
+                start_chr = chr(ord(start_chr) + 1)
+
+            question = question + '\n' + choices_str
+            question = question + "Answer with the option's letter from the given choices directly."
+            # The expected answer becomes the letter of the matching choice.
+            answer = chr(ord('A') + choices.index(answer))
+        else:
+            question = query.replace("Hint: ", "")
+            index2ans = {}
+            all_choices = []
+
+        metadata = {
+            "question_type": question_type,
+            "index2ans": index2ans,
+            "all_choices": all_choices,
+        }
+
+        return torch.stack(imgs), tile_count, question_id, question, answer, metadata
+
+
+class AI2DDataset(torch.utils.data.Dataset):
+    """AI2D evaluation dataset.
+
+    Reads a JSON-lines ground-truth file and serves samples as
+    `(tiles, tile_count, question_id, question, answer, metadata)`. With
+    `no_mask` set, "AI2D_TEST" in the image path is replaced by
+    "AI2D_TEST_NO_MASK_IMAGES".
+    """
+
+    def __init__(
+        self,
+        input_image_path,
+        gt_path,
+        num_samples_per_partition,
+        num_partitions,
+        partition_id,
+        img_h,
+        img_w,
+        use_tiling,
+        max_num_tiles,
+        use_thumbnail,
+        no_mask,
+        vision_model_type,
+    ):
+        # One JSON document per line; blank lines (e.g. a trailing newline) are skipped.
+        with open(gt_path, 'r') as f:
+            gt = [json.loads(json_str) for json_str in f if json_str.strip()]
+
+        if num_partitions > 0:
+            start_idx, end_idx = _get_partition_bounds(
+                len(gt), num_samples_per_partition, num_partitions, partition_id
+            )
+            gt = gt[start_idx:end_idx]
+
+        self._gt = gt
+        self._input_image_path = input_image_path
+        self._img_h = img_h
+        self._img_w = img_w
+        self._use_tiling = use_tiling
+        self._max_num_tiles = max_num_tiles
+        self._use_thumbnail = use_thumbnail
+        self._no_mask = no_mask
+        self._vision_model_type = vision_model_type
+
+    def __len__(self):
+        return len(self._gt)
+
+    def __getitem__(self, idx):
+        img_path = os.path.join(self._input_image_path, self._gt[idx]['image'])
+        if self._no_mask:
+            # str.replace returns a new string, so the result must be assigned back;
+            # otherwise the no-mask image directory is never used.
+            img_path = img_path.replace("AI2D_TEST", "AI2D_TEST_NO_MASK_IMAGES")
+
+        img = Image.open(img_path)
+        imgs = get_visual_transform(
+            img,
+            self._img_h,
+            self._img_w,
+            self._use_tiling,
+            self._max_num_tiles,
+            self._use_thumbnail,
+            augment=False,
+            vision_model_type=self._vision_model_type,
+        )
+
+        tile_count = torch.tensor([len(imgs)], dtype=torch.int)
+
+        metadata = ""  # Not used.
+
+        return (
+            torch.stack(imgs),
+            tile_count,
+            self._gt[idx]["question_id"],
+            self._gt[idx]["question"],
+            self._gt[idx]["answer"],
+            metadata,
+        )
+
+
+def get_evaluation_dataset(
+    task,
+    input_image_path,
+    gt_path,
+    img_h,
+    img_w,
+    use_tiling,
+    max_num_tiles,
+    use_thumbnail,
+    num_samples_per_partition,
+    num_partitions,
+    partition_id,
+    num_frames,
+    vision_model_type,
+):
+    """Build the evaluation dataset that corresponds to `task`.
+
+    Supported tasks are "TextVQA", "VQAv2", "ChartQA", "captioning", "MMMU",
+    "VideoMME", "OCRBench", "MathVista" and "AI2D". `gt_path` is not passed to
+    the MMMU and MathVista datasets, and `num_frames` is only passed to VideoMME.
+
+    Raises:
+        NotImplementedError: If `task` is not one of the supported names.
+    """
+    # Argument groups that every dataset constructor takes in this order.
+    partition_args = (num_samples_per_partition, num_partitions, partition_id)
+    tiling_args = (img_h, img_w, use_tiling, max_num_tiles, use_thumbnail)
+
+    # The three VQA-style tasks differ only in how their annotation fields are named.
+    if task == "TextVQA":
+        vqa_keys = {
+            "image_id": "image_id",
+            "sample_id": "question_id",
+            "question": "question",
+            "answer": "answers",
+        }
+    elif task == "VQAv2":
+        vqa_keys = {
+            "image_id": "image",
+            "sample_id": "question_id",
+            "question": "question",
+            "answer": "answer",
+        }
+    elif task == "ChartQA":
+        vqa_keys = {"image_id": "imgname", "question": "query", "answer": "label"}
+    else:
+        vqa_keys = None
+
+    if vqa_keys is not None:
+        return VQADataset(
+            input_image_path, gt_path, *partition_args, vqa_keys, *tiling_args, vision_model_type
+        )
+
+    if task == "captioning":
+        return CaptioningDataset(
+            input_image_path, gt_path, *partition_args, *tiling_args, vision_model_type
+        )
+
+    if task == 'MMMU':
+        # Note: single_image=True uses only one image like in the MMMU repo example.
+        # single_image=False uses all images in the sample.
+        return MMMUDataset(
+            input_image_path,
+            *partition_args,
+            *tiling_args,
+            single_image=True,
+            vision_model_type=vision_model_type,
+        )
+
+    if task == "VideoMME":
+        return VideoMMMEDataset(
+            input_image_path,
+            gt_path,
+            *partition_args,
+            *tiling_args,
+            num_frames,
+            vision_model_type,
+        )
+
+    if task == "OCRBench":
+        return OCRBenchDataset(
+            input_image_path, gt_path, *partition_args, *tiling_args, vision_model_type
+        )
+
+    if task == "MathVista":
+        return MathVistaDataset(
+            input_image_path, *partition_args, *tiling_args, vision_model_type
+        )
+
+    if task == "AI2D":
+        return AI2DDataset(
+            input_image_path,
+            gt_path,
+            *partition_args,
+            *tiling_args,
+            no_mask=False,
+            vision_model_type=vision_model_type,
+        )
+
+    raise NotImplementedError(f"unsupported task {task}")
--- a/examples/multimodal/image_processing.py
+++ b/examples/multimodal/image_processing.py
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved. Except portions as noted which are Copyright (c) 2023 OpenGVLab and licensed under the MIT license found in LICENSE.
+from torchvision import transforms as T
+from torchvision.transforms import Compose
+from torchvision.transforms.functional import InterpolationMode
+
+
+# Per-channel normalization statistics (mean and std, three values each) for the
+# supported vision encoders. They are handed to T.Normalize in build_transform.
+# NOTE(review): presumably RGB channel order on pixel values scaled to [0, 1] — confirm.
+IMAGENET_PIXEL_MEAN = [0.485, 0.456, 0.406]
+IMAGENET_PIXEL_STD = [0.229, 0.224, 0.225]
+SIGLIP_PIXEL_MEAN = [0.5, 0.5, 0.5]
+SIGLIP_PIXEL_STD = [0.5, 0.5, 0.5]
+CLIP_PIXEL_MEAN = [0.48145466, 0.4578275, 0.40821073]
+CLIP_PIXEL_STD = [0.26862954, 0.26130258, 0.27577711]
+
+
+# Maps a vision model type name to its (mean, std) pair. get_visual_transform indexes
+# this dict directly, so an unknown model type raises KeyError there.
+# Note that "internvit" reuses the ImageNet statistics.
+pixel_statistics = {
+    "clip": (CLIP_PIXEL_MEAN, CLIP_PIXEL_STD),
+    "siglip": (SIGLIP_PIXEL_MEAN, SIGLIP_PIXEL_STD),
+    "internvit": (IMAGENET_PIXEL_MEAN, IMAGENET_PIXEL_STD),
+}
+
+
+def get_visual_transform(img, img_h, img_w, use_tiling=False, max_num_tiles=1, use_thumbnail=False, augment=False, vision_model_type="clip"):
+    """Preprocess one image into a list of transformed images for the vision model.
+
+    Args:
+        img: Input image; it is handed to dynamic_preprocess and/or the transform built
+            by build_transform.
+        img_h: Target height. It is also used as the square tile size.
+        img_w: Target width. It is only checked against img_h when tiling is enabled.
+        use_tiling: Split the image into tiles with dynamic_preprocess.
+        max_num_tiles: Upper bound on the number of tiles when tiling.
+        use_thumbnail: Forwarded to dynamic_preprocess when tiling.
+        augment: Must be False; augmentation is not implemented.
+        vision_model_type: Key into pixel_statistics; also selects the transform recipe.
+
+    Returns:
+        A list of transformed images: one entry per image returned by dynamic_preprocess
+        when tiling, otherwise a single-element list.
+    """
+    pixel_mean, pixel_std = pixel_statistics[vision_model_type]
+
+    assert not augment, "Image augmentation not implemented."
+    transform = build_transform(img_h, pixel_mean, pixel_std, vision_model_type)
+
+    # Without tiling the whole image is transformed as-is.
+    if not use_tiling:
+        return [transform(img)]
+
+    assert img_h == img_w, "dynamic tiling expects equal tile height and width"
+    tiles = dynamic_preprocess(
+        img, min_num=1, max_num=max_num_tiles, image_size=img_h, use_thumbnail=use_thumbnail
+    )
+    return [transform(tile) for tile in tiles]
+
+
+# From https://github.com/OpenGVLab/InternVL/blob/c62fa4f7c850165d7386bdc48ac6bc5a6fab0864/internvl_chat/internvl/train/dataset.py#L685
+# Copyright (c) 2023 OpenGVLab.
+def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
+    """Pick the candidate grid whose aspect ratio is closest to the image's.
+
+    Args:
+        aspect_ratio: Aspect ratio of the source image.
+        target_ratios: Iterable of pairs; each candidate's ratio is pair[0] / pair[1].
+        width: Source image width, used only for the tie-break area test.
+        height: Source image height, used only for the tie-break area test.
+        image_size: Side length of one tile.
+
+    Returns:
+        The best candidate pair, or (1, 1) when target_ratios is empty.
+    """
+    image_area = width * height
+    half_tile_area = 0.5 * image_size * image_size
+
+    chosen = (1, 1)
+    smallest_gap = float('inf')
+    for candidate in target_ratios:
+        gap = abs(aspect_ratio - candidate[0] / candidate[1])
+        if gap < smallest_gap:
+            smallest_gap, chosen = gap, candidate
+            continue
+        # On an exact tie, switch to the later candidate only when the image area exceeds
+        # half of the area that candidate's tile grid would cover.
+        is_tie = gap == smallest_gap
+        if is_tie and image_area > half_tile_area * candidate[0] * candidate[1]:
+            chosen = candidate
+    return chosen
+
+
+# From https://github.com/OpenGVLab/InternVL/blob/c62fa4f7c850165d7386bdc48ac6bc5a6fab0864/internvl_chat/internvl/train/dataset.py#L702
+# Copyright (c) 2023 OpenGVLab.
+def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False):
+    """Resize an image to the best-fitting tile grid and cut it into square tiles.
+
+    Args:
+        image: Image object exposing .size (width, height), .resize() and .crop().
+        min_num: Minimum number of tiles in a candidate grid.
+        max_num: Maximum number of tiles in a candidate grid.
+        image_size: Side length of each square tile.
+        use_thumbnail: When True and more than one tile was produced, append the whole
+            image resized to (image_size, image_size) as a final entry.
+
+    Returns:
+        List of tiles in row-major order, optionally followed by the thumbnail.
+    """
+    orig_width, orig_height = image.size
+    aspect_ratio = orig_width / orig_height
+
+    # Every (columns, rows) grid whose tile count lies in [min_num, max_num],
+    # ordered by ascending tile count.
+    candidate_grids = {
+        (i, j)
+        for n in range(min_num, max_num + 1)
+        for i in range(1, n + 1)
+        for j in range(1, n + 1)
+        if i * j <= max_num and i * j >= min_num
+    }
+    candidate_grids = sorted(candidate_grids, key=lambda grid: grid[0] * grid[1])
+
+    # Grid whose aspect ratio best matches the source image.
+    best_grid = find_closest_aspect_ratio(
+        aspect_ratio, candidate_grids, orig_width, orig_height, image_size)
+
+    # Pixel dimensions of the resized image and the number of tiles it holds.
+    target_width = image_size * best_grid[0]
+    target_height = image_size * best_grid[1]
+    blocks = best_grid[0] * best_grid[1]
+
+    resized_img = image.resize((target_width, target_height))
+    tiles_per_row = target_width // image_size
+
+    processed_images = []
+    for tile_idx in range(blocks):
+        col = tile_idx % tiles_per_row
+        row = tile_idx // tiles_per_row
+        # Crop box is (left, upper, right, lower) for this tile.
+        box = (
+            col * image_size,
+            row * image_size,
+            (col + 1) * image_size,
+            (row + 1) * image_size
+        )
+        processed_images.append(resized_img.crop(box))
+    assert len(processed_images) == blocks
+
+    if use_thumbnail and len(processed_images) != 1:
+        processed_images.append(image.resize((image_size, image_size)))
+    return processed_images
+
+
+# Based on https://github.com/openai/CLIP/blob/dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1/clip/clip.py#L79
+# and https://github.com/OpenGVLab/InternVL/blob/aa521e6eb1df4cf153aa4118fcf13e673c055d46/internvl_chat/internvl/train/dataset.py#L276
+def build_transform(input_size, pixel_mean, pixel_std, vision_model_type):
+    """Build the torchvision preprocessing pipeline for a vision model type.
+
+    Both recipes use the same four steps (RGB conversion, bicubic resize to a square of
+    side input_size, ToTensor, Normalize). They differ only in whether the RGB
+    conversion runs before the resize ("siglip", "internvit") or after it ("clip").
+
+    Args:
+        input_size: Side length of the square output image.
+        pixel_mean: Per-channel mean passed to T.Normalize.
+        pixel_std: Per-channel std passed to T.Normalize.
+        vision_model_type: One of "siglip", "internvit" or "clip".
+
+    Returns:
+        The composed transform.
+
+    Raises:
+        NotImplementedError: For any other vision_model_type.
+    """
+    if vision_model_type in ("siglip", "internvit"):
+        rgb_before_resize = True
+    elif vision_model_type == "clip":
+        rgb_before_resize = False
+    else:
+        raise NotImplementedError(f"image processing not defined for vision model {vision_model_type}")
+
+    def _ensure_rgb(img):
+        return img if img.mode == 'RGB' else img.convert('RGB')
+
+    to_rgb = T.Lambda(_ensure_rgb)
+    resize = T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC)
+    head = [to_rgb, resize] if rgb_before_resize else [resize, to_rgb]
+    tail = [T.ToTensor(), T.Normalize(mean=pixel_mean, std=pixel_std)]
+
+    return T.Compose(head + tail)
--- a/examples/multimodal/layer_specs.py
+++ b/examples/multimodal/layer_specs.py
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+import torch
+
+from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
+from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
+from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
+from megatron.core.transformer.dot_product_attention import DotProductAttention
+from megatron.core.transformer.enums import AttnMaskType
+from megatron.core.transformer.identity_op import IdentityOp
+from megatron.core.transformer.mlp import MLP, MLPSubmodules
+from megatron.core.transformer.spec_utils import ModuleSpec
+from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules
+
+try:
+    from megatron.core.extensions.transformer_engine import (
+        TEColumnParallelLinear,
+        TEDotProductAttention,
+        TELayerNormColumnParallelLinear,
+        TENorm,
+        TERowParallelLinear,
+    )
+
+    HAVE_TE = True
+except ImportError:
+    HAVE_TE = False
+
+# `warnings` is imported unconditionally because get_layer_spec calls warnings.warn
+# on a path that is reachable whether or not Apex is installed.
+import warnings
+
+# Select the LayerNorm implementation: Apex-backed FusedLayerNorm when Apex can be
+# imported, otherwise the plain torch wrapper. HAVE_APEX is defined on both paths so
+# that later code can test it without risking a NameError.
+try:
+    import apex  # imported only to probe for availability
+
+    from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
+    from megatron.core.transformer.torch_norm import WrappedTorchNorm
+
+    HAVE_APEX = True
+    LNImpl = FusedLayerNorm
+except ImportError:
+    from megatron.core.transformer.torch_norm import WrappedTorchNorm
+
+    warnings.warn('Apex is not installed. Falling back to Torch Norm')
+    HAVE_APEX = False
+    LNImpl = WrappedTorchNorm
+
+
+def get_layer_spec(is_vit, normalization) -> ModuleSpec:
+    """Build a transformer layer spec from non-Transformer-Engine linear/attention modules.
+
+    Args:
+        is_vit: True for the vision transformer (no attention mask); False selects a
+            causal attention mask.
+        normalization: "LayerNorm" or "RMSNorm".
+
+    Returns:
+        ModuleSpec describing a TransformerLayer with the chosen norm before attention
+        and before the MLP.
+
+    Raises:
+        RuntimeError: If normalization is neither "LayerNorm" nor "RMSNorm".
+        AssertionError: If RMSNorm is requested without Transformer Engine on torch < 2.4.
+    """
+    attn_mask_type = AttnMaskType.no_mask if is_vit else AttnMaskType.causal
+    if normalization == "LayerNorm":
+        norm = LNImpl
+    elif normalization == "RMSNorm":
+        if HAVE_TE:
+            norm = TENorm
+        else:
+            # Imported locally: at module level `warnings` is only imported on the
+            # no-Apex fallback path, so it may not be in scope here.
+            import warnings
+
+            # Only the major and minor components are parsed and compared.
+            torch_version = torch.__version__.split('.')
+            version_geq_2_4 = (int(torch_version[0]), int(torch_version[1])) >= (2, 4)
+            assert version_geq_2_4, "Torch version >= 2.4.0 is required for RMSNorm"
+            if HAVE_APEX:
+                warnings.warn('Apex does not support RMSNorm. Falling back to Torch Norm')
+            norm = WrappedTorchNorm
+    else:
+        raise RuntimeError("unknown normalization", normalization)
+
+    mlp = get_mlp_module_spec(use_te=False)  # doesn't include norm.
+
+    return ModuleSpec(
+        module=TransformerLayer,
+        submodules=TransformerLayerSubmodules(
+            input_layernorm=norm,
+            self_attention=ModuleSpec(
+                module=SelfAttention,
+                params={"attn_mask_type": attn_mask_type},
+                submodules=SelfAttentionSubmodules(
+                    linear_qkv=ColumnParallelLinear,
+                    core_attention=DotProductAttention,
+                    linear_proj=RowParallelLinear,
+                    q_layernorm=IdentityOp,
+                    k_layernorm=IdentityOp,
+                ),
+            ),
+            self_attn_bda=get_bias_dropout_add,
+            pre_mlp_layernorm=norm,
+            mlp=mlp,
+            mlp_bda=get_bias_dropout_add,
+        ),
+    )
+
+
+def get_layer_spec_te(is_vit=False) -> ModuleSpec:
+    """Build a transformer layer spec from Transformer Engine modules.
+
+    No input layernorm is set and the pre-MLP layernorm is IdentityOp.
+    NOTE(review): presumably the TELayerNormColumnParallelLinear modules apply the
+    norm themselves — confirm.
+
+    Args:
+        is_vit: True for the vision transformer (no attention mask); False selects a
+            causal attention mask.
+
+    Returns:
+        ModuleSpec describing the TransformerLayer.
+    """
+    if is_vit:
+        attn_mask_type = AttnMaskType.no_mask
+    else:
+        attn_mask_type = AttnMaskType.causal
+
+    mlp = get_norm_mlp_module_spec_te()
+
+    attention_submodules = SelfAttentionSubmodules(
+        linear_qkv=TELayerNormColumnParallelLinear,
+        core_attention=TEDotProductAttention,
+        linear_proj=TERowParallelLinear,
+        q_layernorm=IdentityOp,
+        k_layernorm=IdentityOp,
+    )
+    self_attention = ModuleSpec(
+        module=SelfAttention,
+        params={"attn_mask_type": attn_mask_type},
+        submodules=attention_submodules,
+    )
+    layer_submodules = TransformerLayerSubmodules(
+        self_attention=self_attention,
+        self_attn_bda=get_bias_dropout_add,
+        pre_mlp_layernorm=IdentityOp,
+        mlp=mlp,
+        mlp_bda=get_bias_dropout_add,
+    )
+    return ModuleSpec(module=TransformerLayer, submodules=layer_submodules)
+
+
+def get_mlp_module_spec(use_te: bool = True) -> ModuleSpec:
+    """Return the spec for a dense MLP without a norm.
+
+    Args:
+        use_te: Use the Transformer Engine linear layers when True, otherwise the
+            plain column/row parallel linear layers.
+
+    Returns:
+        ModuleSpec for an MLP with the selected fc1/fc2 linear modules.
+    """
+    # The TE names are only touched when requested, so this still works with
+    # use_te=False when Transformer Engine failed to import.
+    if use_te:
+        fc1, fc2 = TEColumnParallelLinear, TERowParallelLinear
+    else:
+        fc1, fc2 = ColumnParallelLinear, RowParallelLinear
+
+    submodules = MLPSubmodules(linear_fc1=fc1, linear_fc2=fc2)
+    return ModuleSpec(module=MLP, submodules=submodules)
+
+
+def get_norm_mlp_module_spec_te() -> ModuleSpec:
+    """Return the Transformer Engine MLP spec whose first linear layer is
+    TELayerNormColumnParallelLinear and whose second is TERowParallelLinear."""
+    submodules = MLPSubmodules(
+        linear_fc1=TELayerNormColumnParallelLinear,
+        linear_fc2=TERowParallelLinear,
+    )
+    return ModuleSpec(module=MLP, submodules=submodules)
--- a/examples/multimodal/manual_prompts.json
+++ b/examples/multimodal/manual_prompts.json
+{
+    "COMMENT": "Sources for these prompts include https://huggingface.co/datasets/liuhaotian/LLaVA-Pretrain/viewer and https://huggingface.co/datasets/HuggingFaceM4/M3IT",
+    "Captioning": {
+        "raw": [
+            "Can you briefly explain what you see in the image?",
+            "Describe what's happening in this image in one short sentence.",
+            "Write a short caption that accurately represents the content of this image.",
+            "Please generate a descriptive caption for the image provided.",
+            "How would you summarize the scene depicted in the picture in short?",
+            "Describe the image briefly.",
+            "Write a succinct description of the image, capturing its main components, the relationships between them, and any notable details.",
+            "Create a concise caption that accurately describes the main elements in the image provided.",
+            "Write a brief, yet comprehensive, description of the image.",
+            "Describe the image in a clear and concise manner.",
+            "For the given image, provide a one-sentence summary that captures the most important details.",
+            "Generate a short caption for the picture.",
+            "Write a short and informative description that highlights the primary subjects and actions occurring in the given image.",
+            "Provide a concise and informative caption for the image, focusing on the primary subjects.",
+            "Write a clear description of the image, make sure the key features are well covered.",
+            "Offer a succinct explanation of the picture presented."
+        ]
+    },
+    "CaptioningPretraining": {
+        "raw": [
+            "Generate a short caption of the image.",
+            "Describe the image concisely.",
+            "Provide a brief description of the given image."
+        ],
+        "llava": [
+            "Give a brief description of image.",
+            "Give a brief description of the image.",
+            "Provide a brief description of the given image.",
+            "Provide a one-sentence caption for the provided image.",
+            "Write a terse but informative summary of the picture.",
+            "Describe the image concisely.",
+            "Generate a clear and concise summary of the photo."
+        ]
+    },
+    "OCR": {
+        "raw": [
+            "Can you read the text from image and output here?",
+            "Extract and document the text from the provided image.",
+            "Converting the text embedded in this image into a readable document.",
+            "Transcribe all the text you find.",
+            "Can you extract all visible text from the image here?"
+        ]
+    }
+}
--- a/examples/multimodal/model.py
+++ b/examples/multimodal/model.py
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+import warnings
+from copy import deepcopy
+
+import torch
+from config import get_language_model_config, get_vision_model_config, get_vision_projection_config
+from layer_specs import get_layer_spec, get_layer_spec_te, get_mlp_module_spec, get_norm_mlp_module_spec_te
+
+from megatron.core.models.multimodal.llava_model import IMAGE_TOKEN, LLaVAModel
+from megatron.core.models.vision.clip_vit_model import get_num_image_embeddings
+from megatron.training import get_args, get_tokenizer, print_rank_0
+from megatron.training.arguments import core_transformer_config_from_args
+
+
+def model_provider(
+    pre_process=True, post_process=True, add_encoder=True, add_decoder=True, parallel_output=True
+) -> LLaVAModel:
+    """Builds the model.
+
+    Besides constructing the model, this function mutates the global args:
+    seq_length and encoder_seq_length are overwritten with the number of image
+    embeddings, and max_position_embeddings is raised to decoder_seq_length when
+    it is smaller.
+
+    Args:
+        pre_process (bool): Include the embedding layer in the gpt decoder (used with pipeline parallelism). Defaults to True.
+        post_process (bool): Include an output layer and a layernorm in the gpt decoder (used with pipeline parallelism). Defaults to True.
+        add_encoder (bool): Construct the encoder module (used with pipeline parallelism). Defaults to True. When we use pipelining, the encoder
+            will live on only a subset of the pipeline stages (specifically, only the first stage).
+        add_decoder (bool): Construct the decoder module (used with pipeline parallelism). Defaults to True. When we use pipelining, the decoder
+            will live on only a subset of the pipeline stages (specifically, every stage after the first one).
+        parallel_output (bool): Enable parallel model output.
+
+    Returns:
+        model: A multimodal model.
+
+    Raises:
+        AssertionError: If the checkpoint format is not 'torch', the encoder pipeline
+            parallel size is greater than 1, --decoder-seq-length is missing, or it is
+            not greater than the maximum number of image embeddings.
+        RuntimeError: If args.vision_model_type is not "clip", "siglip" or "internvit".
+    """
+    args = get_args()
+    assert args.ckpt_format == 'torch', "Only ckpt-format torch is supported for VLM training currently."
+    assert args.encoder_pipeline_model_parallel_size <= 1, "LLaVA does not support pp>1 for encoder on it's own pipeline rank"
+
+    use_te = args.use_te
+
+    print_rank_0('building a multimodal model ...')
+
+    # NOTE(review): the positional `1` is presumably the tile count, which would make this
+    # the number of embeddings for a single tile — confirm against get_num_image_embeddings.
+    num_image_embeddings = get_num_image_embeddings(
+        args.img_h,
+        args.img_w,
+        args.patch_dim,
+        args.vision_model_type,
+        args.disable_vision_class_token,
+        1,
+        args.pixel_shuffle,
+        args.use_tile_tags,
+    )
+    # Overwrite both sequence-length args with the image embedding count; warn on rank 0
+    # only when the value actually changed.
+    old_seq_length = args.seq_length
+    args.seq_length = args.encoder_seq_length = num_image_embeddings
+    if torch.distributed.get_rank() == 0 and old_seq_length != args.seq_length:
+        warnings.warn(
+            f"Changed seq_length and encoder_seq_length (vision model sequence length) from {old_seq_length} to num_image_tokens ({num_image_embeddings})"
+        )
+
+    # Upper bound over all tiles, with one extra tile counted when a thumbnail is used.
+    max_num_image_embeddings = (args.max_num_tiles + int(args.use_thumbnail)) * num_image_embeddings
+
+    assert (
+        args.decoder_seq_length is not None
+    ), "Please provide --decoder-seq-length to set the language model sequence length"
+    assert (
+        args.decoder_seq_length > max_num_image_embeddings
+    ), "Language model sequence length must be greater than the maximum number of image embeddings"
+    if args.decoder_seq_length > args.max_position_embeddings:
+        args.max_position_embeddings = args.decoder_seq_length
+        warnings.warn(
+            f"Expanded max_position_embeddings to {args.max_position_embeddings} to accommodate the maximum language model sequence length"
+        )
+
+    # The language, vision and projection configs below each start as a deep copy of
+    # this base config, which is built after the args mutations above.
+    base_config = core_transformer_config_from_args(get_args())
+    base_config.language_model_type = args.language_model_type
+    base_config.vision_model_type = args.vision_model_type
+    base_config.calculate_per_token_loss = True
+
+    language_config = deepcopy(base_config)
+    language_config = get_language_model_config(language_config)
+
+    if use_te:
+        language_transformer_layer_spec = get_layer_spec_te(
+            is_vit=False
+        )  # TENorm detects LayerNorm/RMS automatically.
+    else:
+        language_transformer_layer_spec = get_layer_spec(
+            is_vit=False, normalization=language_config.normalization
+        )
+
+    vision_config = deepcopy(base_config)
+    vision_config = get_vision_model_config(
+        vision_config, apply_query_key_layer_scaling=args.apply_query_key_layer_scaling
+    )
+
+    vision_model_type = args.vision_model_type
+    if vision_model_type in ["clip", "siglip"]:
+        if use_te:
+            vision_transformer_layer_spec = get_layer_spec_te(
+                is_vit=True
+            )  # TENorm detects LayerNorm/RMS automatically.
+        else:
+            vision_transformer_layer_spec = get_layer_spec(
+                is_vit=True, normalization=vision_config.normalization
+            )
+    elif vision_model_type == "internvit":
+        # Imported here so the nvlm package is only required when InternViT is selected.
+        from nvlm.internvit import get_internvit_layer_spec
+        vision_transformer_layer_spec = get_internvit_layer_spec(use_te=use_te)
+    else:
+        raise RuntimeError("unsupported vision model type", vision_model_type)
+
+    vision_projection_config = deepcopy(base_config)
+    vision_projection_config = get_vision_projection_config(
+        vision_projection_config, language_config.hidden_size
+    )
+
+    # --encoder-pipeline-model-parallel-size 1 will enable a separate pipeline stage for the vision model.
+    if args.encoder_pipeline_model_parallel_size > 0:
+        assert (
+            args.encoder_pipeline_model_parallel_size == 1
+        ), "vision model and projection can only live on 1 pipeline stage."
+
+        if args.encoder_tensor_model_parallel_size > 0:
+            vision_config.tensor_model_parallel_size = args.encoder_tensor_model_parallel_size
+            vision_projection_config.tensor_model_parallel_size = (
+                args.encoder_tensor_model_parallel_size
+            )
+
+    # Make sure vision model pipeline parallel size is not inherited from the language model pipeline parallel size.
+    # 0 is not a valid value for this config field, hence max(1, ...).
+    vision_config.pipeline_model_parallel_size = max(1, args.encoder_pipeline_model_parallel_size)
+    vision_projection_config.pipeline_model_parallel_size = vision_config.pipeline_model_parallel_size
+
+    # Make sure the vision model does not inherit first and last pipeline num layers from the language model.
+    vision_config.first_pipeline_num_layers = vision_config.last_pipeline_num_layers = None
+
+    # NOTE(review): when the projection config has a normalization set, the TE spec is
+    # used regardless of use_te — confirm this is intended.
+    if vision_projection_config.normalization:
+        vision_projection_layer_spec = get_norm_mlp_module_spec_te().submodules
+    else:
+        vision_projection_layer_spec = get_mlp_module_spec(use_te=use_te).submodules
+
+    tokenizer = get_tokenizer()
+    image_token_index = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)
+
+    # None unless args.use_tile_tags is set.
+    tile_tags = _get_tile_tags(args, tokenizer)
+
+    model = LLaVAModel(
+        language_transformer_config=language_config,
+        language_transformer_layer_spec=language_transformer_layer_spec,
+        language_vocab_size=args.padded_vocab_size,
+        language_max_sequence_length=args.decoder_seq_length,
+        vision_transformer_config=vision_config,
+        vision_transformer_layer_spec=vision_transformer_layer_spec,
+        drop_vision_class_token=args.disable_vision_class_token,
+        vision_projection_config=vision_projection_config,
+        vision_projection_layer_spec=vision_projection_layer_spec,
+        vision_projection_type="mlp",
+        allow_missing_vision_projection_checkpoint=args.allow_missing_vision_projection_checkpoint,
+        parallel_output=parallel_output,
+        language_position_embedding_type=args.position_embedding_type,
+        language_rotary_percent=args.rotary_percent,
+        pre_process=pre_process,
+        post_process=post_process,
+        add_encoder=add_encoder,
+        add_decoder=add_decoder,
+        img_h=args.img_h,
+        img_w=args.img_w,
+        patch_dim=args.patch_dim,
+        language_rotary_base=args.rotary_base,
+        language_rope_scaling=args.use_rope_scaling,
+        image_token_index=image_token_index,
+        pixel_shuffle=args.pixel_shuffle,
+        tile_tags=tile_tags,
+    )
+
+    # The language model and ViT are frozen on request; the vision projection is never
+    # frozen here.
+    model.freeze(
+        freeze_language_model=args.freeze_LM,
+        freeze_vision_model=args.freeze_ViT,
+        freeze_vision_projection=False,
+    )
+
+    return model
+
+
+def _get_tile_tags(args, tokenizer):
+    """Tokenize the text tags that NVLM places around image tiles.
+
+    Args:
+        args: Provides use_tile_tags, tokenizer_prompt_format and max_num_tiles.
+        tokenizer: Provides tokenize() and _prompt_config.has_bos.
+
+    Returns:
+        None when tile tags are disabled; otherwise one token list per tag, for
+        "<tile_1>" .. "<tile_N>" followed by the thumbnail tag.
+    """
+    if not args.use_tile_tags:
+        return None
+
+    # All tags are expected to tokenize to the same length.
+    if args.tokenizer_prompt_format == "nvlm-yi-34b":
+        thumbnail_tag_text = "<tile_global>"
+    else:
+        thumbnail_tag_text = "<tile_global_thumbnail>"
+
+    assert args.max_num_tiles <= 6, "Up to 6 tile tags used"
+    tag_texts = [f"<tile_{tile_no}>" for tile_no in range(1, args.max_num_tiles + 1)]
+    tag_texts.append(thumbnail_tag_text)
+
+    # Skip the first token of each tokenized tag when the prompt config has a BOS token.
+    skip = 1 if tokenizer._prompt_config.has_bos else 0
+
+    # Convert to tokens [num_tiles, tile_seq_len].
+    return [tokenizer.tokenize(text)[skip:] for text in tag_texts]
--- a/examples/multimodal/model_converter/clip_converter.py
+++ b/examples/multimodal/model_converter/clip_converter.py
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+import argparse
+import os
+
+import torch
+
+import clip
+
+
+def convert(download_root, output_path, tensor_parallel_size, use_te):
+    """Convert the OpenAI CLIP ViT-L/14@336px vision weights to megatron checkpoints.
+
+    One state dict per tensor-parallel rank is written to
+    <output_path>/iter_0000001/mp_rank_XX/model_optim_rng.pt.
+
+    Args:
+        download_root: Folder passed to clip.load as download_root.
+        output_path: Root directory for the converted checkpoint files.
+        tensor_parallel_size: Number of tensor-parallel ranks to shard the weights for.
+        use_te: Use Transformer Engine parameter names and add empty _extra_state entries.
+
+    Raises:
+        AssertionError: If a visual parameter name is not recognised.
+        FileExistsError: If a rank output directory already exists (os.makedirs is
+            called without exist_ok).
+    """
+    device = "cuda"
+
+    model, _ = clip.load("ViT-L/14@336px", device=device, download_root=download_root)
+
+    state_dict = model.state_dict()
+    new_state_dicts = [{"model": dict()} for _ in range(tensor_parallel_size)]
+
+    # Indices from mapping pytorch multihead attention to megatron.
+    # For each head, its three kv_channels-sized row ranges (at offsets 0, hidden_dim and
+    # 2 * hidden_dim of the fused projection) are placed next to each other.
+    kv_channels = 64
+    hidden_dim = 1024
+    num_heads = 16
+    indices = []
+    for i in range(num_heads):
+        lb = i * kv_channels
+        ub = (i + 1) * kv_channels
+        indices.append(torch.arange(lb, ub, dtype=torch.int))
+        indices.append(torch.arange(hidden_dim + lb, hidden_dim + ub, dtype=torch.int))
+        indices.append(torch.arange(2 * hidden_dim + lb, 2 * hidden_dim + ub, dtype=torch.int))
+
+    indices = torch.cat(indices)
+
+    # Layers for which TE expects an _extra_state entry (loop-invariant).
+    extra_state_layers = ("linear_qkv", "linear_proj", "linear_fc1", "linear_fc2")
+
+    for name, tensor in state_dict.items():
+        # Skip text model.
+        if "visual" not in name:
+            continue
+
+        # Skip final layers not used in our model.
+        if name == "visual.proj" or "ln_post" in name:
+            continue
+
+        # Map parameter names to ones used in megatron.
+        new_name = ""
+        new_tensor = tensor
+        if new_tensor.dtype == torch.float16:
+            new_tensor = new_tensor.to(torch.float32)
+
+        # This is used for chunking some tensors to target tensor parallel size.
+        # None means the tensor is replicated on every rank.
+        chunk_dim = None
+
+        if "class_embedding" in name:
+            new_name = "class_token"
+            # Our model uses class token that is expanded to input dimensions already.
+            new_tensor = new_tensor.expand(1, 1, -1)
+        elif "positional_embedding" in name:
+            new_name = "position_embeddings.weight"
+        elif "conv1" in name:
+            new_name = "conv1.weight"
+        elif "ln_pre.weight" in name:
+            new_name = "ln_pre.weight"
+        elif "ln_pre.bias" in name:
+            new_name = "ln_pre.bias"
+        elif "transformer.resblocks" in name:
+            layer_idx = name.split(".")[3]
+            base = f"decoder.layers.{layer_idx}"
+
+            if "attn.in_proj_weight" in name:
+                new_name = f"{base}.self_attention.linear_qkv.weight"
+                new_tensor = new_tensor[indices]
+                chunk_dim = 0
+            elif "attn.in_proj_bias" in name:
+                new_name = f"{base}.self_attention.linear_qkv.bias"
+                new_tensor = new_tensor[indices]
+                chunk_dim = 0
+            elif "attn.out_proj.weight" in name:
+                new_name = f"{base}.self_attention.linear_proj.weight"
+                chunk_dim = 1
+            elif "attn.out_proj.bias" in name:
+                new_name = f"{base}.self_attention.linear_proj.bias"
+            elif "ln_1.weight" in name:
+                new_name = f"{base}.input_layernorm.weight"
+                if use_te:
+                    new_name = f"{base}.self_attention.linear_qkv.layer_norm_weight"
+            elif "ln_1.bias" in name:
+                new_name = f"{base}.input_layernorm.bias"
+                if use_te:
+                    new_name = f"{base}.self_attention.linear_qkv.layer_norm_bias"
+            elif "mlp.c_fc.weight" in name:
+                new_name = f"{base}.mlp.linear_fc1.weight"
+                chunk_dim = 0
+            elif "mlp.c_fc.bias" in name:
+                new_name = f"{base}.mlp.linear_fc1.bias"
+                chunk_dim = 0
+            elif "mlp.c_proj.weight" in name:
+                new_name = f"{base}.mlp.linear_fc2.weight"
+                chunk_dim = 1
+            elif "mlp.c_proj.bias" in name:
+                new_name = f"{base}.mlp.linear_fc2.bias"
+            elif "ln_2.weight" in name:
+                new_name = f"{base}.pre_mlp_layernorm.weight"
+                if use_te:
+                    new_name = f"{base}.mlp.linear_fc1.layer_norm_weight"
+            elif "ln_2.bias" in name:
+                new_name = f"{base}.pre_mlp_layernorm.bias"
+                if use_te:
+                    new_name = f"{base}.mlp.linear_fc1.layer_norm_bias"
+
+        assert new_name != "", f"unexpected layer name {name}"
+
+        if chunk_dim is None:
+            new_tensors = [new_tensor for _ in range(tensor_parallel_size)]
+        else:
+            new_tensors = torch.chunk(new_tensor, tensor_parallel_size, dim=chunk_dim)
+
+        # TE sets _extra_state (for FP8 purposes), so set an empty one here for compatibility.
+        # The name depends only on the parameter, so it is computed once for all ranks.
+        # The substring test runs first so names without a "." never reach split()[-2].
+        extra_state_name = None
+        if use_te and any(layer_name in new_name for layer_name in extra_state_layers):
+            if new_name.split(".")[-2] in extra_state_layers:
+                # Replace the weight name.
+                extra_state_name = new_name[: new_name.rfind(".") + 1] + "_extra_state"
+
+        for i in range(tensor_parallel_size):
+            # chunk() creates a view of a bigger tensor. clone() is used here to avoid excessive storage.
+            new_state_dicts[i]["model"][new_name] = new_tensors[i].clone()
+
+            if extra_state_name is not None:
+                new_state_dicts[i]["model"][extra_state_name] = None
+
+    for i in range(tensor_parallel_size):
+        # Zero-pad the rank to two digits so ranks >= 10 give mp_rank_10, not mp_rank_010.
+        output_dir_tp = os.path.join(output_path, "iter_0000001", f"mp_rank_{i:02d}")
+        os.makedirs(output_dir_tp)
+        output_path_tp = os.path.join(output_dir_tp, "model_optim_rng.pt")
+        torch.save(new_state_dicts[i], output_path_tp)
+
+
+if __name__ == "__main__":
+    # Command-line entry point: parse the options and run the conversion.
+    usage = """
+Convert OpenAI CLIP VIT weights to megatron format.
+
+
+Example usage:
+python clip_converter.py --download-root /some/download/folder --output /some/output/folder --tensor-parallel-size 4
+"""
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.RawDescriptionHelpFormatter, description=usage
+    )
+
+    parser.add_argument(
+        "--download-root", required=True, type=str, help="Download folder for OpenAI CLIP weights"
+    )
+    parser.add_argument(
+        "--output", required=True, type=str, help="output directory for megatron state dict file(s)"
+    )
+    parser.add_argument(
+        "--tensor-parallel-size", default=1, type=int, help="model tensor parallel size"
+    )
+    parser.add_argument("--use-te", help="Use Transformer Engine", action="store_true")
+
+    args = parser.parse_args()
+
+    convert(
+        download_root=args.download_root,
+        output_path=args.output,
+        tensor_parallel_size=args.tensor_parallel_size,
+        use_te=args.use_te,
+    )
+
+    print("done.")
--- a/examples/multimodal/model_converter/internvit_converter.py
+++ b/examples/multimodal/model_converter/internvit_converter.py
+import argparse
+import os
+
+import torch
+from transformers import AutoModel
+
+
+def convert(model_name, output_path, tensor_parallel_size, use_te):
+    """Convert an InternViT HuggingFace checkpoint to mcore format.
+
+    The attention geometry is hard-coded to 25 heads of width 128 (hidden size
+    3200). One ``model_optim_rng.pt`` file is written per tensor-parallel rank
+    under ``<output_path>/iter_0000001/mp_rank_XX``.
+
+    Args:
+        model_name: Model name or path handed to ``AutoModel.from_pretrained``.
+        output_path: Root directory for the converted checkpoint.
+        tensor_parallel_size: Target tensor parallel size; only 1 and 8 are supported.
+        use_te: If True, add an empty ``_extra_state`` entry next to every
+            linear_qkv / linear_proj / linear_fc1 / linear_fc2 parameter.
+
+    Raises:
+        NotImplementedError: If ``tensor_parallel_size`` is neither 1 nor 8.
+        RuntimeError: If the checkpoint holds a parameter name with no known mapping.
+    """
+    num_heads = 25
+    head_dim = 128
+    hidden_size = num_heads * head_dim
+
+    # Validate up front so an unsupported size fails before the checkpoint is loaded.
+    if tensor_parallel_size == 1:
+        num_padded_heads = num_heads
+    elif tensor_parallel_size == 8:
+        # Note: 25 is not divisible by 8 and we don't currently support uneven heads split with tensor parallelism.
+        # So we pad with dummy all-zero heads. Please use a nice even number of attention heads in your model.
+        num_padded_heads = 32
+    else:
+        raise NotImplementedError(f"invalid tensor parallel size value: {tensor_parallel_size}")
+    padded_hidden_size = head_dim * num_padded_heads
+
+    hf_model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
+    hf_state_dict = hf_model.state_dict()
+    new_state_dicts = [{"model": dict()} for _ in range(tensor_parallel_size)]
+
+    # Row permutation for the fused qkv weight. The source rows are read as three
+    # contiguous sections of hidden_size rows each (presumably Q, K and V); for every
+    # head j the permutation places that head's rows from the three sections next to
+    # each other: order[j*3*head_dim + t*head_dim + i] == t*hidden_size + j*head_dim + i.
+    order = (
+        torch.arange(3 * hidden_size)
+        .view(3, num_heads, head_dim)
+        .permute(1, 0, 2)
+        .reshape(-1)
+    )
+
+    # TE sets _extra_state (for FP8 purposes), so set an empty one for these layers for compatibility.
+    extra_state_layers = ("linear_qkv", "linear_proj", "linear_fc1", "linear_fc2")
+
+    def zero_pad(source, shape):
+        """Return a zero tensor of ``shape`` with ``source`` copied into its leading corner."""
+        padded = torch.zeros(shape, dtype=source.dtype, device=source.device)
+        padded[tuple(slice(0, size) for size in source.shape)] = source
+        return padded
+
+    for name, tensor in hf_state_dict.items():
+        # Map parameter names to ones used in megatron.
+        new_tensor = tensor
+
+        # This is used for chunking some tensors to target tensor parallel size.
+        chunk_dim = None
+
+        if "embeddings.class_embedding" in name:
+            new_name = "class_token"
+        elif "embeddings.patch_embedding.weight" in name:
+            new_name = "conv1.weight"
+        elif "embeddings.patch_embedding.bias" in name:
+            new_name = "conv1.bias"
+        elif "embeddings.position_embedding" in name:
+            new_name = "position_embeddings.weight"
+            new_tensor = tensor.squeeze(0)
+        elif "encoder.layers" in name:
+            layer_idx = name.split(".")[2]
+            base = f"decoder.layers.{layer_idx}"
+
+            if "ls1" in name:
+                new_name = f"{base}.ls1"
+            elif "ls2" in name:
+                new_name = f"{base}.ls2"
+            elif "attn.qkv.weight" in name:
+                new_name = f"{base}.self_attention.linear_qkv.weight"
+                # Reorder the rows per head, then append all-zero rows for the padded heads.
+                new_tensor = zero_pad(tensor[order], (3 * padded_hidden_size, tensor.shape[-1]))
+                chunk_dim = 0
+            elif "attn.q_norm.weight" in name:
+                new_name = f"{base}.self_attention.q_layernorm.weight"
+                new_tensor = zero_pad(tensor, (padded_hidden_size,))
+                chunk_dim = 0
+            elif "attn.k_norm.weight" in name:
+                new_name = f"{base}.self_attention.k_layernorm.weight"
+                new_tensor = zero_pad(tensor, (padded_hidden_size,))
+                chunk_dim = 0
+            elif "attn.proj.weight" in name:
+                new_name = f"{base}.self_attention.linear_proj.weight"
+                # The projection consumes the head outputs, so the padding goes on the input (column) axis.
+                new_tensor = zero_pad(tensor, (tensor.shape[0], padded_hidden_size))
+                chunk_dim = 1
+            elif "attn.proj.bias" in name:
+                new_name = f"{base}.self_attention.linear_proj.bias"
+            elif "mlp.fc1.weight" in name:
+                new_name = f"{base}.mlp.linear_fc1.weight"
+                chunk_dim = 0
+            elif "mlp.fc1.bias" in name:
+                new_name = f"{base}.mlp.linear_fc1.bias"
+                chunk_dim = 0
+            elif "mlp.fc2.weight" in name:
+                new_name = f"{base}.mlp.linear_fc2.weight"
+                chunk_dim = 1
+            elif "mlp.fc2.bias" in name:
+                new_name = f"{base}.mlp.linear_fc2.bias"
+            elif "norm1" in name:
+                new_name = f"{base}.input_layernorm.weight"
+            elif "norm2" in name:
+                new_name = f"{base}.pre_mlp_layernorm.weight"
+            else:
+                raise RuntimeError("unexpected transformer layer name", name)
+        else:
+            raise RuntimeError("unexpected layer name", name)
+
+        # Register the empty TE extra state when the parameter's parent module is one of the TE linears.
+        name_parts = new_name.split(".")
+        if use_te and len(name_parts) >= 2 and name_parts[-2] in extra_state_layers:
+            # Replace the weight name.
+            extra_state_name = new_name[: new_name.rfind(".") + 1] + "_extra_state"
+            for i in range(tensor_parallel_size):
+                new_state_dicts[i]["model"][extra_state_name] = None
+
+        if chunk_dim is None:
+            # Not sharded: every rank receives the full tensor.
+            new_tensors = [new_tensor for _ in range(tensor_parallel_size)]
+        else:
+            new_tensors = torch.chunk(new_tensor, tensor_parallel_size, dim=chunk_dim)
+
+        for i in range(tensor_parallel_size):
+            new_state_dicts[i]["model"][new_name] = new_tensors[i].clone()
+
+    for i in range(tensor_parallel_size):
+        # Two-digit, zero-padded rank index in the directory name.
+        output_dir_tp = os.path.join(output_path, "iter_0000001", f"mp_rank_{i:02d}")
+        os.makedirs(output_dir_tp, exist_ok=True)
+        output_path_tp = os.path.join(output_dir_tp, "model_optim_rng.pt")
+        torch.save(new_state_dicts[i], output_path_tp)
+        print("saved file", output_path_tp)
+
+    print("done")
+
+
+if __name__ == "__main__":
+    # Command-line entry point: parse the options and run the conversion.
+    parser = argparse.ArgumentParser(description="InternVIT HuggingFace to Mcore converter")
+    parser.add_argument("--model-name", type=str, default="OpenGVLab/InternViT-6B-448px-V1-5", help="Model name in HuggingFace")
+    parser.add_argument("--output-dir", type=str, required=True, help="Output directory for the mcore model.")
+    # --use-te defaults to True (kept for backward compatibility), so on its own it could never
+    # be switched off; --no-use-te stores False into the same destination.
+    parser.add_argument("--use-te", action="store_true", default=True, help="Use Transformer Engine (default).")
+    parser.add_argument("--no-use-te", dest="use_te", action="store_false", help="Do not use Transformer Engine.")
+    parser.add_argument("--tensor-parallel-size", type=int, required=True)
+
+    args = parser.parse_args()
+
+    convert(args.model_name, args.output_dir, args.tensor_parallel_size, args.use_te)
--- a/examples/multimodal/model_converter/siglip_converter.py
+++ b/examples/multimodal/model_converter/siglip_converter.py
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+import argparse
+import os
+from transformers import PaliGemmaForConditionalGeneration
+import torch
+
+
+def convert(output_path, tensor_parallel_size, use_te):
+    """Convert the vision tower weights of PaliGemma (SigLIP) to megatron format.
+
+    Loads ``google/paligemma-3b-pt-448``, renames the ``vision_tower.vision_model``
+    parameters, fuses the separate q/k/v projections into one ``linear_qkv``
+    tensor and writes one ``model_optim_rng.pt`` per tensor-parallel rank under
+    ``<output_path>/iter_0000001/mp_rank_XX``. The geometry is hard-coded:
+    27 layers, 16 heads of width 72.
+
+    Args:
+        output_path: Root directory for the converted checkpoint.
+        tensor_parallel_size: Number of tensor-parallel ranks to shard for.
+        use_te: If True, use the Transformer Engine parameter names for the
+            layer norms and add an empty ``_extra_state`` entry next to every
+            linear_qkv / linear_proj / linear_fc1 / linear_fc2 parameter.
+    """
+    # Prefer the GPU, but fall back to the CPU so the conversion still runs without CUDA.
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    model_id = "google/paligemma-3b-pt-448"
+    model = PaliGemmaForConditionalGeneration.from_pretrained(model_id).eval()
+
+    model = model.to(device)
+
+    print(model.config)
+    for name, tensor in model.state_dict().items():
+        if "vision_model" not in name:
+            continue
+        shape_str = "(" + ", ".join([str(x) for x in tensor.shape]) + ")"
+        print(f"{name:<75} {shape_str:>20}")
+
+    state_dict = model.state_dict()
+    new_state_dicts = [{"model": dict()} for _ in range(tensor_parallel_size)]
+
+    # TE sets _extra_state (for FP8 purposes), so set an empty one for these layers for compatibility.
+    extra_state_layers = ("linear_qkv", "linear_proj", "linear_fc1", "linear_fc2")
+
+    def add_chunk_tensor(new_tensor, new_name, chunk_dim=None):
+        """Store ``new_tensor`` as ``new_name`` on every rank, split along ``chunk_dim`` if given."""
+        if chunk_dim is None:
+            new_tensors = [new_tensor for _ in range(tensor_parallel_size)]
+        else:
+            new_tensors = torch.chunk(new_tensor, tensor_parallel_size, dim=chunk_dim)
+
+        # The extra state name depends only on the parameter name, so derive it once for all ranks.
+        extra_state_name = None
+        name_parts = new_name.split(".")
+        if use_te and len(name_parts) >= 2 and name_parts[-2] in extra_state_layers:
+            # Replace the weight name.
+            extra_state_name = new_name[: new_name.rfind(".") + 1] + "_extra_state"
+
+        for i in range(tensor_parallel_size):
+            # chunk() creates a view of a bigger tensor. clone() is used here to avoid excessive storage.
+            new_state_dicts[i]["model"][new_name] = new_tensors[i].clone()
+            if extra_state_name is not None:
+                new_state_dicts[i]["model"][extra_state_name] = None
+
+    # Store half-precision weights as float32.
+    for name, tensor in state_dict.items():
+        if tensor.dtype == torch.float16:
+            state_dict[name] = tensor.to(torch.float32)
+
+    add_chunk_tensor(
+        state_dict["vision_tower.vision_model.embeddings.position_embedding.weight"],
+        "position_embeddings.weight")
+    add_chunk_tensor(
+        state_dict["vision_tower.vision_model.embeddings.patch_embedding.weight"],
+        "conv1.weight")
+    add_chunk_tensor(
+        state_dict["vision_tower.vision_model.embeddings.patch_embedding.bias"],
+        "conv1.bias")
+
+    head_dim = 72
+    num_head = 16
+    for layer_idx in range(27):
+        origin_base = f"vision_tower.vision_model.encoder.layers.{layer_idx}"
+        target_base = f"decoder.layers.{layer_idx}"
+
+        for param_type in ["weight", "bias"]:
+            # QKV
+            q_proj_params = state_dict[f"{origin_base}.self_attn.q_proj.{param_type}"]
+            k_proj_params = state_dict[f"{origin_base}.self_attn.k_proj.{param_type}"]
+            v_proj_params = state_dict[f"{origin_base}.self_attn.v_proj.{param_type}"]
+            # Do some tensor manipulation because megatron expect one tensor
+            # projection for the QKV in the order
+            # [(Q1, K1, V1), (Q2, K2, V2), ...] where Qi is the query of the
+            # i-th head with dimension head_dim.
+            new_tensor = torch.concatenate([
+                q_proj_params.view(num_head, head_dim, -1),
+                k_proj_params.view(num_head, head_dim, -1),
+                v_proj_params.view(num_head, head_dim, -1)], axis=1).view(
+                    3*head_dim*num_head, -1)
+            if param_type == "bias":
+                # The view above gave the 1-D bias a trailing axis of size 1; drop it again.
+                new_tensor = new_tensor[:, 0]
+            new_name = f"{target_base}.self_attention.linear_qkv.{param_type}"
+            add_chunk_tensor(new_tensor, new_name, chunk_dim=0)
+            # linear_proj
+            add_chunk_tensor(
+                state_dict[f"{origin_base}.self_attn.out_proj.{param_type}"],
+                f"{target_base}.self_attention.linear_proj.{param_type}",
+                chunk_dim=1 if param_type == "weight" else None)
+            # layer_norm
+            new_name = f"{target_base}.input_layernorm.{param_type}"
+            if use_te:
+                new_name = f"{target_base}.self_attention.linear_qkv.layer_norm_{param_type}"
+            add_chunk_tensor(
+                state_dict[f"{origin_base}.layer_norm1.{param_type}"],
+                new_name)
+            # FC 1
+            add_chunk_tensor(
+                state_dict[f"{origin_base}.mlp.fc1.{param_type}"],
+                f"{target_base}.mlp.linear_fc1.{param_type}",
+                chunk_dim=0)
+            # FC 2
+            add_chunk_tensor(
+                state_dict[f"{origin_base}.mlp.fc2.{param_type}"],
+                f"{target_base}.mlp.linear_fc2.{param_type}",
+                chunk_dim=1 if param_type == "weight" else None)
+            # layer_norm
+            new_name = f"{target_base}.pre_mlp_layernorm.{param_type}"
+            if use_te:
+                new_name = f"{target_base}.mlp.linear_fc1.layer_norm_{param_type}"
+            add_chunk_tensor(
+                state_dict[f"{origin_base}.layer_norm2.{param_type}"],
+                new_name)
+
+    add_chunk_tensor(
+        state_dict["vision_tower.vision_model.post_layernorm.weight"],
+        "ln_post.weight")
+    add_chunk_tensor(
+        state_dict["vision_tower.vision_model.post_layernorm.bias"],
+        "ln_post.bias")
+
+    for i in range(tensor_parallel_size):
+        # Two-digit, zero-padded rank index; exist_ok lets the converter be re-run into the same directory.
+        output_dir_tp = os.path.join(output_path, "iter_0000001", f"mp_rank_{i:02d}")
+        os.makedirs(output_dir_tp, exist_ok=True)
+        output_path_tp = os.path.join(output_dir_tp, "model_optim_rng.pt")
+        torch.save(new_state_dicts[i], output_path_tp)
+
+
+if __name__ == "__main__":
+    # Command-line entry point: parse the options, run the conversion, report completion.
+    # RawDescriptionHelpFormatter prints the description with its line breaks as written.
+    description = """
+Convert SigLIP weights to megatron format.
+
+
+Example usage:
+python siglip_converter.py --tensor-parallel-size 4 --output google_paligemma_3b_pt_44_mcore_tp_4 --use-te
+
+examples/multimodal/combine_mistral_clip.sh Mistral-7B-Instruct-v0.3-mcore-tp4 google_paligemma_3b_pt_44_mcore_tp_4 mistral_7b_instruct_v0p3_google_paligemma_3b_pt_44_mcore_tp_4
+"""
+    parser = argparse.ArgumentParser(
+        description=description, formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+    parser.add_argument(
+        "--output",
+        type=str,
+        required=True,
+        help="output directory for megatron state dict file(s)",
+    )
+    parser.add_argument(
+        "--tensor-parallel-size",
+        type=int,
+        default=1,
+        help="model tensor parallel size",
+    )
+    parser.add_argument(
+        "--use-te",
+        action="store_true",
+        help="Use Transformer Engine",
+    )
+
+    args = parser.parse_args()
+    convert(args.output, args.tensor_parallel_size, args.use_te)
+    print("done.")