total_len:int# Total token count in the sample, including text and image tokens
labels:torch.Tensor=None
@dataclass
class ImageTaskSamplePacked(Sample):
    """Dataclass to store a single packed sample (not a batch).

    P = Number of sub-samples in the packed sample
    seq_len = Total sequence length
    num_imgs = Number of images across all samples in the packed sample
    """

    __key__: str  # Sample name
    __restore_key__: Tuple[Union[str, int, tuple], ...]  # Key used by the dataset library to restore this sample
    __subflavor__: Dict  # Sample metadata. Deprecated.
    __subflavors__: Dict  # Sample metadata.
    tokens: torch.Tensor  # Input tokens packed into a single tensor (seq_len,)
    labels: torch.Tensor  # Target tokens packed into a single tensor (seq_len,)
    imgs: List[torch.Tensor]  # Input images
    num_tiles: List[int]  # Number of tiles for each image of each sample (num_imgs)
    max_length: int  # Maximum length across sub-samples.
    cu_lengths: List[int]  # Cumulative length of each sub-sample in this packed sample incl. text and image tokens (P,)
# Typing for the resulting batch data after encode_batch()
@dataclass
class ImageTaskBatchPacked(Batch):
    """Dataclass to store a batch of packed samples.

    N = Batch size
    P = Number of samples in the packed sample
    seq_len = Maximum sequence length
    num_imgs = Number of images across all samples in the packed sample
    """

    __key__: List[str]  # Sample names
    __restore_key__: Tuple[Union[str, int, tuple], ...]  # Key used by the dataset library to restore this batch
    __subflavor__: Dict  # Sample metadata. Deprecated.
    __subflavors__: List[Dict]  # Sample metadatas.
    tokens: torch.Tensor  # Input tokens packed and padded (N, seq_len)
    labels: torch.Tensor  # Target tokens packed and padded (N, seq_len)
    imgs: torch.Tensor  # All image tiles stacked into a single tensor (num_tiles, C, H, W)
    num_tiles: List[List[int]]  # Number of tiles per image (N, num_imgs)
    max_lengths: List[int]  # Maximum length across sub-samples (N,)
    cu_lengths: List[List[int]]  # Cumulative length of each sub-sample in each packed sample of the batch (N, P)
# Based on https://github.com/hiyouga/LLaMA-Factory/blob/641d0dab08d96a93c34657742213d8994d9ed476/src/llamafactory/data/processors/processor_utils.py#L19
def search_for_fit(numbers: List[int], capacity: int) -> int:
    """Finds the index of largest number that fits into the knapsack with the given capacity.

    Args:
        numbers: Candidate lengths, sorted in ascending order (required by bisect).
        capacity: Remaining capacity of the knapsack.

    Returns:
        Index of the largest element that is <= capacity, or -1 if no element fits
        (including when ``numbers`` is empty).
    """
    # bisect.bisect returns the insertion point, i.e. the number of elements <= capacity,
    # so the largest fitting element sits at index - 1. An insertion point of 0 means
    # even the smallest element exceeds the capacity.
    index = bisect.bisect(numbers, capacity)
    return -1 if index == 0 else (index - 1)
# Based on https://github.com/hiyouga/LLaMA-Factory/blob/641d0dab08d96a93c34657742213d8994d9ed476/src/llamafactory/data/processors/processor_utils.py#L27