Commit 688448db authored by silencealiang

Update code

parent a02a5490
Pipeline #2503 passed
File mode changed from 100644 to 100755
#!/bin/bash
# Use: ./train.sh <data-path> <tokenizer-path>
MODEL_SCALE="800M" # or "8B"
case "${MODEL_SCALE}" in
"800M")
TENSOR_MODEL_PARALLEL_SIZE=1
NUM_LAYERS=48
HIDDEN_SIZE=1024
NUM_ATTENTION_HEADS=16
GLOBAL_BATCH_SIZE=32
;;
"8B")
TENSOR_MODEL_PARALLEL_SIZE=4
NUM_LAYERS=56
HIDDEN_SIZE=4096
NUM_ATTENTION_HEADS=32
GLOBAL_BATCH_SIZE=8
;;
*)
echo "Invalid version specified"
exit 1
;;
esac
DATA_PATH=$1
TOKENIZER_PATH=$2
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_TIMEOUT=19
export NCCL_IB_QPS_PER_CONNECTION=4
CHECKPOINT_DIR="./checkpoints"
DATACACHE_DIR="./data-cache"
TENSORBOARD_DIR="./tensorboard"
mkdir -p ${CHECKPOINT_DIR}
mkdir -p ${DATACACHE_DIR}
mkdir -p ${TENSORBOARD_DIR}
export TRITON_CACHE_DIR="./triton-cache/"
export TRITON_CACHE_MANAGER="megatron.core.ssm.triton_cache_manager:ParallelFileCacheManager"
SEQ_LEN=4096
TRAIN_SAMPLES=73242188 # 300B tokens / 4096
LR_WARMUP_SAMPLES=50000
LR_DECAY_SAMPLES=73192188 # TRAIN_SAMPLES - LR_WARMUP_SAMPLES
options=" \
--tensor-model-parallel-size ${TENSOR_MODEL_PARALLEL_SIZE} \
--sequence-parallel \
--pipeline-model-parallel-size 1 \
--use-distributed-optimizer \
--overlap-param-gather \
--overlap-grad-reduce \
--untie-embeddings-and-output-weights \
--init-method-std 0.02 \
--position-embedding-type none \
--num-layers ${NUM_LAYERS} \
--hidden-size ${HIDDEN_SIZE} \
--num-attention-heads ${NUM_ATTENTION_HEADS} \
--group-query-attention \
--num-query-groups 8 \
--hybrid-attention-ratio 0.08 \
--hybrid-mlp-ratio 0.5 \
--seq-length ${SEQ_LEN} \
--max-position-embeddings ${SEQ_LEN} \
--train-samples ${TRAIN_SAMPLES} \
--lr-warmup-samples ${LR_WARMUP_SAMPLES} \
--lr-decay-samples ${LR_DECAY_SAMPLES} \
--save ${CHECKPOINT_DIR} \
--load ${CHECKPOINT_DIR} \
--data-path ${DATA_PATH} \
--data-cache-path ${DATACACHE_DIR} \
--split 99,1,0 \
--tokenizer-type GPTSentencePieceTokenizer \
--tokenizer-model ${TOKENIZER_PATH} \
--distributed-backend nccl \
--micro-batch-size 4 \
--global-batch-size ${GLOBAL_BATCH_SIZE} \
--lr 2.5e-4 \
--min-lr 2.5e-5 \
--lr-decay-style cosine \
--weight-decay 0.1 \
--clip-grad 1.0 \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--disable-bias-linear \
--normalization RMSNorm \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--log-interval 10 \
--save-interval 2000 \
--eval-interval 2000 \
--eval-iters 32 \
--bf16 \
--use-mcore-models \
--spec megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec \
--no-create-attention-mask-in-dataloader \
--tensorboard-dir ${TENSORBOARD_DIR}"
torchrun --nproc_per_node 8 ../../pretrain_mamba.py ${options}
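# Example single-node launch (illustrative placeholder paths, not taken from this commit):
#   bash train.sh /path/to/my-corpus_text_document /path/to/tokenizer.model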
@@ -7,13 +7,13 @@ do
fi
done
mpirun -np 256 --hostfile gptnodes \
mpirun -np 32 --hostfile hostfile_mixtral_8x7B \
--allow-run-as-root \
--bind-to none \
--mca plm_rsh_no_tree_spawn 1 \
train_GPT-MOE_567B.sh node002 --profiling=$profiling > output.log 2>&1
train_mixtral_8x7B_multinodes.sh node066 --profiling=$profiling > output.log 2>&1
wait
rm -rf CKPT
#rm -rf mixtral_dataset/my-mixtral_text_document
\ No newline at end of file
#rm -rf mixtral_dataset/my-mixtral_text_document
@@ -4,18 +4,23 @@ for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
export GPU_FLUSH_ON_EXECUTION=1
export HIP_DIRECT_DISPATCH=0
fi
done
source /opt/dtk/env.sh
# Runs Mixtral 8x7B model
source /opt/dtk/env.sh
# default env
CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
export PYTHONPATH=${MEGATRON_PATH}:$PYTHONPATH
export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
# nccl env
export NCCL_ALGO=Ring
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
@@ -23,9 +28,10 @@ export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
#export NCCL_TOPO_FILE="/public/home/xingjl/dependency/rccl-tests-0204/topo-input.xml"
export NCCL_TOPO_FILE="/public/home/xingjl/dependency/rccl-tests-0204/topo-input.xml"
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
export GLOG_minloglevel=3
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
@@ -75,7 +81,7 @@ MOE_ARGS=(
--moe-token-dispatcher-type alltoall
--moe-expert-capacity-factor 0.5
--moe-pad-expert-input-to-capacity
--moe-grouped-gemm
#--moe-grouped-gemm
)
DATA_ARGS=(
@@ -103,25 +109,17 @@ TRAINING_ARGS=(
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7 8
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_mixtral_1nodes
--profile-dir torch_prof_mixtral_1nodes_tp2-pp1-ep8-ep_tp1
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7 8
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 2
--pipeline-model-parallel-size 1
--expert-model-parallel-size 2
--expert-model-parallel-size 8
--expert-tensor-parallel-size 1
--use-distributed-optimizer
--sequence-parallel
@@ -159,10 +157,6 @@ APP="python3 -u pretrain_gpt.py \
if [[ $profiling == "torch" ]]; then
APP+=" ${TORCH_PROFIE_ARGS[@]}"
elif [[ $profiling == "hip" ]]; then
mkdir -p hip_prof_data
APP+=" ${HIP_PROFIE_ARGS[@]}"
APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi
# for Hygon CPU
@@ -4,18 +4,23 @@ for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
export GPU_FLUSH_ON_EXECUTION=1
export HIP_DIRECT_DISPATCH=0
fi
done
source /opt/dtk/env.sh
# Runs Mixtral 8x7B model
source /opt/dtk/env.sh
# default env
CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
export PYTHONPATH=${MEGATRON_PATH}:$PYTHONPATH
export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
# nccl env
export NCCL_ALGO=Ring
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
@@ -23,9 +28,10 @@ export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
#export NCCL_TOPO_FILE="/public/home/xingjl/dependency/rccl-tests-0204/topo-input.xml"
export NCCL_TOPO_FILE="/public/home/xingjl/dependency/rccl-tests-0204/topo-input.xml"
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
export GLOG_minloglevel=3
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
@@ -99,9 +105,6 @@ TRAINING_ARGS=(
--bf16
--overlap-param-gather
--overlap-grad-reduce
--recompute-granularity full
--recompute-method uniform
--recompute-num-layers 1
)
TORCH_PROFIE_ARGS=(
@@ -109,23 +112,15 @@ TORCH_PROFIE_ARGS=(
--profile-ranks 0 1 2 3 8 9 10 11
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_data_mixtral_2nodes
--profile-dir torch_prof_mixtral_4nodes_tp2-pp8-ep2-ep_tp1
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 8 9 10 11
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 4
--pipeline-model-parallel-size 4
--tensor-model-parallel-size 2
--pipeline-model-parallel-size 8
--expert-model-parallel-size 2
--expert-tensor-parallel-size 2
--expert-tensor-parallel-size 1
--use-distributed-optimizer
--sequence-parallel
)
@@ -162,10 +157,6 @@ APP="python3 -u pretrain_gpt.py \
if [[ $profiling == "torch" ]]; then
APP+=" ${TORCH_PROFIE_ARGS[@]}"
elif [[ $profiling == "hip" ]]; then
mkdir -p hip_prof_data
APP+=" ${HIP_PROFIE_ARGS[@]}"
APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi
# for Hygon CPU
#!/bin/bash
# Runs Mixtral 8x7B model
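# Usage (positional arguments, consumed below as CHECKPOINT_PATH, TOKENIZER_MODEL, DATA_PATH):
#   bash <this-script> <checkpoint-path> <tokenizer-model> <data-path>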
export CUDA_DEVICE_MAX_CONNECTIONS=1
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=${MASTER_ADDR:-"localhost"}
MASTER_PORT=${MASTER_PORT:-"6000"}
NNODES=${SLURM_NNODES:-"1"}
NODE_RANK=${RANK:-"0"}
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
CHECKPOINT_PATH=$1
TOKENIZER_MODEL=$2
DATA_PATH=$3
DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NNODES
--node_rank $NODE_RANK
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)
MODEL_ARGS=(
--use-mcore-models
--disable-bias-linear
--seq-length 4096
--max-position-embeddings 32768
--num-layers 32
--hidden-size 4096
--ffn-hidden-size 14336
--num-attention-heads 32
--init-method-std 0.01
--attention-dropout 0.0
--hidden-dropout 0.0
--normalization RMSNorm
--position-embedding-type rope
--swiglu
--untie-embeddings-and-output-weights
--group-query-attention
--num-query-groups 8
--no-masked-softmax-fusion
--no-position-embedding
--rotary-base 1000000
)
MOE_ARGS=(
--num-experts 8
--moe-router-topk 2
--moe-router-load-balancing-type aux_loss
--moe-aux-loss-coeff 1e-2
--moe-grouped-gemm
--moe-token-dispatcher-type alltoall
--overlap-param-gather
--overlap-grad-reduce
)
DATA_ARGS=(
--tokenizer-type Llama2Tokenizer
--tokenizer-model ${TOKENIZER_MODEL}
--data-path $DATA_PATH
--split 99990,8,2
)
TRAINING_ARGS=(
--micro-batch-size 1
--global-batch-size 256
--lr 1e-4
--train-iters 500000
--lr-decay-iters 320000
--lr-decay-style cosine
--min-lr 1.0e-5
--weight-decay 0.1
--lr-warmup-iters 500
--clip-grad 1.0
--bf16
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 1
--pipeline-model-parallel-size 4
--expert-model-parallel-size 8
--use-distributed-optimizer
--sequence-parallel
)
LOGGING_ARGS=(
--log-interval 1 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--tensorboard-dir "${CHECKPOINT_PATH}/tensorboard" \
--no-load-optim \
--no-load-rng
)
if [ -n "${WANDB_API_KEY}" ]; then
LOGGING_ARGS+=(
--wandb-project ${WANDB_PROJECT:-"Mixtral"}
--wandb-exp-name ${WANDB_NAME:-"Mixtral_8x7B"}
)
fi
torchrun ${DISTRIBUTED_ARGS[@]} pretrain_gpt.py \
${MODEL_ARGS[@]} \
${MOE_ARGS[@]} \
${DATA_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${LOGGING_ARGS[@]}
#!/bin/bash
MCORE_LM=$1 # <path_to_mcore_lm_model_folder>
MCORE_VISION=$2 # <path_to_mcore_vision_model_folder>
OUTPUT_DIR=$3 # <path_to_output_folder_for_combined_checkpoint>
MODEL_TYPE=$4 # Model type. Default: Mistral CLIP example.
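# Usage: bash <this-script> <mcore-lm-dir> <mcore-vision-dir> <output-dir> [nvlm]
# Passing "nvlm" as the 4th argument selects the TP=8 layout below; any other value uses the Mistral CLIP TP=4 layout.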
if [[ $MODEL_TYPE == "nvlm" ]]; then
# NVLM TP=8
python examples/multimodal/combine_state_dicts.py \
--input \
${MCORE_LM}/iter_0000001/mp_rank_00/model_optim_rng.pt \
${MCORE_VISION}/iter_0000001/mp_rank_00/model_optim_rng.pt \
${MCORE_LM}/iter_0000001/mp_rank_01/model_optim_rng.pt \
${MCORE_VISION}/iter_0000001/mp_rank_01/model_optim_rng.pt \
${MCORE_LM}/iter_0000001/mp_rank_02/model_optim_rng.pt \
${MCORE_VISION}/iter_0000001/mp_rank_02/model_optim_rng.pt \
${MCORE_LM}/iter_0000001/mp_rank_03/model_optim_rng.pt \
${MCORE_VISION}/iter_0000001/mp_rank_03/model_optim_rng.pt \
${MCORE_LM}/iter_0000001/mp_rank_04/model_optim_rng.pt \
${MCORE_VISION}/iter_0000001/mp_rank_04/model_optim_rng.pt \
${MCORE_LM}/iter_0000001/mp_rank_05/model_optim_rng.pt \
${MCORE_VISION}/iter_0000001/mp_rank_05/model_optim_rng.pt \
${MCORE_LM}/iter_0000001/mp_rank_06/model_optim_rng.pt \
${MCORE_VISION}/iter_0000001/mp_rank_06/model_optim_rng.pt \
${MCORE_LM}/iter_0000001/mp_rank_07/model_optim_rng.pt \
${MCORE_VISION}/iter_0000001/mp_rank_07/model_optim_rng.pt \
--prefixes language_model vision_model language_model vision_model language_model vision_model language_model vision_model language_model vision_model language_model vision_model language_model vision_model language_model vision_model \
--output \
${OUTPUT_DIR}/iter_0000001/mp_rank_00/model_optim_rng.pt \
${OUTPUT_DIR}/iter_0000001/mp_rank_01/model_optim_rng.pt \
${OUTPUT_DIR}/iter_0000001/mp_rank_02/model_optim_rng.pt \
${OUTPUT_DIR}/iter_0000001/mp_rank_03/model_optim_rng.pt \
${OUTPUT_DIR}/iter_0000001/mp_rank_04/model_optim_rng.pt \
${OUTPUT_DIR}/iter_0000001/mp_rank_05/model_optim_rng.pt \
${OUTPUT_DIR}/iter_0000001/mp_rank_06/model_optim_rng.pt \
${OUTPUT_DIR}/iter_0000001/mp_rank_07/model_optim_rng.pt
else
# Mistral CLIP example TP=4.
python examples/multimodal/combine_state_dicts.py \
--input \
${MCORE_LM}/iter_0000001/mp_rank_00/model_optim_rng.pt \
${MCORE_VISION}/iter_0000001/mp_rank_00/model_optim_rng.pt \
${MCORE_LM}/iter_0000001/mp_rank_01/model_optim_rng.pt \
${MCORE_VISION}/iter_0000001/mp_rank_01/model_optim_rng.pt \
${MCORE_LM}/iter_0000001/mp_rank_02/model_optim_rng.pt \
${MCORE_VISION}/iter_0000001/mp_rank_02/model_optim_rng.pt \
${MCORE_LM}/iter_0000001/mp_rank_03/model_optim_rng.pt \
${MCORE_VISION}/iter_0000001/mp_rank_03/model_optim_rng.pt \
--prefixes language_model vision_model language_model vision_model language_model vision_model language_model vision_model \
--output \
${OUTPUT_DIR}/iter_0000001/mp_rank_00/model_optim_rng.pt \
${OUTPUT_DIR}/iter_0000001/mp_rank_01/model_optim_rng.pt \
${OUTPUT_DIR}/iter_0000001/mp_rank_02/model_optim_rng.pt \
${OUTPUT_DIR}/iter_0000001/mp_rank_03/model_optim_rng.pt
fi
echo 1 > ${OUTPUT_DIR}/latest_checkpointed_iteration.txt
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
from dataclasses import dataclass
import torch
from megatron.training.activations import fast_gelu, quick_gelu, squared_relu
def get_language_model_config(config):
if config.language_model_type == "llama3_8b":
config.activation_func = torch.nn.functional.silu
config.add_bias_linear = False
config.bias_activation_fusion = False
config.gated_linear_unit = True
config.apply_query_key_layer_scaling = False
config.layernorm_zero_centered_gamma = (
False # Zero centered gamma not supported for RMSNorm
)
config.bias_dropout_fusion = False
config.apply_rope_fusion = False
config.attention_softmax_in_fp32 = True
config.ffn_hidden_size = 14336
elif config.language_model_type == "mistral_7b":
config.activation_func = torch.nn.functional.silu
config.add_bias_linear = False
config.bias_activation_fusion = False
config.gated_linear_unit = True
config.apply_query_key_layer_scaling = False
config.layernorm_zero_centered_gamma = (
False # Zero centered gamma not supported for RMSNorm
)
config.bias_dropout_fusion = False
config.apply_rope_fusion = False
config.attention_softmax_in_fp32 = True
config.ffn_hidden_size = 14336
elif config.language_model_type == "yi-34b":
config.activation_func = torch.nn.functional.silu
config.add_bias_linear = False
config.bias_activation_fusion = False
config.gated_linear_unit = True
config.apply_query_key_layer_scaling = False
config.layernorm_zero_centered_gamma = (
False # Zero centered gamma not supported for RMSNorm
)
config.bias_dropout_fusion = False
config.apply_rope_fusion = False
config.attention_softmax_in_fp32 = True
config.ffn_hidden_size = 20480
elif config.language_model_type == "qwen2.5_7B":
config.activation_func = torch.nn.functional.silu
config.add_bias_linear = False
config.add_qkv_bias = True
config.bias_activation_fusion = False
config.gated_linear_unit = True
config.apply_query_key_layer_scaling = False
config.layernorm_zero_centered_gamma = (
False # Zero centered gamma not supported for RMSNorm
)
config.bias_dropout_fusion = False
config.apply_rope_fusion = False
config.attention_softmax_in_fp32 = True
config.ffn_hidden_size = 18944
elif config.language_model_type == "qwen2.0_72B":
config.activation_func = torch.nn.functional.silu
config.add_bias_linear = False
config.add_qkv_bias = True
config.bias_activation_fusion = False
config.gated_linear_unit = True
config.apply_query_key_layer_scaling = False
config.layernorm_zero_centered_gamma = (
False # Zero centered gamma not supported for RMSNorm
)
config.bias_dropout_fusion = False
config.apply_rope_fusion = False
config.attention_softmax_in_fp32 = True
config.ffn_hidden_size = 29568
else:
raise ValueError(f"unknown language model type {config.language_model_type}")
return config
def get_vision_model_config(config, apply_query_key_layer_scaling):
if config.vision_model_type == "clip":
config.num_layers = 24
config.num_attention_heads = 16
config.add_bias_linear = True
config.add_qkv_bias = True
config.hidden_size = 1024
config.hidden_dropout = 0.0
config.attention_dropout = 0.0
config.ffn_hidden_size = 4096
config.gated_linear_unit = False
config.activation_func = quick_gelu
config.kv_channels = 64
config.num_query_groups = 16
config.layernorm_zero_centered_gamma = False
config.apply_query_key_layer_scaling = apply_query_key_layer_scaling
config.bias_activation_fusion = False
config.bias_dropout_fusion = False
config.attention_softmax_in_fp32 = True
config.normalization = 'LayerNorm'
config.apply_rope_fusion = False
elif config.vision_model_type == "siglip":
config.num_layers = 27
config.num_attention_heads = 16
config.add_bias_linear = True
config.add_qkv_bias = True
config.hidden_size = 1152
config.hidden_dropout = 0.0
config.attention_dropout = 0.0
config.ffn_hidden_size = 4304
config.gated_linear_unit = False
config.activation_func = fast_gelu
config.kv_channels = 72
config.num_query_groups = 16
config.layernorm_zero_centered_gamma = False
config.apply_query_key_layer_scaling = apply_query_key_layer_scaling
config.bias_activation_fusion = False
config.bias_dropout_fusion = False
config.attention_softmax_in_fp32 = True
config.normalization = 'LayerNorm'
config.apply_rope_fusion = False
config.qk_layernorm = False
config.layernorm_epsilon = 1e-6
elif config.vision_model_type == "internvit":
config.num_layers = 45
config.num_attention_heads = 32 # Padded for TP=8.
config.num_query_groups = 32 # Padded for TP=8.
config.kv_channels = 128
config.add_bias_linear = True
config.add_qkv_bias = False
config.hidden_size = 3200
config.hidden_dropout = 0.0
config.attention_dropout = 0.0
config.ffn_hidden_size = 12800
config.gated_linear_unit = False
config.activation_func = torch.nn.functional.gelu
config.layernorm_zero_centered_gamma = False
config.apply_query_key_layer_scaling = apply_query_key_layer_scaling
config.bias_activation_fusion = False
config.bias_dropout_fusion = False
config.attention_softmax_in_fp32 = True
config.normalization = 'RMSNorm'
config.layernorm_epsilon = 1e-6
config.apply_rope_fusion = False
else:
raise ValueError(f"unknown vision model type {config.vision_model_type}")
return config
def get_vision_projection_config(config, hidden_size):
config.gated_linear_unit = False
config.bias_activation_fusion = False
config.add_bias_linear = False
config.hidden_size = hidden_size # Used as the vision projection output size, i.e., the input to the language model.
if config.language_model_type == "llama3_8b":
config.ffn_hidden_size = 14336
config.activation_func = torch.nn.functional.gelu
elif config.language_model_type == "mistral_7b":
config.ffn_hidden_size = 14336
config.activation_func = torch.nn.functional.gelu
config.normalization = None
elif config.language_model_type == "yi-34b":
config.ffn_hidden_size = 20480
config.normalization = "LayerNorm"
config.activation_func = torch.nn.functional.gelu
elif config.language_model_type == "qwen2.5_7B":
config.ffn_hidden_size = 3584
config.activation_func = torch.nn.functional.gelu
elif config.language_model_type == "qwen2.0_72B":
config.ffn_hidden_size = 29568
config.normalization = "LayerNorm"
config.activation_func = torch.nn.functional.gelu
else:
raise ValueError(f"unknown language model type {config.language_model_type}")
return config
@dataclass
class EvaluationConfig:
"""Evaluation related configuration."""
task: str
temperature: float = 1.0
top_p: float = 0.0
top_k: int = 0
out_seq_length: int = 32
output_path: str = ""
input_image_path: str = ""
gt_path: str = ""
num_partitions: int = 1
partition_id: int = 0
num_samples_per_partition: int = 0
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
from dataclasses import dataclass
import torch
from megatron.training.activations import fast_gelu, quick_gelu, squared_relu
def get_language_model_config(config):
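"""Apply per-model overrides (activation, bias, fusion, and FFN size settings) for the given language_model_type and return the updated config."""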
if config.language_model_type == "llama3_8b":
config.activation_func = torch.nn.functional.silu
config.add_bias_linear = False
config.bias_activation_fusion = False
config.gated_linear_unit = True
config.apply_query_key_layer_scaling = False
config.layernorm_zero_centered_gamma = (
False # Zero centered gamma not supported for RMSNorm
)
config.bias_dropout_fusion = False
config.apply_rope_fusion = False
config.attention_softmax_in_fp32 = True
config.ffn_hidden_size = 14336
elif config.language_model_type == "llama3.1_8b":
config.activation_func = torch.nn.functional.silu
config.add_bias_linear = False
config.bias_activation_fusion = False
config.gated_linear_unit = True
config.apply_query_key_layer_scaling = False
config.layernorm_zero_centered_gamma = (
False # Zero centered gamma not supported for RMSNorm
)
config.bias_dropout_fusion = False
config.apply_rope_fusion = False
config.attention_softmax_in_fp32 = True
config.ffn_hidden_size = 14336
elif config.language_model_type == "llama3.1_70B":
config.activation_func = torch.nn.functional.silu
config.add_bias_linear = False
config.bias_activation_fusion = False
config.gated_linear_unit = True
config.apply_query_key_layer_scaling = False
config.layernorm_zero_centered_gamma = (
False # Zero centered gamma not supported for RMSNorm
)
config.bias_dropout_fusion = False
config.apply_rope_fusion = False
config.attention_softmax_in_fp32 = True
config.ffn_hidden_size = 28672
elif config.language_model_type == "mistral_7b":
config.activation_func = torch.nn.functional.silu
config.add_bias_linear = False
config.bias_activation_fusion = False
config.gated_linear_unit = True
config.apply_query_key_layer_scaling = False
config.layernorm_zero_centered_gamma = (
False # Zero centered gamma not supported for RMSNorm
)
config.bias_dropout_fusion = False
config.apply_rope_fusion = False
config.attention_softmax_in_fp32 = True
config.ffn_hidden_size = 14336
elif config.language_model_type == "yi-34b":
config.activation_func = torch.nn.functional.silu
config.add_bias_linear = False
config.bias_activation_fusion = False
config.gated_linear_unit = True
config.apply_query_key_layer_scaling = False
config.layernorm_zero_centered_gamma = (
False # Zero centered gamma not supported for RMSNorm
)
config.bias_dropout_fusion = False
config.apply_rope_fusion = False
config.attention_softmax_in_fp32 = True
config.ffn_hidden_size = 20480
elif config.language_model_type == "qwen2.5_7B":
config.activation_func = torch.nn.functional.silu
config.add_bias_linear = False
config.add_qkv_bias = True
config.bias_activation_fusion = False
config.gated_linear_unit = True
config.apply_query_key_layer_scaling = False
config.layernorm_zero_centered_gamma = (
False # Zero centered gamma not supported for RMSNorm
)
config.bias_dropout_fusion = False
config.apply_rope_fusion = False
config.attention_softmax_in_fp32 = True
config.ffn_hidden_size = 18944
elif config.language_model_type == "qwen2.0_72B":
config.activation_func = torch.nn.functional.silu
config.add_bias_linear = False
config.add_qkv_bias = True
config.bias_activation_fusion = False
config.gated_linear_unit = True
config.apply_query_key_layer_scaling = False
config.layernorm_zero_centered_gamma = (
False # Zero centered gamma not supported for RMSNorm
)
config.bias_dropout_fusion = False
config.apply_rope_fusion = False
config.attention_softmax_in_fp32 = True
config.ffn_hidden_size = 29568
elif config.language_model_type == "llama3.2_1b":
config.activation_func = torch.nn.functional.silu
config.add_bias_linear = False
config.bias_activation_fusion = False
config.gated_linear_unit = True
config.apply_query_key_layer_scaling = False
config.layernorm_zero_centered_gamma = (
False # Zero centered gamma not supported for RMSNorm
)
config.bias_dropout_fusion = False
config.apply_rope_fusion = False
config.attention_softmax_in_fp32 = True
config.ffn_hidden_size = 8192
elif config.language_model_type.startswith("huggingface"):
# Loaded from HuggingFace config file.
pass
else:
raise ValueError(f"unknown language model type {config.language_model_type}")
return config
def get_vision_model_config(config, apply_query_key_layer_scaling):
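"""Apply per-model overrides (layer count, attention layout, hidden/FFN sizes, normalization) for the given vision_model_type and return the updated config."""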
if config.vision_model_type == "clip":
config.num_layers = 24
config.num_attention_heads = 16
config.add_bias_linear = True
config.add_qkv_bias = True
config.hidden_size = 1024
config.hidden_dropout = 0.0
config.attention_dropout = 0.0
config.ffn_hidden_size = 4096
config.gated_linear_unit = False
config.activation_func = quick_gelu
config.kv_channels = 64
config.num_query_groups = 16
config.layernorm_zero_centered_gamma = False
config.apply_query_key_layer_scaling = apply_query_key_layer_scaling
config.bias_activation_fusion = False
config.bias_dropout_fusion = False
config.attention_softmax_in_fp32 = True
config.normalization = 'LayerNorm'
config.apply_rope_fusion = False
elif config.vision_model_type == "siglip":
config.num_layers = 27
config.num_attention_heads = 16
config.add_bias_linear = True
config.add_qkv_bias = True
config.hidden_size = 1152
config.hidden_dropout = 0.0
config.attention_dropout = 0.0
config.ffn_hidden_size = 4304
config.gated_linear_unit = False
config.activation_func = fast_gelu
config.kv_channels = 72
config.num_query_groups = 16
config.layernorm_zero_centered_gamma = False
config.apply_query_key_layer_scaling = apply_query_key_layer_scaling
config.bias_activation_fusion = False
config.bias_dropout_fusion = False
config.attention_softmax_in_fp32 = True
config.normalization = 'LayerNorm'
config.apply_rope_fusion = False
config.qk_layernorm = False
config.layernorm_epsilon = 1e-6
elif config.vision_model_type == "internvit":
config.num_layers = 45
config.num_attention_heads = ((24 // config.tensor_model_parallel_size) + 1) * config.tensor_model_parallel_size
config.num_query_groups = config.num_attention_heads
config.add_bias_linear = True
config.add_qkv_bias = False
config.hidden_size = 3200
config.hidden_dropout = 0.0
config.attention_dropout = 0.0
config.ffn_hidden_size = 12800
config.gated_linear_unit = False
config.activation_func = torch.nn.functional.gelu
config.layernorm_zero_centered_gamma = False
config.apply_query_key_layer_scaling = apply_query_key_layer_scaling
config.bias_activation_fusion = False
config.bias_dropout_fusion = False
config.attention_softmax_in_fp32 = True
config.normalization = 'RMSNorm'
config.layernorm_epsilon = 1e-6
config.apply_rope_fusion = False
elif config.vision_model_type == "radio":
config.num_layers = 32
config.num_attention_heads = 16
config.add_bias_linear = True
config.add_qkv_bias = True
config.hidden_size = 1280
config.ffn_hidden_size = 5120
config.gated_linear_unit = False
config.activation_func = fast_gelu
config.kv_channels = 80
config.num_query_groups = 16
config.layernorm_zero_centered_gamma = False
config.apply_query_key_layer_scaling = apply_query_key_layer_scaling
config.bias_activation_fusion = False
config.bias_dropout_fusion = False
config.attention_softmax_in_fp32 = True
config.normalization = 'LayerNorm'
config.apply_rope_fusion = False
config.qk_layernorm = False
config.layernorm_epsilon = 1e-6
elif config.vision_model_type.startswith("huggingface"):
# Loaded from HuggingFace config file.
pass
else:
raise ValueError(f"unknown vision model type {config.vision_model_type}")
return config
def get_vision_projection_config(config, hidden_size):
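"""Configure the vision projection MLP; hidden_size is the projection output size and must match the language model input size."""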
config.gated_linear_unit = False
config.bias_activation_fusion = False
config.add_bias_linear = False
config.hidden_size = hidden_size # Used as the vision projection output size, i.e., the input to the language model.
if config.language_model_type == "llama3_8b":
config.ffn_hidden_size = 14336
config.activation_func = torch.nn.functional.gelu
elif config.language_model_type == "llama3.1_8b":
config.ffn_hidden_size = 4096
config.activation_func = torch.nn.functional.gelu
config.layernorm_epsilon = 1e-5
config.add_bias_linear = True
config.normalization = "LayerNorm"
elif config.language_model_type == "mistral_7b":
config.ffn_hidden_size = 14336
config.activation_func = torch.nn.functional.gelu
config.normalization = None
elif config.language_model_type == "yi-34b":
config.ffn_hidden_size = 20480
config.normalization = "LayerNorm"
config.activation_func = torch.nn.functional.gelu
elif config.language_model_type == "qwen2.5_7B":
config.ffn_hidden_size = 3584
config.activation_func = torch.nn.functional.gelu
elif config.language_model_type == "qwen2.0_72B":
config.ffn_hidden_size = 29568
config.normalization = "LayerNorm"
config.activation_func = torch.nn.functional.gelu
elif config.language_model_type == "llama3.2_1b":
config.ffn_hidden_size = 2048
config.activation_func = torch.nn.functional.gelu
config.normalization = "LayerNorm"
elif config.language_model_type.startswith("huggingface"):
config.activation_func = torch.nn.functional.gelu
from transformers import AutoConfig
hf_config = AutoConfig.from_pretrained(config.huggingface_model_name_or_path)
if "qwen" in hf_config.model_type:
config.ffn_hidden_size = 1536
else:
raise ValueError(f"unknown language model type {config.language_model_type}")
return config
@dataclass
class EvaluationConfig:
"""Evaluation related configuration."""
task: str
temperature: float = 1.0
top_p: float = 0.0
top_k: int = 0
out_seq_length: int = 32
output_path: str = ""
input_image_path: str = ""
gt_path: str = ""
num_partitions: int = 1
partition_id: int = 0
num_samples_per_partition: int = 0
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import bisect
import dataclasses
import json
import re
import sys
import traceback
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple, Union
from image_processing import get_visual_transform
from PIL import Image
from torchvision.transforms import ToPILImage
import numpy as np
import torch
from megatron.core.models.multimodal.llava_model import IGNORE_INDEX, IMAGE_TOKEN, VIDEO_TOKEN
from megatron.core.models.vision.clip_vit_model import get_num_image_embeddings
from megatron.energon import (
Batch,
CaptioningSample,
DefaultTaskEncoder,
OCRSample,
Sample,
SimilarityInterleavedSample,
VQASample,
MultiChoiceVQASample
)
from megatron.energon.task_encoder.base import stateless
from megatron.training import get_args, get_tokenizer
@dataclass
class ImageTaskSample(Sample):
__key__: str
__restore_key__: Tuple[Union[str, int, tuple], ...]
__subflavor__: Dict
__subflavors__: Dict
# (c, h, w)
imgs: List[torch.Tensor]
num_tiles: List[int]
tokens: torch.Tensor
total_len: int # Total token count in the sample, including text and image tokens
labels: torch.Tensor = None
@dataclass
class ImageTaskSamplePacked(Sample):
"""Dataclass to store a single packed sample (not a batch).
P = Number of sub-samples in the packed sample
seq_len = Total sequence length
num_imgs = Number of images across all samples in the packed sample
"""
__key__: str # Sample name
__restore_key__: Tuple[Union[str, int, tuple], ...]
__subflavor__: Dict # Sample metadata. Deprecated.
__subflavors__: Dict # Sample metadata.
tokens: torch.Tensor # Input tokens packed into a single tensor (seq_len,)
labels: torch.Tensor # Target tokens packed into a single tensor (seq_len,)
imgs: List[torch.Tensor] # Input images
num_tiles: List[int] # Number of tiles for each image of each sample (num_imgs)
max_length: int # Maximum length across sub-samples.
cu_lengths: List[int] # Cumulative length of each sub-sample in this packed sample incl. text and image tokens (P,)
# Typing for the resulting batch data after encode_batch()
@dataclass
class ImageTaskBatchPacked(Batch):
"""Dataclass to store a batch of packed samples.
N = Batch size
P = Number of samples in the packed sample
seq_len = Maximum sequence length
num_imgs = Number of images across all samples in the packed sample
"""
__key__: List[str] # Sample names
__restore_key__: Tuple[Union[str, int, tuple], ...]
__subflavor__: Dict # Sample metadata. Deprecated.
__subflavors__: List[Dict] # Sample metadatas.
tokens: torch.Tensor # Input tokens packed and padded (N, seq_len)
labels: torch.Tensor # Target tokens packed and padded (N, seq_len)
imgs: torch.Tensor # All image tiles stacked into a single tensor (num_tiles, C, H, W)
num_tiles: List[List[int]] # Number of tiles per image (N, num_imgs)
max_lengths: List[int] # Maximum length across sub-samples (N,)
cu_lengths: List[List[int]] # Cumulative length of each sub-sample in each packed sample of the batch (N, P)
# Based on https://github.com/hiyouga/LLaMA-Factory/blob/641d0dab08d96a93c34657742213d8994d9ed476/src/llamafactory/data/processors/processor_utils.py#L19
# Copyright (c) 2024 LLaMA-Factory. Apache license 2.0.
def search_for_fit(numbers: List[int], capacity: int) -> int:
"""Finds the index of largest number that fits into the knapsack with the given capacity."""
index = bisect.bisect(numbers, capacity)
return -1 if index == 0 else (index - 1)
# Based on https://github.com/hiyouga/LLaMA-Factory/blob/641d0dab08d96a93c34657742213d8994d9ed476/src/llamafactory/data/processors/processor_utils.py#L27
# Copyright (c) 2024 LLaMA-Factory. Apache license 2.0.
def greedy_knapsack(item_sizes: List[int], samples: List, max_capacity: int) -> List:
"""Greedy algorithm with binary search for the knapsack problem.
Pack as many samples as possible given a maximum capacity and capacities of individual samples.
Used if sequence packing is enabled.
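Illustrative example: greedy_knapsack([3, 1, 2], ["a", "b", "c"], 4) returns [["a", "b"], ["c"]].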
"""
assert len(item_sizes) == len(samples), "sample lengths and samples must have the same length."
knapsacks = []
if len(item_sizes) == 0:
return knapsacks
# Sort sample lengths and samples together.
sorted_item_sizes, sorted_samples = zip(*sorted(zip(item_sizes, samples), key=lambda x: x[0]))
sorted_item_sizes = list(sorted_item_sizes)
sorted_samples = list(sorted_samples)
# Check if all samples fit in the knapsack capacity.
if sorted_item_sizes[-1] > max_capacity:
raise ValueError(f"knapsack: A sample is larger {sorted_item_sizes[-1]} than the max_sequence_length {max_capacity}.")
while sorted_item_sizes:
current_knapsack = []
remaining_capacity = max_capacity
while True:
idx = search_for_fit(sorted_item_sizes, remaining_capacity)
if idx == -1:
break # Can't fit more samples.
remaining_capacity -= sorted_item_sizes[idx]
sorted_item_sizes.pop(idx)
sample = sorted_samples.pop(idx)
current_knapsack.append(sample)
knapsacks.append(current_knapsack)
return knapsacks
class TaskEncoder(DefaultTaskEncoder[OCRSample, OCRSample, ImageTaskBatchPacked, dict]):
"""A simple task encoder for VLMs."""
def __init__(
self
):
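"""Initialize the task encoder from the global Megatron args, tokenizer, and manual prompt templates."""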
super().__init__()
self.args = get_args()
self.tokenizer = get_tokenizer()
with open(self.args.prompt_path, "r") as f:
self.manual_prompts = json.load(f)
self.dataloader_seq_length = self.args.dataloader_seq_length # Always return samples of this length.
self.packing_seq_length = self.args.packing_seq_length # Packing sequence length, if packing is enabled.
self.is_packing_enabled = self.args.packing_buffer_size is not None and self.args.packing_buffer_size > 0
if self.dataloader_seq_length and self.packing_seq_length:
assert self.dataloader_seq_length >= self.packing_seq_length, "dataloader sequence length must be greater than or equal to the packing sequence length"
if self.is_packing_enabled:
assert self.packing_seq_length > 0, "packing sequence length must be set"
self.num_image_embeddings_per_tile = get_num_image_embeddings(
self.args.img_h,
self.args.img_w,
self.args.patch_dim,
self.args.vision_model_type,
self.args.disable_vision_class_token,
1,
self.args.pixel_shuffle,
self.args.use_tile_tags,
)
self.txt_to_token_dict = {}
self.img_h, self.img_w = self.args.img_h, self.args.img_w
# This map is used to reduce the number of tiles used per image if the number of tokens is
# larger than the decoder_seq_length.
self.num_tiles_degradation_map = {12:8, 8:6, 6:4, 4:2, 2:1, 1:1}
def _get_total_seq_length(self, input_ids, num_tiles):
"""Calculate expected sequence length given text tokens length and number of tiles."""
total_num_images = len(num_tiles)
total_num_tiles = sum(num_tiles)
total_len = len(input_ids) + total_num_tiles * self.num_image_embeddings_per_tile - total_num_images
return total_len
def _truncate_for_packing(self, input_ids, target, num_tiles):
"""Truncate tokens and labels if they exceed packing sequence length."""
total_num_images = len(num_tiles)
total_num_tiles = sum(num_tiles)
total_img_embeddings_len = total_num_tiles * self.num_image_embeddings_per_tile
max_text_tokens = self.packing_seq_length - total_img_embeddings_len + total_num_images
input_ids = input_ids[:max_text_tokens]
target = target[:max_text_tokens]
# If truncation causes all labels to be ignored, then skip the sample
if (target == IGNORE_INDEX).all():
raise ValueError(f"all targets will be ignored after truncation: {input_ids}")
return input_ids, target
@stateless(restore_seeds=True)
def encode_sample(self, sample: Union[CaptioningSample, OCRSample, VQASample, SimilarityInterleavedSample]):
if isinstance(sample, OCRSample):
if "pdfa" in sample.__key__:
yield self.combined_ocr_encoder(sample, task_type='encode_pdf')
elif "multi" in sample.__key__:
yield self.combined_ocr_encoder(sample, task_type='_encode_ocr')
else:
yield self.combined_ocr_encoder(sample, task_type='encode_ocr_ref')
elif isinstance(sample, CaptioningSample):
yield self.encode_captioning(sample)
elif isinstance(sample, VQASample):
is_llava_training = sample.__subflavors__["is_llava_training"] if "is_llava_training" in sample.__subflavors__ else False
if "llava" in sample.__key__ or is_llava_training:
yield self.encode_llava_pretrain(sample)
else:
yield self.encode_any_single_turn_vqa(sample)
elif isinstance(sample, SimilarityInterleavedSample):
yield self.encode_llava_sft(sample)
elif isinstance(sample, MultiChoiceVQASample):
yield self.encode_any_single_turn_vqa(sample)
else:
raise NotImplementedError("Sample format not supported", sample)
def encode_captioning(self, sample: CaptioningSample):
"""Encode CaptioningSample."""
augment = sample.__subflavors__.get("augmentation")
imgs = get_visual_transform(
sample.image, self.img_h, self.img_w, self.args.use_tiling, self.args.max_num_tiles, self.args.use_thumbnail, augment,
self.args.vision_model_type,
)
num_tiles = [len(imgs)]
prompt_list = self.manual_prompts["CaptioningPretraining"]["raw"]
prompt_idx = np.random.randint(len(prompt_list))
cur_prompt = prompt_list[prompt_idx]
cur_prompt = IMAGE_TOKEN + "\n" + cur_prompt + "\n"
caption = sample.caption.strip()
split_by_line_flag = sample.__subflavors__.get("SplitByLine")
if split_by_line_flag:
caption_list = caption.split('\n')
caption = np.random.choice(caption_list)
conv = [
# Note: no system message.
{"role": "user", "content": cur_prompt},
{"role": "assistant", "content": caption},
]
input_ids, target = self.tokenizer.tokenize_conversation(conv, True, False)
if self.is_packing_enabled:
input_ids, target = self._truncate_for_packing(input_ids, target, num_tiles)
return ImageTaskSample(
__key__=sample.__key__,
__restore_key__=sample.__restore_key__,
__subflavor__=None,
__subflavors__=sample.__subflavors__,
imgs=imgs,
num_tiles=num_tiles,
tokens=torch.tensor(input_ids),
labels=torch.tensor(target),
total_len=self._get_total_seq_length(input_ids, num_tiles),
)
def encode_llava_pretrain(self, sample: VQASample):
"""Encode pretrain sample in LLAVA style."""
augment = sample.__subflavors__.get("augmentation", False)
imgs = get_visual_transform(
sample.image, self.img_h, self.img_w, self.args.use_tiling, self.args.max_num_tiles, self.args.use_thumbnail, augment,
self.args.vision_model_type,
)
num_tiles = [len(imgs)]
# LLAVA training: override text-prompt with just the image.
conv = [
# Note: no system message.
{"role": "user", "content": IMAGE_TOKEN + "\n"},
{"role": "assistant", "content": sample.answers},
]
input_ids, target = self.tokenizer.tokenize_conversation(conv, True, False)
if self.is_packing_enabled:
input_ids, target = self._truncate_for_packing(input_ids, target, num_tiles)
return ImageTaskSample(
__key__=sample.__key__,
__restore_key__=sample.__restore_key__,
__subflavor__=None,
__subflavors__=sample.__subflavors__,
imgs=imgs,
num_tiles=num_tiles,
tokens=torch.tensor(input_ids),
labels=torch.tensor(target),
total_len=self._get_total_seq_length(input_ids, num_tiles),
)
def encode_llava_sft(self, sample: SimilarityInterleavedSample):
"""Encode SFT sample."""
augment = sample.__subflavors__['augmentation'] if 'augmentation' in sample.__subflavors__ else False
has_video = sample.__subflavors__['has_video'] if 'has_video' in sample.__subflavors__ else False
has_image = False
if hasattr(sample, "images"):
# If this is a text-only sample and we are freezing the LM,
# then use a dummy input image.
if len(sample.images) == 0 and self.args.freeze_LM:
empty_img = Image.new('RGB', (self.args.img_w, self.args.img_h), (255, 255, 255))
sample.images.append(empty_img)
if len(sample.images) > 0 and not has_video:
has_image = True
# Note: Some tokenizers may ignore the system prompt.
conversation = [{"role": "system", "content": "Answer the questions."}]
# Format the conversation as a list of "user" / "assistant" turns.
for text in sample.texts:
error_msg = f"unexpected role {text['from']} in {sample.texts}"
assert text["from"] in ["human", "gpt"], error_msg
conversation.append({
"role": "user" if text["from"] == "human" else "assistant",
"content": text["value"]})
# Replace the image tags <image-idx> with IMAGE_TOKEN and count the number of image tags
number_image_tags = 0
image_tag_ids_list = []
for turn in conversation:
if turn["role"] == "user":
image_tag_ids = [int(x) - 1 for x in re.findall(r"<image-(\d+)>", turn["content"])]
image_tag_ids_list.extend(image_tag_ids)
turn["content"] = re.sub(r"<image-\d+>", IMAGE_TOKEN, turn["content"])
number_image_tags += turn["content"].count(IMAGE_TOKEN)
# For videos, we replace the image tag with the video tag
if has_video:
turn["content"] = turn["content"].replace(IMAGE_TOKEN, VIDEO_TOKEN)
# We re-order the images in sample.images according to how they appear in the conversation.
if len(image_tag_ids_list) > 0:
sample.images = [sample.images[idx] for idx in image_tag_ids_list]
# If there is only one image, but several image tags, we assume all the tags refer to the
# same image and duplicate the image:
if len(sample.images) == 1 and number_image_tags > 1:
sample.images = sample.images * number_image_tags
number_of_images = len(sample.images)
# Fail if there are more image or video tags than images or videos:
error_msg = (
f"Found {number_image_tags} image tags for {number_of_images} images. {sample.texts}")
assert number_image_tags <= number_of_images, error_msg
# If there are fewer image or video tags than images or videos, prepend the missing tags to the first
# user message:
if number_image_tags < number_of_images:
for turn in conversation:
if turn["role"] == "user":
tag_to_add = VIDEO_TOKEN if has_video else IMAGE_TOKEN
turn["content"] = tag_to_add*(number_of_images-number_image_tags) + "\n" + turn["content"]
break
input_ids, target = self.tokenizer.tokenize_conversation(conversation, True, False)
if has_image:
imgs = []
num_tiles = []
max_num_tiles = self.args.max_num_tiles
# We keep a buffer of 4 tokens for the question,
# the rest can be used for image tokens.
max_image_token_allowed = self.args.decoder_seq_length - len(input_ids) - 4
# We start by extracting as many tiles per image as possible, and decrease the max
# number of tiles if there are too many image tokens.
while True:
imgs = []
num_tiles = []
for img in sample.images:
img_tiles = get_visual_transform(
img, self.img_h, self.img_w, self.args.use_tiling, max_num_tiles,
self.args.use_thumbnail, augment, self.args.vision_model_type)
imgs += img_tiles
num_tiles += [len(img_tiles)]
if max_num_tiles == 1:
break
if sum(num_tiles) * self.num_image_embeddings_per_tile > max_image_token_allowed:
if max_num_tiles in self.num_tiles_degradation_map:
max_num_tiles = self.num_tiles_degradation_map[max_num_tiles]
else:
raise RuntimeError((
f"Tried to decrease the number of tiles {max_num_tiles} but it's not ",
f"defined in the degradation map {self.num_tiles_degradation_map}"))
else:
break
elif has_video:
# We don't use tiling for videos to limit the number of tokens.
use_tiling=False
# Grab the selected frames of the video as a tensor with shape
# fchw: (num_frames, num_channels, height, width).
video_fchw = sample.images[0].permute(0, 1, 2, 3)
selected_frames = torch.linspace(
0, video_fchw.shape[0] - 1, self.args.num_frames).long()
video_fchw = video_fchw[selected_frames]
imgs = []
for video_chw in video_fchw:
to_pil = ToPILImage()
video_chw = to_pil(video_chw)
imgs += get_visual_transform(
video_chw, self.img_h, self.img_w, use_tiling, self.args.max_num_tiles,
self.args.use_thumbnail, augment, self.args.vision_model_type)
num_tiles = [len(imgs)]
else:
imgs = num_tiles = []
if self.is_packing_enabled:
input_ids, target = self._truncate_for_packing(input_ids, target, num_tiles)
# Some final checks with respect to the number of image tokens and images on the tokenized
# conversation. There can still be errors, for instance if a non-video sample happens to
# have our pre-defined video token, or if the packing truncation removed a necessary image
# tag.
number_image_token = np.sum(input_ids == self.img_token_id)
error_msg = (
f"Found {number_image_token} image tokens for len({num_tiles}) = {len(num_tiles)} image tiles in {conversation}.")
assert number_image_token == len(num_tiles), error_msg
error_msg = (
f"Found sum({num_tiles}) = {np.sum(num_tiles)} tiles for {len(imgs)} images in {conversation}.")
assert np.sum(num_tiles) == len(imgs), error_msg
return ImageTaskSample(
__key__=sample.__key__,
__restore_key__=sample.__restore_key__,
__subflavor__=None,
__subflavors__=sample.__subflavors__,
imgs=imgs,
num_tiles=num_tiles,
tokens=torch.tensor(input_ids),
labels=torch.tensor(target),
total_len=self._get_total_seq_length(input_ids, num_tiles),
)
def encode_any_single_turn_vqa(self, sample):
"""Encode MultiChoiceVQA or VQA sample."""
augment = sample.__subflavors__['augmentation'] if 'augmentation' in sample.__subflavors__ else False
has_video = sample.__subflavors__['has_video'] if 'has_video' in sample.__subflavors__ else False
if has_video:
# Grab the selected frames of the video as a tensor with shape
# fhwc: (num_frames, height, width, num_channels).
video_fhwc = sample.image.permute(0, 2, 3, 1)
selected_frames = torch.linspace(
0, video_fhwc.shape[0] - 1, self.args.num_frames).long()
video_frame_fhwc = video_fhwc[selected_frames]
imgs = []
for video_frame_hwc in video_frame_fhwc:
imgs += get_visual_transform(
video_frame_hwc, self.img_h, self.img_w,
self.args.use_tiling, self.args.max_num_tiles,
self.args.use_thumbnail, augment, self.args.vision_model_type)
else:
imgs = get_visual_transform(
sample.image, self.img_h, self.img_w, self.args.use_tiling, self.args.max_num_tiles,
self.args.use_thumbnail, augment, self.args.vision_model_type,
)
num_tiles = [len(imgs)]
if isinstance(sample, MultiChoiceVQASample):
cur_prompt = format_multichoice_question(sample.context, sample.choices)
if IMAGE_TOKEN not in cur_prompt:
cur_prompt = IMAGE_TOKEN + "\n" + cur_prompt
cur_answer = format_multichoice_answer(sample.correct_choice_idx)
elif isinstance(sample, VQASample):
if 'docvqa' in sample.__key__:
prompt_list = self.manual_prompts["VQASFT"]["docvqa"]
elif sample.__subflavors__.get("VQASFT"):
prompt_list = self.manual_prompts["VQASFT"]["raw"]
else:
prompt_list = ["{}"]
prompt_idx = np.random.randint(len(prompt_list))
cur_prompt = prompt_list[prompt_idx]
cur_prompt = cur_prompt.format(sample.context)
if IMAGE_TOKEN not in cur_prompt:
cur_prompt = IMAGE_TOKEN + "\n" + cur_prompt
if isinstance(sample.answers, list):
answer_list = sample.answers
weight_list = np.array(sample.answer_weights).astype(np.float32)
weight_list = weight_list / np.sum(weight_list)
answer_idx = np.random.choice(weight_list.shape[0], 1, p=weight_list)[0]
cur_answer = answer_list[answer_idx]
else:
cur_answer = sample.answers
else:
raise NotImplementedError("Unsupported data type provided", sample)
conversation = [
{"role": "system", "content": "Answer the questions."},
{"role": "user", "content": cur_prompt},
{"role": "assistant", "content": str(cur_answer)},
]
input_ids, target = self.tokenizer.tokenize_conversation(conversation, True, False)
if self.is_packing_enabled:
input_ids, target = self._truncate_for_packing(input_ids, target, num_tiles)
return ImageTaskSample(
__key__=sample.__key__,
__restore_key__=sample.__restore_key__,
__subflavor__=None,
__subflavors__=sample.__subflavors__,
imgs=imgs,
num_tiles=num_tiles,
tokens=torch.tensor(input_ids),
labels=torch.tensor(target),
total_len=self._get_total_seq_length(input_ids, num_tiles),
)
def combined_ocr_encoder(self, sample, task_type):
"""Encode OCR samples."""
augment = sample.__subflavors__['augmentation'] if 'augmentation' in sample.__subflavors__ else False
if task_type == "encode_pdf":
sample, cur_prompt, cur_answer = self.encode_pdf_prompt(sample)
elif task_type == "encode_ocr_ref":
sample, cur_prompt, cur_answer = self.encode_ocr_ref_prompt(sample)
elif task_type == "_encode_ocr":
sample, cur_prompt, cur_answer = self.encode_ocr_prompt(sample)
imgs = get_visual_transform(
sample.image, self.img_h, self.img_w, self.args.use_tiling, self.args.max_num_tiles,
self.args.use_thumbnail, augment, self.args.vision_model_type,
)
num_tiles = [len(imgs)]
conversation = [
{"role": "system", "content": "Answer the questions."},
{"role": "user", "content": cur_prompt},
{"role": "assistant", "content": str(cur_answer)},
]
input_ids, target = self.tokenizer.tokenize_conversation(conversation, True, False)
if self.is_packing_enabled:
input_ids, target = self._truncate_for_packing(input_ids, target, num_tiles)
return ImageTaskSample(
__key__=sample.__key__,
__restore_key__=sample.__restore_key__,
__subflavor__=None,
__subflavors__=sample.__subflavors__,
imgs=imgs,
num_tiles=num_tiles,
tokens=torch.tensor(input_ids),
labels=torch.tensor(target),
total_len=self._get_total_seq_length(input_ids, num_tiles),
)
def encode_pdf_prompt(self, sample: OCRSample) -> ImageTaskSample:
"""Encode OCR sample."""
prompt_list = self.manual_prompts["DocPretraining"]["raw"]
prompt_idx = np.random.randint(len(prompt_list))
cur_prompt = prompt_list[prompt_idx]
if IMAGE_TOKEN not in cur_prompt:
cur_prompt = IMAGE_TOKEN + "\n" + cur_prompt
# Make sure there is no extra IMAGE_TOKEN tag.
sample.text = sample.text.replace(IMAGE_TOKEN, "")
caption = sample.text.strip()
split_by_line_flag = sample.__subflavors__.get("SplitByLine")
if split_by_line_flag:
caption_list = caption.split('\n')
caption = np.random.choice(caption_list)
cur_answer = caption
return sample, cur_prompt, cur_answer
def encode_ocr_ref_prompt(self, sample: OCRSample) -> ImageTaskSample:
"""Encode OCR sample."""
ref = sample.text
region = sample.words_boxes
# Make sure there is no extra IMAGE_TOKEN tag
ref = ref.replace(IMAGE_TOKEN, "")
if len(region) == 4:
region = f"<box>({region[0]},{region[1]}),({region[2]},{region[3]})</box>"
else:
region = f"<quad>({region[0]},{region[1]}),({region[2]},{region[3]}),({region[4]},{region[5]}),({region[6]},{region[7]})</quad>"
# Randomly choose between two tasks
task_idx = np.random.randint(2)
if task_idx == 0:
# Referring Grounding
prompt_list = self.manual_prompts["DocPretraining"]["referring_grounding"]
prompt_content = ref
answer = region
else:
# Grounded OCR
prompt_list = self.manual_prompts["DocPretraining"]["grounded_ocr"]
prompt_content = region
answer = ref
prompt_idx = np.random.randint(len(prompt_list))
cur_prompt = prompt_list[prompt_idx]
cur_prompt = cur_prompt.format(prompt_content)
if IMAGE_TOKEN not in cur_prompt:
cur_prompt = IMAGE_TOKEN + "\n" + cur_prompt
return sample, cur_prompt, answer
def bbox_coord_to_label(self, text, bbox):
"""Format bbox coordinates as text."""
assert len(bbox) == 4 or len(bbox) == 8
# Make sure there is no extra IMAGE_TOKEN tag
text = text.replace(IMAGE_TOKEN, "")
if len(bbox) == 4:
label_str = f"<ref>{text}</ref><box>({bbox[0]},{bbox[1]}),({bbox[2]},{bbox[3]})</box>"
else:
label_str = f"<ref>{text}</ref><quad>({bbox[0]},{bbox[1]}),({bbox[2]},{bbox[3]}),({bbox[4]},{bbox[5]}),({bbox[6]},{bbox[7]})</quad>"
return label_str
def encode_ocr_prompt(self, sample: OCRSample) -> ImageTaskSample:
"""Encode OCR sample."""
if isinstance(sample.words_boxes[0], int):
answer = self.bbox_coord_to_label(sample.text, sample.words_boxes)
elif isinstance(sample.words_boxes[0], list):
answer = ""
for i, bbox in enumerate(sample.words_boxes):
answer += self.bbox_coord_to_label(sample.words_text[i], bbox)
prompt_list = self.manual_prompts["DocPretraining"]["ocr_multi"]
prompt_idx = np.random.randint(len(prompt_list))
cur_prompt = prompt_list[prompt_idx]
if IMAGE_TOKEN not in cur_prompt:
cur_prompt = IMAGE_TOKEN + "\n" + cur_prompt
cur_answer = answer
return sample, cur_prompt, cur_answer
def batch(self, samples: List[Union[ImageTaskSample, ImageTaskSamplePacked]]) -> ImageTaskBatchPacked:
# Stack images to [num_tiles, c, h, w]. If there are no images (text-only), then use a dummy image.
imgs = [img for s in samples for img in s.imgs]
if len(imgs) > 0:
imgs = torch.stack(imgs)
else:
imgs = torch.tensor([[0]], dtype=torch.float32)
# If the user hasn't defined a target dataloader sequence length, then use the max along the sample lengths.
max_seq_len = self.dataloader_seq_length
if not max_seq_len:
max_seq_len = max(len(s.tokens) for s in samples)
tokens = np.full((len(samples), max_seq_len), self.tokenizer.pad, dtype=np.int64)
# +1 to accommodate shift to left by one later.
labels = np.full((len(samples), max_seq_len + 1), self.tokenizer.pad, dtype=np.int64)
for i, s in enumerate(samples):
# If the sample/target length exceeds the target sequence length, then truncate.
text_len = min(max_seq_len, len(s.tokens))
target_len = min(max_seq_len+1, len(s.labels))
tokens[i, :text_len] = s.tokens[:text_len]
labels[i, :target_len] = s.labels[:target_len]
num_tiles = torch.tensor([n for s in samples for n in s.num_tiles], dtype=torch.int32)
if len(num_tiles) == 0:
num_tiles = torch.tensor([[0]], dtype=torch.int32)
# Cumulative sample lengths are needed for packing, otherwise use dummy values.
cu_lengths = torch.tensor([[0]], dtype=torch.int32)
max_lengths = torch.tensor([[0]], dtype=torch.int32)
if self.is_packing_enabled:
cu_lengths = torch.stack([s.cu_lengths for s in samples])
max_lengths = torch.tensor([s.max_length for s in samples], dtype=torch.int32)
return ImageTaskBatchPacked(
__key__=[s.__key__ for s in samples],
__restore_key__=[s.__restore_key__ for s in samples],
__subflavor__=None,
__subflavors__=samples[0].__subflavors__,
tokens=tokens,
labels=labels,
imgs=imgs,
num_tiles=num_tiles,
cu_lengths=cu_lengths,
max_lengths=max_lengths,
)
def encode_batch(self, batch: ImageTaskBatchPacked) -> dict:
raw = dataclasses.asdict(batch)
del raw["__subflavors__"]
return raw
def select_samples_to_pack(self, samples: List[ImageTaskSample]) -> List[List[ImageTaskSample]]:
"""Selects which samples will be packed together.
NOTE: Energon dataloader calls this method internally if packing is used.
Please see https://nvidia.github.io/Megatron-Energon/packing.html
"""
lengths = [sample.total_len for sample in samples]
packed_samples = greedy_knapsack(lengths, samples, self.packing_seq_length)
return packed_samples
@stateless
def pack_selected_samples(self, samples: List[ImageTaskSample]) -> List[ImageTaskSamplePacked]:
"""
Function to pack a list of ImageTaskSample into a single ImageTaskSamplePacked.
NOTE: Energon dataloader calls this method internally if packing is used.
Please see https://nvidia.github.io/Megatron-Energon/packing.html
Args:
samples: List of ImageTaskSample instances to pack into one sample.
Returns:
ImageTaskSamplePacked instance.
"""
packing_seq_len = self.packing_seq_length
packed_tokens = []
packed_labels = []
packed_imgs = []
current_length = 0
max_length = 0
cu_lengths = [0]
# Process each sample and build lists that we will concatenate to create the packed sample.
for _, sample in enumerate(samples):
sample_len = sample.total_len
if sample_len > max_length:
max_length = sample_len
# If adding this sample exceeds the max length, stop.
# This should not happen. The select_samples_to_pack method should have already ensured that the samples fit.
if current_length + sample_len > packing_seq_len:
raise ValueError(f"Packed sample exceeds the maximum sequence length of {packing_seq_len}: {samples}")
# Add the sample's tokens and labels
packed_tokens.append(sample.tokens)
packed_labels.append(sample.labels)
# Add the images
packed_imgs += sample.imgs
current_length += sample_len
cu_lengths.append(current_length)
# Concatenate packed tokens and labels.
packed_tokens = torch.cat(packed_tokens, dim=0)
packed_labels = torch.cat(packed_labels, dim=0)
return ImageTaskSamplePacked(
__key__=",".join([s.__key__ for s in samples]),
__restore_key__=(), # Will be set by energon based on `samples`
__subflavor__=None,
__subflavors__=samples[0].__subflavors__,
tokens=packed_tokens,
labels=packed_labels,
imgs=packed_imgs,
cu_lengths=torch.tensor(cu_lengths, dtype=torch.int32),
max_length=max_length,
num_tiles=[n for s in samples for n in s.num_tiles],
)
def print_error_handler(exc: Exception, key: Optional[str]):
print(
f"The following exception occurred in the dataloader for sample {key} and is skipped",
file=sys.stderr,
)
traceback.print_exc()
def format_multichoice_question(question, multichoice_options):
"""Format multi-choice question."""
options_text = ["{}. {}\n".format(chr(ord('A') + i), option) for i, option in
zip(range(len(multichoice_options)), multichoice_options)]
options_text = "".join(options_text)
options_text = f"{options_text}Answer with the option's letter from the given choices directly."
return "{}\n{}".format(question, options_text)
def format_multichoice_answer(idx):
"""Format multi-choice answer."""
return chr(ord('A') + idx)
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import bisect
import dataclasses
import json
import re
import sys
import traceback
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple, Union
from image_processing import find_closest_aspect_ratio, find_closest_area_weighted_aspect_ratio, get_visual_transform
from PIL import Image
from torchvision.transforms import ToPILImage
import numpy as np
import torch
from energon_util import OfflineTargetAspectRatioSample, SampleListSample
from megatron.core.models.multimodal.llava_model import IGNORE_INDEX, IMAGE_TOKEN, VIDEO_TOKEN
from megatron.core.models.vision.clip_vit_model import get_num_image_embeddings
from megatron.energon import (
Batch,
CaptioningSample,
DefaultTaskEncoder,
OCRSample,
Sample,
SimilarityInterleavedSample,
VQASample,
MultiChoiceVQASample
)
from megatron.energon.task_encoder.base import stateless
from megatron.training import get_args, get_tokenizer
@dataclass
class ImageTaskSample(Sample):
__key__: str
__restore_key__: Tuple[Union[str, int, tuple], ...]
__subflavor__: Dict
__subflavors__: Dict
# (c, h, w)
imgs: List[torch.Tensor]
num_tiles: List[int]
tokens: torch.Tensor
total_len: int # Total token count in the sample, including text and image tokens
labels: torch.Tensor = None
@dataclass
class ImageTaskSamplePacked(Sample):
"""Dataclass to store a single packed sample (not a batch).
P = Number of sub-samples in the packed sample
seq_len = Total sequence length
num_imgs = Number of images across all samples in the packed sample
"""
__key__: str # Sample name
__restore_key__: Tuple[Union[str, int, tuple], ...]
__subflavor__: Dict # Sample metadata. Deprecated.
__subflavors__: Dict # Sample metadata.
tokens: torch.Tensor # Input tokens packed into a single tensor (seq_len,)
labels: torch.Tensor # Target tokens packed into a single tensor (seq_len,)
imgs: List[torch.Tensor] # Input images
num_tiles: List[int] # Number of tiles for each image of each sample (num_imgs)
max_length: int # Maximum length across sub-samples.
cu_lengths: List[int] # Cumulative length of each sub-sample in this packed sample incl. text and image tokens (P,)
# Typing for the resulting batch data after encode_batch()
@dataclass
class ImageTaskBatchPacked(Batch):
"""Dataclass to store a batch of packed samples.
N = Batch size
P = Number of samples in the packed sample
seq_len = Maximum sequence length
num_imgs = Number of images across all samples in the packed sample
"""
__key__: List[str] # Sample names
__restore_key__: Tuple[Union[str, int, tuple], ...]
__subflavor__: Dict # Sample metadata. Deprecated.
__subflavors__: List[Dict] # Sample metadatas.
tokens: torch.Tensor # Input tokens packed and padded (N, seq_len)
labels: torch.Tensor # Target tokens packed and padded (N, seq_len)
imgs: torch.Tensor # All image tiles stacked into a single tensor (num_tiles, C, H, W)
num_tiles: List[List[int]] # Number of tiles per image (N, num_imgs)
max_lengths: List[int] # Maximum length across sub-samples (N,)
cu_lengths: List[List[int]] # Cumulative length of each sub-sample in each packed sample of the batch (N, P)
# Based on https://github.com/hiyouga/LLaMA-Factory/blob/641d0dab08d96a93c34657742213d8994d9ed476/src/llamafactory/data/processors/processor_utils.py#L19
# Copyright (c) 2024 LLaMA-Factory. Apache license 2.0.
def search_for_fit(numbers: List[int], capacity: int) -> int:
"""Finds the index of largest number that fits into the knapsack with the given capacity."""
index = bisect.bisect(numbers, capacity)
return -1 if index == 0 else (index - 1)
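# Illustrative example (not part of the original file): `numbers` must already be sorted in
# ascending order for the bisect call to be valid.
#   >>> search_for_fit([2, 3, 5, 7], capacity=6)
#   2
# Index 2 points at 5, the largest value that still fits; -1 is returned when even the
# smallest value exceeds the capacity.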
# Based on https://github.com/hiyouga/LLaMA-Factory/blob/641d0dab08d96a93c34657742213d8994d9ed476/src/llamafactory/data/processors/processor_utils.py#L27
# Copyright (c) 2024 LLaMA-Factory. Apache license 2.0.
def greedy_knapsack(item_sizes: List[int], samples: List, max_capacity: int) -> List:
"""Greedy algorithm with binary search for the knapsack problem.
Pack as many samples as possible given a maximum capacity and capacities of individual samples.
Used if sequence packing is enabled.
"""
assert len(item_sizes) == len(samples), "sample lengths and samples must have the same length."
knapsacks = []
if len(item_sizes) == 0:
return knapsacks
# Sort sample lengths and samples together.
sorted_item_sizes, sorted_samples = zip(*sorted(zip(item_sizes, samples), key=lambda x: x[0]))
sorted_item_sizes = list(sorted_item_sizes)
sorted_samples = list(sorted_samples)
# Check if all samples fit in the knapsack capacity.
if sorted_item_sizes[-1] > max_capacity:
raise ValueError(f"knapsack: A sample is larger {sorted_item_sizes[-1]} than the max_sequence_length {max_capacity}.")
while sorted_item_sizes:
current_knapsack = []
remaining_capacity = max_capacity
while True:
idx = search_for_fit(sorted_item_sizes, remaining_capacity)
if idx == -1:
break # Can't fit more samples.
remaining_capacity -= sorted_item_sizes[idx]
sorted_item_sizes.pop(idx)
sample = sorted_samples.pop(idx)
current_knapsack.append(sample)
knapsacks.append(current_knapsack)
return knapsacks
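# Illustrative example (not part of the original file; sizes and samples are made up):
#   >>> greedy_knapsack([2, 3, 5, 7], ["a", "b", "c", "d"], max_capacity=8)
#   [['d'], ['c', 'b'], ['a']]
# Each returned bin keeps its total size (7, 8 and 2) within the capacity of 8.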
class TaskEncoder(DefaultTaskEncoder[OCRSample, OCRSample, ImageTaskBatchPacked, dict]):
"""A simple task encoder for VLMs."""
def __init__(self):
super().__init__()
self.args = get_args()
self.tokenizer = get_tokenizer()
with open(self.args.prompt_path, "r") as f:
self.manual_prompts = json.load(f)
self.dataloader_seq_length = self.args.dataloader_seq_length # Always return samples of this length.
self.packing_seq_length = self.args.packing_seq_length # Packing sequence length, if packing is enabled.
self.is_packing_enabled = self.args.packing_buffer_size is not None and self.args.packing_buffer_size > 0
if self.dataloader_seq_length and self.packing_seq_length:
assert self.dataloader_seq_length >= self.packing_seq_length, "dataloader sequence length must be greater than or equal to the packing sequence length"
if self.is_packing_enabled:
assert self.packing_seq_length > 0, "packing sequence length must be set"
self.num_image_embeddings_per_tile = get_num_image_embeddings(
self.args.img_h,
self.args.img_w,
self.args.patch_dim,
self.args.vision_model_type,
self.args.disable_vision_class_token,
1,
self.args.pixel_shuffle,
self.args.use_tile_tags,
)
self.txt_to_token_dict = {}
self.img_h, self.img_w = self.args.img_h, self.args.img_w
self.img_token_id = self.tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)
# This map is used to reduce the number of tiles used per image if the number of tokens is
# larger than the decoder_seq_length.
self.num_tiles_degradation_map = {12:8, 8:6, 6:4, 4:2, 2:1, 1:1}
self.find_closest_aspect_ratio_fn = (
find_closest_area_weighted_aspect_ratio if self.args.use_area_weighted_aspect_ratio
else find_closest_aspect_ratio)
def _get_total_seq_length(self, input_ids, num_tiles):
"""Calculate expected sequence length given text tokens length and number of tiles."""
total_num_images = len(num_tiles)
total_num_tiles = sum(num_tiles)
total_len = len(input_ids) + total_num_tiles * self.num_image_embeddings_per_tile - total_num_images
return total_len
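# Illustrative example (not part of the original file; 576 embeddings per tile is an
# assumed value): 100 text tokens plus one image split into 4 tiles give
# 100 + 4 * 576 - 1 = 2403 tokens, since the single IMAGE_TOKEN placeholder in the text
# is replaced by the tile embeddings.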
def _truncate_for_packing(self, input_ids, target, num_tiles):
"""Truncate tokens and labels if they exceed packing sequence length."""
total_num_images = len(num_tiles)
total_num_tiles = sum(num_tiles)
total_img_embeddings_len = total_num_tiles * self.num_image_embeddings_per_tile
max_text_tokens = self.packing_seq_length - total_img_embeddings_len + total_num_images
input_ids = input_ids[:max_text_tokens]
target = target[:max_text_tokens]
# If truncate causes all labels to be ignored, then skip the sample
if (target == IGNORE_INDEX).all():
raise ValueError(f"all targets will be ignored after truncation: {input_ids}")
return input_ids, target
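# Illustrative example (not part of the original file; 576 embeddings per tile is an
# assumed value): with packing_seq_length = 4096 and one image of 4 tiles, at most
# 4096 - 4 * 576 + 1 = 1793 text tokens are kept; the +1 accounts for the IMAGE_TOKEN
# placeholder that the tile embeddings replace.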
@stateless(restore_seeds=True)
def encode_sample(self, sample: Union[CaptioningSample, OCRSample, VQASample, SimilarityInterleavedSample]):
if isinstance(sample, OCRSample):
if "pdfa" in sample.__key__:
yield self.combined_ocr_encoder(sample, task_type='encode_pdf')
elif "multi" in sample.__key__:
yield self.combined_ocr_encoder(sample, task_type='_encode_ocr')
else:
yield self.combined_ocr_encoder(sample, task_type='encode_ocr_ref')
elif isinstance(sample, CaptioningSample):
yield self.encode_captioning(sample)
elif isinstance(sample, VQASample):
is_llava_training = sample.__subflavors__["is_llava_training"] if "is_llava_training" in sample.__subflavors__ else False
if "llava" in sample.__key__ or is_llava_training:
yield self.encode_llava_pretrain(sample)
else:
yield self.encode_any_single_turn_vqa(sample)
elif isinstance(sample, SimilarityInterleavedSample):
yield self.encode_llava_sft(sample)
elif isinstance(sample, MultiChoiceVQASample):
yield self.encode_any_single_turn_vqa(sample)
# Because the SampleListSample is defined in the Megatron module but loaded by the Energon
# library, we need to resort to the more brittle check:
elif type(sample).__name__ == "SampleListSample":
yield self.encode_sample_list(sample)
else:
raise NotImplementedError("Sample format not supported", sample)
def encode_captioning(self, sample: CaptioningSample):
"""Encode CaptioningSample."""
augment = sample.__subflavors__.get("augmentation")
imgs = get_visual_transform(
sample.image, self.img_h, self.img_w, self.args.use_tiling, self.args.max_num_tiles, self.args.use_thumbnail, augment,
self.args.vision_model_type, find_closest_aspect_ratio_fn=self.find_closest_aspect_ratio_fn
)
num_tiles = [len(imgs)]
prompt_list = self.manual_prompts["CaptioningPretraining"]["raw"]
prompt_idx = np.random.randint(len(prompt_list))
cur_prompt = prompt_list[prompt_idx]
cur_prompt = IMAGE_TOKEN + "\n" + cur_prompt + "\n"
caption = sample.caption.strip()
split_by_line_flag = sample.__subflavors__.get("SplitByLine")
if split_by_line_flag:
caption_list = caption.split('\n')
caption = np.random.choice(caption_list)
conv = [
# Note: no system message.
{"role": "user", "content": cur_prompt},
{"role": "assistant", "content": caption},
]
input_ids, target = self.tokenizer.tokenize_conversation(conv, True, False)
if self.is_packing_enabled:
input_ids, target = self._truncate_for_packing(input_ids, target, num_tiles)
return ImageTaskSample(
__key__=sample.__key__,
__restore_key__=sample.__restore_key__,
__subflavor__=None,
__subflavors__=sample.__subflavors__,
imgs=imgs,
num_tiles=num_tiles,
tokens=torch.tensor(input_ids),
labels=torch.tensor(target),
total_len=self._get_total_seq_length(input_ids, num_tiles),
)
def encode_llava_pretrain(self, sample: VQASample):
"""Encode pretrain sample in LLAVA style."""
augment = sample.__subflavors__.get("augmentation", False)
imgs = get_visual_transform(
sample.image, self.img_h, self.img_w, self.args.use_tiling, self.args.max_num_tiles, self.args.use_thumbnail, augment,
self.args.vision_model_type, find_closest_aspect_ratio_fn=self.find_closest_aspect_ratio_fn
)
num_tiles = [len(imgs)]
# LLAVA training: override text-prompt with just the image.
conv = [
# Note: no system message.
{"role": "user", "content": IMAGE_TOKEN + "\n"},
{"role": "assistant", "content": sample.answers},
]
input_ids, target = self.tokenizer.tokenize_conversation(conv, True, False)
if self.is_packing_enabled:
input_ids, target = self._truncate_for_packing(input_ids, target, num_tiles)
return ImageTaskSample(
__key__=sample.__key__,
__restore_key__=sample.__restore_key__,
__subflavor__=None,
__subflavors__=sample.__subflavors__,
imgs=imgs,
num_tiles=num_tiles,
tokens=torch.tensor(input_ids),
labels=torch.tensor(target),
total_len=self._get_total_seq_length(input_ids, num_tiles),
)
def encode_sample_list(self, samples: SampleListSample):
"""We encode the list of samples using encode_llava_sft on each sample."""
error_msg = ("You probably don't want to use online packing since SampleListSample is "
"usually used along offline packing.")
assert not self.is_packing_enabled, error_msg
encoded_samples = []
current_length = 0
for sample in samples.samples:
encoded_sample = self.encode_llava_sft(sample, truncate_for_sample_list_packing=True)
if current_length + encoded_sample.total_len > self.packing_seq_length:
break
else:
encoded_samples.append(encoded_sample)
current_length += encoded_sample.total_len
return self.pack_selected_samples(encoded_samples)
def encode_llava_sft(self, sample: Union[SimilarityInterleavedSample, OfflineTargetAspectRatioSample], truncate_for_sample_list_packing=False):
"""Encode SFT sample."""
augment = sample.__subflavors__['augmentation'] if 'augmentation' in sample.__subflavors__ else False
has_video = sample.__subflavors__['has_video'] if 'has_video' in sample.__subflavors__ else False
# If the target aspect ratio is provided by the dataset, we use it instead of computing
# it with the self.find_closest_aspect_ratio_fn function.
local_find_closest_aspect_ratio_fn = self.find_closest_aspect_ratio_fn
if type(sample).__name__ == "OfflineTargetAspectRatioSample":
target_aspect_ratio = tuple(sample.target_aspect_ratio[0])
assert target_aspect_ratio is not None, "Sample of type OfflineTargetAspectRatioSample needs to define the target aspect ratio."
local_find_closest_aspect_ratio_fn = lambda *args, **kwargs: target_aspect_ratio
has_image = False
# We infer whether the sample has an image or not.
if hasattr(sample, "images") and not has_video:
# If this is a text-only sample and we are freezing the LM,
# then use a dummy input image.
if len(sample.images) == 0 and self.args.freeze_LM:
empty_img = Image.new('RGB', (self.args.img_w, self.args.img_h), (255, 255, 255))
sample.images.append(empty_img)
if len(sample.images) > 0:
has_image = True
# Note: Some tokenizers may ignore the system prompt.
conversation = [{"role": "system", "content": "Answer the questions."}]
# Format the conversation as a list of "user" / "assistant" turns.
for text in sample.texts:
error_msg = f"unexpected role {text['from']} in {sample.texts}"
assert text["from"] in ["human", "gpt"], error_msg
conversation.append({
"role": "user" if text["from"] == "human" else "assistant",
"content": text["value"]})
# Replace the image tags <image-idx> with IMAGE_TOKEN and count the number of image tags
number_image_tags = 0
image_tag_ids_list = []
for turn in conversation:
if turn["role"] == "user":
image_tag_ids = [int(x) - 1 for x in re.findall(r"<image-(\d+)>", turn["content"])]
image_tag_ids_list.extend(image_tag_ids)
turn["content"] = re.sub(r"<image-\d+>", IMAGE_TOKEN, turn["content"])
# For videos, we use the image token to locate where to put the frames.
if has_video:
turn["content"] = turn["content"].replace(VIDEO_TOKEN, IMAGE_TOKEN)
number_image_tags += turn["content"].count(IMAGE_TOKEN)
# We re-order the images in sample.images according to how they appear in the conversation.
if len(image_tag_ids_list) > 0:
sample.images = [sample.images[idx] for idx in image_tag_ids_list]
# If there is only one image, but several image tags, we assume all the tags refer to the
# same image and duplicate the image:
if not has_video and len(sample.images) == 1 and number_image_tags > 1:
sample.images = sample.images * number_image_tags
# We currently only support one video per sample.
number_of_images = 1 if has_video else len(sample.images)
# Fail if there are more image or video tags than images or videos:
error_msg = (
f"Found {number_image_tags} image tags for {number_of_images} images. {sample.texts}")
assert number_image_tags <= number_of_images, error_msg
# If there are fewer image or video tags than images or videos, prepend the missing tags to the first
# user message:
if number_image_tags < number_of_images:
for turn in conversation:
if turn["role"] == "user":
turn["content"] = IMAGE_TOKEN*(number_of_images-number_image_tags) + "\n" + turn["content"]
break
input_ids, target = self.tokenizer.tokenize_conversation(conversation, True, False)
if has_image:
imgs = []
num_tiles = []
max_num_tiles = self.args.max_num_tiles
# We keep a buffer of 4 tokens for the question,
# the rest can be used for image tokens.
max_image_token_allowed = self.args.decoder_seq_length - len(input_ids) - 4
# We start by extracting as many tiles per image as possible, and decrease the max
# number of tiles if there are too many image tokens.
while True:
imgs = []
num_tiles = []
for img in sample.images:
img_tiles = get_visual_transform(
img, self.img_h, self.img_w, self.args.use_tiling, max_num_tiles,
self.args.use_thumbnail, augment, self.args.vision_model_type,
find_closest_aspect_ratio_fn=local_find_closest_aspect_ratio_fn)
imgs += img_tiles
num_tiles += [len(img_tiles)]
if max_num_tiles == 1:
break
if sum(num_tiles) * self.num_image_embeddings_per_tile > max_image_token_allowed:
if max_num_tiles in self.num_tiles_degradation_map:
max_num_tiles = self.num_tiles_degradation_map[max_num_tiles]
else:
raise RuntimeError(
f"Tried to decrease the number of tiles {max_num_tiles} but it's not "
f"defined in the degradation map {self.num_tiles_degradation_map}.")
else:
break
elif has_video:
# We don't use tiling for videos to limit the number of tokens.
use_tiling=False
# Grab the selected frames of the video as a tensor with shape
# fchw: (num_frames, num_channels, height, width).
video_fchw = sample.images.frames
if video_fchw.shape[0] == 0:
raise ValueError(f"Video {sample.__key__} {sample.__restore_key__} {sample.texts} has no frames.")
selected_frames = torch.linspace(
0, video_fchw.shape[0] - 1, self.args.num_frames).long()
video_fchw = video_fchw[selected_frames]
imgs = []
for video_chw in video_fchw:
to_pil = ToPILImage()
video_chw = to_pil(video_chw)
imgs += get_visual_transform(
video_chw, self.img_h, self.img_w, use_tiling, self.args.max_num_tiles,
self.args.use_thumbnail, augment, self.args.vision_model_type,
find_closest_aspect_ratio_fn=local_find_closest_aspect_ratio_fn)
num_tiles = [len(imgs)]
else:
imgs = num_tiles = []
if self.is_packing_enabled or truncate_for_sample_list_packing:
input_ids, target = self._truncate_for_packing(input_ids, target, num_tiles)
# Some final checks on the number of image tokens and images in the tokenized
# conversation. There can still be errors, for instance if a non-video sample happens to
# have our pre-defined video token, or if the packing truncation removed a necessary image
# tag.
number_image_token = np.sum(input_ids == self.img_token_id)
error_msg = (
f"Found {number_image_token} image tokens for len({num_tiles}) = {len(num_tiles)} image tiles in {conversation}.")
assert number_image_token == len(num_tiles), error_msg
error_msg = (
f"Found sum({num_tiles}) = {np.sum(num_tiles)} tiles for {len(imgs)} images in {conversation}.")
assert np.sum(num_tiles) == len(imgs), error_msg
# We need to ensure that there are at least some trainable tokens in the sample.
assert self.target_has_trainable_tokens(input_ids, num_tiles, target), "Sample has no trainable tokens."
return ImageTaskSample(
__key__=sample.__key__,
__restore_key__=sample.__restore_key__,
__subflavor__=None,
__subflavors__=sample.__subflavors__,
imgs=imgs,
num_tiles=num_tiles,
tokens=torch.tensor(input_ids),
labels=torch.tensor(target),
total_len=self._get_total_seq_length(input_ids, num_tiles),
)
def target_has_trainable_tokens(self, input_ids, num_tiles, target):
# Compute the loss mask based on extending the image tags with the proper
# number of image tokens, extracting the first self.args.decoder_seq_length tokens, and
# ensuring that some of these tokens have a loss mask > 0.
# Note that this is a bit hacky because we reproduce here parts of the logic that lives in
# the model itself. Ideally, the data sampler would return the already processed inputs
# and targets to avoid this duplication.
expanded_target = target.copy()
expanded_target[input_ids==self.img_token_id] = self.img_token_id
expanded_target = self.replace_value_with_repetition(
expanded_target, self.img_token_id,
self.num_image_embeddings_per_tile * np.array(num_tiles), IGNORE_INDEX)
loss_mask = torch.ones(torch.tensor(expanded_target).size(), dtype=torch.float)
loss_mask[expanded_target == self.tokenizer.pad] = 0.0 # mask paddings
loss_mask[expanded_target == IGNORE_INDEX] = 0.0 # mask prompts
loss_mask = torch.cat((loss_mask[1:], torch.zeros((1,))))
loss_mask = loss_mask[:self.args.decoder_seq_length]
return torch.sum(loss_mask) > 0
def replace_value_with_repetition(self, arr, token_to_replace, num_repetition, new_token):
"""
Replace every occurrence of token_to_replace in the input array with the corresponding
number of repetitions of new_token.
Args:
arr (Array): Input array to be modified.
token_to_replace: Token to be replaced.
num_repetition (Array): Number of repetitions of new_token, one entry per occurrence of token_to_replace.
new_token: New token to insert.
Returns:
Array: New array with each occurrence of token_to_replace replaced by the corresponding
number of repetitions of new_token.
"""
error_msg = "The number of image tokens must match the length of the tile tensor."
assert np.sum(arr==token_to_replace) == len(num_repetition), error_msg
result = []
idx = 0
for item in arr:
if item == token_to_replace:
# If the current item matches token_to_replace, add num_repetition[idx] copies of new_token
result.extend([new_token] * num_repetition[idx])
idx += 1
else:
# Otherwise, keep the original item
result.append(item)
return np.array(result)
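# Illustrative example (not part of the original file; values are made up):
# arr = [7, 99, 8] with token_to_replace = 99, num_repetition = [3] and new_token = -100
# yields [7, -100, -100, -100, 8].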
def encode_any_single_turn_vqa(self, sample):
"""Encode MultiChoiceVQA or VQA sample."""
augment = sample.__subflavors__['augmentation'] if 'augmentation' in sample.__subflavors__ else False
has_video = sample.__subflavors__['has_video'] if 'has_video' in sample.__subflavors__ else False
if has_video:
# Grab the selected frames of the video as a tensor with shape
# fhwc: (num_frames, height, width, num_channels).
video_fhwc = sample.image.permute(0, 2, 3, 1)
selected_frames = torch.linspace(
0, video_fhwc.shape[0] - 1, self.args.num_frames).long()
video_frame_fhwc = video_fhwc[selected_frames]
imgs = []
for video_frame_hwc in video_frame_fhwc:
imgs += get_visual_transform(
video_frame_hwc, self.img_h, self.img_w,
self.args.use_tiling, self.args.max_num_tiles,
self.args.use_thumbnail, augment, self.args.vision_model_type,
find_closest_aspect_ratio_fn=self.find_closest_aspect_ratio_fn)
else:
imgs = get_visual_transform(
sample.image, self.img_h, self.img_w, self.args.use_tiling, self.args.max_num_tiles,
self.args.use_thumbnail, augment, self.args.vision_model_type,
find_closest_aspect_ratio_fn=self.find_closest_aspect_ratio_fn
)
num_tiles = [len(imgs)]
if isinstance(sample, MultiChoiceVQASample):
cur_prompt = format_multichoice_question(sample.context, sample.choices)
if IMAGE_TOKEN not in cur_prompt:
cur_prompt = IMAGE_TOKEN + "\n" + cur_prompt
cur_answer = format_multichoice_answer(sample.correct_choice_idx)
elif isinstance(sample, VQASample):
if 'docvqa' in sample.__key__:
prompt_list = self.manual_prompts["VQASFT"]["docvqa"]
elif sample.__subflavors__.get("VQASFT"):
prompt_list = self.manual_prompts["VQASFT"]["raw"]
else:
prompt_list = ["{}"]
prompt_idx = np.random.randint(len(prompt_list))
cur_prompt = prompt_list[prompt_idx]
cur_prompt = cur_prompt.format(sample.context)
if IMAGE_TOKEN not in cur_prompt:
cur_prompt = IMAGE_TOKEN + "\n" + cur_prompt
if isinstance(sample.answers, list):
answer_list = sample.answers
weight_list = np.array(sample.answer_weights).astype(np.float32)
weight_list = weight_list / np.sum(weight_list)
answer_idx = np.random.choice(weight_list.shape[0], 1, p=weight_list)[0]
cur_answer = answer_list[answer_idx]
else:
cur_answer = sample.answers
else:
raise NotImplementedError("Unsupported data type provided", sample)
conversation = [
{"role": "system", "content": "Answer the questions."},
{"role": "user", "content": cur_prompt},
{"role": "assistant", "content": str(cur_answer)},
]
input_ids, target = self.tokenizer.tokenize_conversation(conversation, True, False)
if self.is_packing_enabled:
input_ids, target = self._truncate_for_packing(input_ids, target, num_tiles)
return ImageTaskSample(
__key__=sample.__key__,
__restore_key__=sample.__restore_key__,
__subflavor__=None,
__subflavors__=sample.__subflavors__,
imgs=imgs,
num_tiles=num_tiles,
tokens=torch.tensor(input_ids),
labels=torch.tensor(target),
total_len=self._get_total_seq_length(input_ids, num_tiles),
)
def combined_ocr_encoder(self, sample, task_type):
"""Encode OCR samples."""
augment = sample.__subflavors__['augmentation'] if 'augmentation' in sample.__subflavors__ else False
if task_type == "encode_pdf":
sample, cur_prompt, cur_answer = self.encode_pdf_prompt(sample)
elif task_type == "encode_ocr_ref":
sample, cur_prompt, cur_answer = self.encode_ocr_ref_prompt(sample)
elif task_type == "_encode_ocr":
sample, cur_prompt, cur_answer = self.encode_ocr_prompt(sample)
imgs = get_visual_transform(
sample.image, self.img_h, self.img_w, self.args.use_tiling, self.args.max_num_tiles,
self.args.use_thumbnail, augment, self.args.vision_model_type,
find_closest_aspect_ratio_fn=self.find_closest_aspect_ratio_fn
)
num_tiles = [len(imgs)]
conversation = [
{"role": "system", "content": "Answer the questions."},
{"role": "user", "content": cur_prompt},
{"role": "assistant", "content": str(cur_answer)},
]
input_ids, target = self.tokenizer.tokenize_conversation(conversation, True, False)
if self.is_packing_enabled:
input_ids, target = self._truncate_for_packing(input_ids, target, num_tiles)
return ImageTaskSample(
__key__=sample.__key__,
__restore_key__=sample.__restore_key__,
__subflavor__=None,
__subflavors__=sample.__subflavors__,
imgs=imgs,
num_tiles=num_tiles,
tokens=torch.tensor(input_ids),
labels=torch.tensor(target),
total_len=self._get_total_seq_length(input_ids, num_tiles),
)
def encode_pdf_prompt(self, sample: OCRSample) -> ImageTaskSample:
"""Encode OCR sample."""
prompt_list = self.manual_prompts["DocPretraining"]["raw"]
prompt_idx = np.random.randint(len(prompt_list))
cur_prompt = prompt_list[prompt_idx]
if IMAGE_TOKEN not in cur_prompt:
cur_prompt = IMAGE_TOKEN + "\n" + cur_prompt
# Make sure there is no extra IMAGE_TOKEN tag.
sample.text = sample.text.replace(IMAGE_TOKEN, "")
caption = sample.text.strip()
split_by_line_flag = sample.__subflavors__.get("SplitByLine")
if split_by_line_flag:
caption_list = caption.split('\n')
caption = np.random.choice(caption_list)
cur_answer = caption
return sample, cur_prompt, cur_answer
def encode_ocr_ref_prompt(self, sample: OCRSample) -> ImageTaskSample:
"""Encode OCR sample."""
ref = sample.text
region = sample.words_boxes
# Make sure there is no extra IMAGE_TOKEN tag
ref = ref.replace(IMAGE_TOKEN, "")
if len(region) == 4:
region = f"<box>({region[0]},{region[1]}),({region[2]},{region[3]})</box>"
else:
region = f"<quad>({region[0]},{region[1]}),({region[2]},{region[3]}),({region[4]},{region[5]}),({region[6]},{region[7]})</quad>"
# Randomly choose between two tasks
task_idx = np.random.randint(2)
if task_idx == 0:
# Referring Grounding
prompt_list = self.manual_prompts["DocPretraining"]["referring_grounding"]
prompt_content = ref
answer = region
else:
# Grounded OCR
prompt_list = self.manual_prompts["DocPretraining"]["grounded_ocr"]
prompt_content = region
answer = ref
prompt_idx = np.random.randint(len(prompt_list))
cur_prompt = prompt_list[prompt_idx]
cur_prompt = cur_prompt.format(prompt_content)
if IMAGE_TOKEN not in cur_prompt:
cur_prompt = IMAGE_TOKEN + "\n" + cur_prompt
return sample, cur_prompt, answer
def bbox_coord_to_label(self, text, bbox):
"""Format bbox coordinates as text."""
assert len(bbox) == 4 or len(bbox) == 8
# Make sure there is no extra IMAGE_TOKEN tag
text = text.replace(IMAGE_TOKEN, "")
if len(bbox) == 4:
label_str = f"<ref>{text}</ref><box>({bbox[0]},{bbox[1]}),({bbox[2]},{bbox[3]})</box>"
else:
label_str = f"<ref>{text}</ref><quad>({bbox[0]},{bbox[1]}),({bbox[2]},{bbox[3]}),({bbox[4]},{bbox[5]}),({bbox[6]},{bbox[7]})</quad>"
return label_str
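# Illustrative example (not part of the original file; coordinates are made up):
# bbox_coord_to_label("total", [10, 20, 110, 40]) returns
# '<ref>total</ref><box>(10,20),(110,40)</box>'; an 8-element bbox produces the <quad>
# form with four corner points instead.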
def encode_ocr_prompt(self, sample: OCRSample) -> ImageTaskSample:
"""Encode OCR sample."""
if isinstance(sample.words_boxes[0], int):
answer = self.bbox_coord_to_label(sample.text, sample.words_boxes)
elif isinstance(sample.words_boxes[0], list):
answer = ""
for i, bbox in enumerate(sample.words_boxes):
answer += self.bbox_coord_to_label(sample.words_text[i], bbox)
prompt_list = self.manual_prompts["DocPretraining"]["ocr_multi"]
prompt_idx = np.random.randint(len(prompt_list))
cur_prompt = prompt_list[prompt_idx]
if IMAGE_TOKEN not in cur_prompt:
cur_prompt = IMAGE_TOKEN + "\n" + cur_prompt
cur_answer = answer
return sample, cur_prompt, cur_answer
def batch(self, samples: List[Union[ImageTaskSample, ImageTaskSamplePacked]]) -> ImageTaskBatchPacked:
# Stack images to [num_tiles, c, h, w]. If there are no images (text-only), then use a dummy image.
imgs = [img for s in samples for img in s.imgs]
if len(imgs) > 0:
imgs = torch.stack(imgs)
else:
imgs = torch.tensor([[0]], dtype=torch.float32)
# If the user hasn't defined a target dataloader sequence length, then use the max along the sample lengths.
max_seq_len = self.dataloader_seq_length
if not max_seq_len:
max_seq_len = max(len(s.tokens) for s in samples)
tokens = np.full((len(samples), max_seq_len), self.tokenizer.pad, dtype=np.int64)
# +1 to accommodate shift to left by one later.
labels = np.full((len(samples), max_seq_len + 1), self.tokenizer.pad, dtype=np.int64)
for i, s in enumerate(samples):
# If the sample/target length exceeds the target sequence length, then truncate.
text_len = min(max_seq_len, len(s.tokens))
target_len = min(max_seq_len+1, len(s.labels))
tokens[i, :text_len] = s.tokens[:text_len]
labels[i, :target_len] = s.labels[:target_len]
num_tiles = torch.tensor([n for s in samples for n in s.num_tiles], dtype=torch.int32)
if len(num_tiles) == 0:
num_tiles = torch.tensor([[0]], dtype=torch.int32)
# Cumulative sample lengths are needed for packing, otherwise use dummy values.
cu_lengths = torch.tensor([[0]], dtype=torch.int32)
max_lengths = torch.tensor([[0]], dtype=torch.int32)
if self.is_packing_enabled:
cu_lengths = torch.stack([s.cu_lengths for s in samples])
max_lengths = torch.tensor([s.max_length for s in samples], dtype=torch.int32)
return ImageTaskBatchPacked(
__key__=[s.__key__ for s in samples],
__restore_key__=[s.__restore_key__ for s in samples],
__subflavor__=None,
__subflavors__=samples[0].__subflavors__,
tokens=tokens,
labels=labels,
imgs=imgs,
num_tiles=num_tiles,
cu_lengths=cu_lengths,
max_lengths=max_lengths,
)
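# Illustrative example (not part of the original file): with packing disabled, no
# dataloader_seq_length, and two samples of 10 and 7 text tokens, `tokens` has shape
# (2, 10) and `labels` has shape (2, 11); shorter rows are right-padded with
# tokenizer.pad, and cu_lengths/max_lengths keep their dummy values.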
def encode_batch(self, batch: ImageTaskBatchPacked) -> dict:
raw = dataclasses.asdict(batch)
del raw["__subflavors__"]
return raw
def select_samples_to_pack(self, samples: List[ImageTaskSample]) -> List[List[ImageTaskSample]]:
"""Selects which samples will be packed together.
NOTE: Energon dataloader calls this method internally if packing is used.
Please see https://nvidia.github.io/Megatron-Energon/packing.html
"""
lengths = [sample.total_len for sample in samples]
packed_samples = greedy_knapsack(lengths, samples, self.packing_seq_length)
return packed_samples
@stateless
def pack_selected_samples(self, samples: List[ImageTaskSample]) -> List[ImageTaskSamplePacked]:
"""
Function to pack a list of ImageTaskSample into a single ImageTaskSamplePacked.
NOTE: Energon dataloader calls this method internally if packing is used.
Please see https://nvidia.github.io/Megatron-Energon/packing.html
Args:
samples: List of ImageTaskSample instances to pack into one sample.
Returns:
ImageTaskSamplePacked instance.
"""
packing_seq_len = self.packing_seq_length
packed_tokens = []
packed_labels = []
packed_imgs = []
current_length = 0
max_length = 0
cu_lengths = [0]
# Process each sample and build lists that we will concatenate to create the packed sample.
for _, sample in enumerate(samples):
sample_len = sample.total_len
if sample_len > max_length:
max_length = sample_len
# If adding this sample exceeds the max length, stop.
# This should not happen. The select_samples_to_pack method should have already ensured that the samples fit.
if current_length + sample_len > packing_seq_len:
raise ValueError(f"Packed sample exceeds the maximum sequence length of {packing_seq_len}: {samples}")
# Add the sample's tokens and labels
packed_tokens.append(sample.tokens)
packed_labels.append(sample.labels)
# Add the images
packed_imgs += sample.imgs
current_length += sample_len
cu_lengths.append(current_length)
# Concatenate packed tokens and labels.
packed_tokens = torch.cat(packed_tokens, dim=0)
packed_labels = torch.cat(packed_labels, dim=0)
return ImageTaskSamplePacked(
__key__=",".join([s.__key__ for s in samples]),
__restore_key__=(), # Will be set by energon based on `samples`
__subflavor__=None,
__subflavors__=samples[0].__subflavors__,
tokens=packed_tokens,
labels=packed_labels,
imgs=packed_imgs,
cu_lengths=torch.tensor(cu_lengths, dtype=torch.int32),
max_length=max_length,
num_tiles=[n for s in samples for n in s.num_tiles],
)
def print_error_handler(exc: Exception, key: Optional[str]):
print(
f"The following exception occurred in the dataloader for sample {key} and is skipped",
file=sys.stderr,
)
traceback.print_exc()
def format_multichoice_question(question, multichoice_options):
"""Format multi-choice question."""
options_text = ["{}. {}\n".format(chr(ord('A') + i), option) for i, option in
zip(range(len(multichoice_options)), multichoice_options)]
options_text = "".join(options_text)
options_text = f"{options_text}Answer with the option's letter from the given choices directly."
return "{}\n{}".format(question, options_text)
def format_multichoice_answer(idx):
"""Format multi-choice answer."""
return chr(ord('A') + idx)
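# Illustrative example (not part of the original file; question and options are made up):
# format_multichoice_question("What animal is shown?", ["cat", "dog"]) returns
# "What animal is shown?\nA. cat\nB. dog\nAnswer with the option's letter from the given
# choices directly.", and format_multichoice_answer(1) returns "B".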
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import torch
import warnings
from dataclasses import dataclass
from typing import Any, List
from megatron.energon import Sample
from megatron.energon.epathlib.epath import EPath
from megatron.energon.flavors.webdataset import DefaultDecoderWebdatasetFactory
@dataclass
class SampleListSample(Sample):
"""Sample type for a list of samples of any type which needs to be packed together.
This is useful for datasets which are packed offline.
"""
#: The list of samples to be packed together
samples: List[Any]
class SampleListWebdataset(DefaultDecoderWebdatasetFactory[SampleListSample]):
__sample_type__ = SampleListSample
def __init__(self, path: EPath, **kwargs):
warnings.warn(
f"{type(self)} is deprecated, use the default instead and set the sample_type:\n"
f"To convert, update your {path}/.nv-meta/dataset.yaml to:\n"
f"# remove top-level __module__ and __class__\n"
f"sample_type:\n"
f" __module__: megatron.energon\n"
f" __class__: {self.__sample_type__.__name__}\n"
f"# Keep the remaining content",
DeprecationWarning,
)
super().__init__(path, **kwargs)
@dataclass
class OfflineTargetAspectRatioSample(Sample):
"""Sample type for image + text samples with target aspect ratio computed offline."""
#: The images of the sequence
images: List[torch.Tensor]
#: The texts of the sequence
texts: List[str]
target_aspect_ratio: List[List]
import argparse
import json
from evaluate_vqav2 import compute_vqa_accuracy
from evaluate_mmmu import get_input_output_paths
def merge_input_files(input_path):
"""Merge input files to a format compatible with the evaluator."""
input_file_paths, output_file_path = get_input_output_paths(input_path, task="InfoVQA")
results = []
for input_file_path in input_file_paths:
with open(input_file_path, "r") as input_file:
for line in input_file:
res = json.loads(line)
results.append(
{
"question_id": res["sample_id"],
"answer": res["answer"],
"gt_answer": res["gt_answer"],
}
)
# Optionally, sort by question id to make the order deterministic.
# results = sorted(results, key=lambda d: d["question_id"])
with open(output_file_path, "w") as output_file:
json.dump(results, output_file)
return output_file_path
def infovqa_eval(input_path):
"""Run InfoVQA evaluation."""
result_file_path = merge_input_files(input_path)
return compute_vqa_accuracy(result_file_path, task="InfoVQA")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--input-path', type=str, help="Path to input file(s)")
args = parser.parse_args()
avg_acc = infovqa_eval(args.input_path)
print(f"===== InfoVQA Accuracy {avg_acc:.2f}% =====")
import argparse
import json
from evaluate_vqav2 import compute_vqa_accuracy
from evaluate_mmmu import get_input_output_paths
def merge_input_files(input_path):
"""Merge input files to a format compatible with the evaluator."""
input_file_paths, output_file_path = get_input_output_paths(input_path, task="SPDocVQA")
results = []
for input_file_path in input_file_paths:
with open(input_file_path, "r") as input_file:
for line in input_file:
res = json.loads(line)
results.append(
{
"question_id": res["sample_id"],
"answer": res["answer"],
"gt_answer": res["gt_answer"],
}
)
# Optionally, sort by question id to make the order deterministic.
# results = sorted(results, key=lambda d: d["question_id"])
with open(output_file_path, "w") as output_file:
json.dump(results, output_file)
return output_file_path
def spdocvqa_eval(input_path):
"""Run SPDocVQA evaluation."""
result_file_path = merge_input_files(input_path)
return compute_vqa_accuracy(result_file_path, task="SPDocVQA")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--input-path', type=str, help="Path to input file(s)")
args = parser.parse_args()
avg_acc = spdocvqa_eval(args.input_path)
print(f"===== SPDocVQA Accuracy {avg_acc:.2f}% =====")
import argparse
import json
from evaluate_mmmu import get_input_output_paths
from open_flamingo.eval.vqa_metric import VQAEval
def merge_input_files(input_path):
"""Merge input files to a format compatible with the evaluator."""
input_file_paths, output_file_path = get_input_output_paths(input_path, task="VQAv2")
results = dict()
for input_file_path in input_file_paths:
with open(input_file_path, "r") as input_file:
for line in input_file:
res = json.loads(line)
sample_id = res["sample_id"]
# Skip possible duplicates.
if sample_id in results:
continue
res["question_id"] = sample_id
results[sample_id] = res
results = list(results.values())
with open(output_file_path, "w") as output_file:
json.dump(results, output_file)
return output_file_path
def is_number(n: str):
"""Check if input is a number."""
try:
float(n)
return True
except ValueError:
return False
def compute_vqa_accuracy(result_file, task):
"""Compute VQA accuracy."""
merged_results = json.load(open(result_file))
vqa = VQAEval(vqa=None, vqaRes=None)
all_acc = []
for res in merged_results:
pred = res["answer"]
pred = vqa.processPunctuation(pred)
pred = vqa.processDigitArticle(pred)
gt = res["gt_answer"]
gt = [vqa.processPunctuation(ans) for ans in gt]
gt = [vqa.processDigitArticle(ans) for ans in gt]
# ChartQA uses relaxed accuracy:
# "We consider an answer to be correct if it is within 5% of the gold answer.
# For non-numeric answers, we still need an exact match to consider an answer to be correct."
if task == "ChartQA":
acc = 0.0
assert len(gt) == 1, "expected exactly one groundtruth answer."
gt = gt[0]
pred = pred.rstrip("%")
gt = gt.rstrip("%")
if is_number(pred) and is_number(gt):
pred = float(pred)
gt = float(gt)
if pred >= (gt * 0.95) and pred <= (gt * 1.05):
acc = 1.0
elif pred == gt:
acc = 1.0
all_acc.append(acc)
elif task in ("VQAv2", "TextVQA"):
num_match = sum([pred == ans for ans in gt])
acc = min(1.0, num_match / 3.0)
all_acc.append(acc)
elif task == "AI2D":
assert len(gt) == 1, f"Expected exactly 1 GT, got {gt}"
acc = pred == gt[0]
all_acc.append(acc)
else:
raise NotImplementedError(f"unknown task {task}")
acc_avg = sum(all_acc) / len(all_acc) * 100
return acc_avg
def vqav2_eval(input_path):
"""Run VQAv2 evaluation."""
result_file = merge_input_files(input_path)
avg_acc = compute_vqa_accuracy(result_file, task="VQAv2")
return avg_acc
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--input-path', type=str, help="Path to input file(s)")
args = parser.parse_args()
avg_acc = vqav2_eval(args.input_path)
print(f"===== VQAv2 Accuracy {avg_acc:.2f}% =====")
import argparse
import json
from evaluate_mmmu import get_input_output_paths
from open_flamingo.eval.vqa_metric import VQAEval
# ANLS score calculation based on https://github.com/shunk031/ANLS/blob/6472e1d71e84d6cee28e3c6d2e18564bafaa312d/anls/metrics/dist.py#L1
# and https://github.com/shunk031/ANLS/blob/6472e1d71e84d6cee28e3c6d2e18564bafaa312d/anls/metrics/score.py#L6
# MIT License. Copyright (c) 2022 Shunsuke KITADA
def levenshtein_distance(s1: str, s2: str) -> int:
if len(s1) > len(s2):
s1, s2 = s2, s1
distances = list(range(len(s1) + 1))
for i2, c2 in enumerate(s2):
dists = [i2 + 1]
for i1, c1 in enumerate(s1):
if c1 == c2:
dists.append(distances[i1])
else:
dists.append(1 + min((distances[i1], distances[i1 + 1], dists[-1])))
distances = dists
return distances[-1]
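# Illustrative example (not part of the original file):
#   >>> levenshtein_distance("kitten", "sitting")
#   3
# Three single-character edits (two substitutions and one insertion) turn one string into
# the other.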
def normalized_levenshtein_distance(s1: str, s2: str) -> float:
dist = levenshtein_distance(s1, s2)
length = max(len(s1.upper()), len(s2.upper()))
return 0.0 if length == 0 else dist / length
def similarity_function(prediction: str, gold_label: str, threshold: float) -> float:
nl_score = normalized_levenshtein_distance(prediction, gold_label)
return 1 - nl_score if nl_score < threshold else 0.0
def anls_score(
prediction: str, gold_labels: List[str], threshold: float = 0.5
) -> float:
# not case sensitive, but space sensitive
y_pred = " ".join(prediction.strip().lower().split())
anls_scores: List[float] = []
for gold_label in gold_labels:
# not case sensitive, but space sensitive
y_true = " ".join(gold_label.strip().lower().split())
anls_scores.append(similarity_function(y_pred, y_true, threshold))
score = max(anls_scores)
return score
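# Illustrative example (not part of the original file; strings are made up):
#   >>> round(anls_score("answr", ["answer"]), 3)
#   0.833
# The normalized edit distance is 1/6 < 0.5, so the score is 1 - 1/6; predictions whose
# normalized distance reaches the threshold score 0.0.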
def merge_input_files(input_path):
"""Merge input files to a format compatible with the evaluator."""
input_file_paths, output_file_path = get_input_output_paths(input_path, task="VQAv2")
results = dict()
for input_file_path in input_file_paths:
with open(input_file_path, "r") as input_file:
for line in input_file:
res = json.loads(line)
sample_id = res["sample_id"]
# Skip possible duplicates.
if sample_id in results:
continue
res["question_id"] = sample_id
results[sample_id] = res
results = list(results.values())
with open(output_file_path, "w") as output_file:
json.dump(results, output_file)
return output_file_path
def is_number(n: str):
"""Check if input is a number."""
try:
float(n)
return True
except ValueError:
return False
def compute_vqa_accuracy(result_file, task):
"""Compute VQA accuracy."""
merged_results = json.load(open(result_file))
vqa = VQAEval(vqa=None, vqaRes=None)
all_acc = []
for res in merged_results:
pred = res["answer"]
pred = vqa.processPunctuation(pred)
pred = vqa.processDigitArticle(pred)
gt = res["gt_answer"]
gt = [vqa.processPunctuation(ans) for ans in gt]
gt = [vqa.processDigitArticle(ans) for ans in gt]
# ChartQA uses relaxed accuracy:
# "We consider an answer to be correct if it is within 5% of the gold answer.
# For non-numeric answers, we still need an exact match to consider an answer to be correct."
if task == "ChartQA":
acc = 0.0
assert len(gt) == 1, "expected exactly one groundtruth answer."
gt = gt[0]
pred = pred.rstrip("%")
gt = gt.rstrip("%")
if is_number(pred) and is_number(gt):
pred = float(pred)
gt = float(gt)
if pred >= (gt * 0.95) and pred <= (gt * 1.05):
acc = 1.0
elif pred == gt:
acc = 1.0
all_acc.append(acc)
elif task in ("VQAv2", "TextVQA"):
num_match = sum([pred == ans for ans in gt])
acc = min(1.0, num_match / 3.0)
all_acc.append(acc)
elif task in ("SPDocVQA", "InfoVQA"):
acc = anls_score(prediction=pred, gold_labels=gt, threshold=0.5)
all_acc.append(acc)
elif task == "AI2D":
assert len(gt) == 1, f"Expected exactly 1 GT, got {gt}"
acc = pred == gt[0]
all_acc.append(acc)
else:
raise NotImplementedError(f"unknown task {task}")
acc_avg = sum(all_acc) / len(all_acc) * 100
return acc_avg
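# Illustrative examples (not part of the original file; values are made up): for
# VQAv2/TextVQA, a prediction matching 2 of the ground-truth answers scores
# min(1.0, 2 / 3.0) ~= 0.67; for ChartQA, a numeric prediction of 42 against a gold answer
# of 40 falls inside the +/-5% band (38.0 to 42.0) and scores 1.0; SPDocVQA and InfoVQA use
# the ANLS score above.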
def vqav2_eval(input_path):
"""Run VQAv2 evaluation."""
result_file = merge_input_files(input_path)
avg_acc = compute_vqa_accuracy(result_file, task="VQAv2")
return avg_acc
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--input-path', type=str, help="Path to input file(s)")
args = parser.parse_args()
avg_acc = vqav2_eval(args.input_path)
print(f"===== VQAv2 Accuracy {avg_acc:.2f}% =====")
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
"""Evaluation datasets."""
import glob
import itertools
import json
import os
import re
from collections import defaultdict
import numpy as np
import torch
from image_processing import get_visual_transform
from PIL import Image
from megatron.training import print_rank_0
def _get_partition_bounds(
total_num_samples, num_samples_per_partition, num_partitions, partition_id
):
if num_samples_per_partition == 0:
samples_per_partition = [
int(x) for x in np.linspace(0, total_num_samples, num_partitions + 1)
]
return samples_per_partition[partition_id], samples_per_partition[partition_id + 1]
return num_samples_per_partition * partition_id, num_samples_per_partition * (partition_id + 1)
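# Illustrative example (not part of the original file; values are made up): with 100 samples,
# num_samples_per_partition=0, num_partitions=4 and partition_id=1, np.linspace gives the
# bounds (25, 50); with num_samples_per_partition=30 the fixed-size bounds (30, 60) are
# returned instead.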
class VQADataset(torch.utils.data.Dataset):
"""VQA evaluation dataset."""
def __init__(
self,
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
keys,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
vision_model_type,
):
samples = json.load(open(gt_path, encoding='utf-8'))
if "data" in samples:
samples = samples["data"]
# Optionally, process only a subset of the input files.
if num_partitions > 0:
lb, ub = _get_partition_bounds(
len(samples), num_samples_per_partition, num_partitions, partition_id
)
samples = samples[lb:ub]
self._keys = keys
self._samples = samples
self._input_image_path = input_image_path
self._img_h = img_h
self._img_w = img_w
self._use_tiling = use_tiling
self._max_num_tiles = max_num_tiles
self._use_thumbnail = use_thumbnail
self._vision_model_type = vision_model_type
def __len__(self):
return len(self._samples)
def __getitem__(self, idx):
sample = self._samples[idx]
img_file = "{}/{}".format(self._input_image_path, sample[self._keys["image_id"]])
if not os.path.exists(img_file):
img_file += ".jpg"
if not os.path.exists(img_file):
img_file = img_file.replace('.jpg', '.png')
img = Image.open(img_file)
imgs = get_visual_transform(
img,
self._img_h,
self._img_w,
self._use_tiling,
self._max_num_tiles,
self._use_thumbnail,
augment=False,
vision_model_type=self._vision_model_type,
)
tile_count = torch.tensor([len(imgs)], dtype=torch.int)
sample_id = idx
if "sample_id" in self._keys:
sample_id = sample[self._keys["sample_id"]]
metadata = "" # Not used.
return (
torch.stack(imgs),
tile_count,
sample_id,
sample[self._keys["question"]],
sample[self._keys["answer"]],
metadata,
)
class CaptioningDataset(torch.utils.data.Dataset):
"""Captioning evaluation dataset."""
def __init__(
self,
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
vision_model_type,
):
image_files = sorted(glob.glob(input_image_path + "/*"))
# Optionally, process only a subset of the input files.
if num_partitions > 0:
lb, ub = _get_partition_bounds(
len(image_files), num_samples_per_partition, num_partitions, partition_id
)
image_files = image_files[lb:ub]
gts = json.load(open(gt_path))
answers = defaultdict(list)
for gt in gts["annotations"]:
answers[gt["image_id"]].append(gt['caption'])
self._image_files = image_files
self._answers = answers
self._img_h = img_h
self._img_w = img_w
self._use_tiling = use_tiling
self._max_num_tiles = max_num_tiles
self._use_thumbnail = use_thumbnail
self._vision_model_type = vision_model_type
def __len__(self):
return len(self._image_files)
def __getitem__(self, idx):
img_file = self._image_files[idx]
image_id = int(img_file.split("_")[-1].split(".")[0])
img = Image.open(img_file)
imgs = get_visual_transform(
img,
self._img_h,
self._img_w,
self._use_tiling,
self._max_num_tiles,
self._use_thumbnail,
augment=False,
vision_model_type=self._vision_model_type,
)
tile_count = torch.tensor([len(imgs)], dtype=torch.int)
question = "" # Fixed for all samples.
metadata = "" # Not used.
return torch.stack(imgs), tile_count, image_id, question, self._answers[image_id], metadata
class MMMUDataset(torch.utils.data.Dataset):
"""MMMU evaluation dataset."""
def __init__(
self,
input_image_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
prompt_style,
vision_model_type,
):
import datasets
from MMMU.mmmu.utils.data_utils import CAT_SHORT2LONG, load_yaml
# The following downloads the MMMU dataset from HuggingFace and uses the API from the MMMU github repo to run MMMU evaluation.
all_mmmu_datasets = []
hf_datasets_cache = os.environ["HF_DATASETS_CACHE"]
assert hf_datasets_cache != "", "Please set the environment variable HF_DATASETS_CACHE."
for subject in CAT_SHORT2LONG.values():
# Use a local copy of the dataset if it exists (can be faster), otherwise the HF one.
if os.path.exists(input_image_path):
subject_dataset = datasets.load_dataset(
os.path.join(input_image_path, subject),
split=datasets.Split.VALIDATION,
cache_dir=hf_datasets_cache,
verification_mode="no_checks",
)
else:
subject_dataset = datasets.load_dataset(
"MMMU/MMMU",
subject,
split=datasets.Split.VALIDATION,
cache_dir=hf_datasets_cache,
)
all_mmmu_datasets.append(subject_dataset)
dataset = datasets.concatenate_datasets(all_mmmu_datasets)
dataset = [s for s in dataset if s['id'].startswith("val")]
# Optionally, process only a subset of the input files.
if num_partitions > 0:
lb, ub = _get_partition_bounds(
len(dataset), num_samples_per_partition, num_partitions, partition_id
)
dataset = dataset[lb:ub]
# Using the LLaVA config from the MMMU repo.
config = load_yaml("examples/multimodal/MMMU/mmmu/configs/llava1.5.yaml")
for k, v in config.items():
if isinstance(v, list):
assert len(v) == 1, "only one value supported."
config[k] = v[0]
self._config = config
self._dataset = dataset
self._img_h = img_h
self._img_w = img_w
self._use_tiling = use_tiling
self._max_num_tiles = max_num_tiles
self._use_thumbnail = use_thumbnail
self._prompt_style = prompt_style
self._vision_model_type = vision_model_type
def __len__(self):
return len(self._dataset)
def __getitem__(self, idx):
from MMMU.mmmu.utils.data_utils import construct_prompt, process_single_sample
sample = self._dataset[idx]
# Use the single image approach from the MMMU repo.
if self._prompt_style == "single_image":
sample = process_single_sample(sample)
sample = construct_prompt(sample, self._config)
img = sample["image"]
sample_imgs = get_visual_transform(
img,
self._img_h,
self._img_w,
self._use_tiling,
self._max_num_tiles,
self._use_thumbnail,
augment=False,
vision_model_type=self._vision_model_type,
)
sample_num_tiles = [len(sample_imgs)]
prompt = sample["final_input_prompt"]
for i in range(8):
prompt = prompt.replace(f"<image {i}>", "")
sample["final_input_prompt"] = f"<image>\n{prompt}"
elif self._prompt_style == "vlmevalkit":
sample = construct_prompt(sample, self._config)
if sample["question_type"] == "multiple-choice":
question = sample["question"]
options = ""
for k, v in sample["index2ans"].items():
options += f"{k}. {v}\n"
final_prompt = f"{question}\n"
if "hint" in sample:
final_prompt += f"Hint: {sample['hint']}\n"
if "task_instructions" in sample:
final_prompt += f"Task instructions: {sample['task_instructions']}\n"
final_prompt += options
final_prompt += "Answer with the option's letter from the given choices directly."
sample["final_input_prompt"] = final_prompt.rstrip()
else:
question = sample["question"]
final_prompt = f"{question}\n"
final_prompt += "Answer the question directly."
sample["final_input_prompt"] = final_prompt.rstrip()
sample_imgs = []
sample_num_tiles = []
img_indices = sorted(list(set(re.findall(r"<image (\d+)", sample["final_input_prompt"]))))
# If there are multiple input images, we need to avoid the number of image embeddings getting too large.
adjusted_max_num_tiles = max(1, self._max_num_tiles // len(img_indices))
adjusted_max_num_tiles = min(adjusted_max_num_tiles, self._max_num_tiles)
for img_idx in img_indices:
img_key = f"image_{img_idx}"
img_str = f"<image {img_idx}>"
img = sample[img_key]
assert img is not None, f"{img_str} is in prompt but not in sample images"
imgs = get_visual_transform(
img,
self._img_h,
self._img_w,
self._use_tiling,
adjusted_max_num_tiles,
self._use_thumbnail,
augment=False,
vision_model_type=self._vision_model_type,
) # List of tiles.
sample_imgs.extend(imgs)
sample_num_tiles.append(len(imgs))
sample["final_input_prompt"] = " ".join([f'<image {i + 1}><image>' for i in range(len(img_indices))]) + "\n" + sample["final_input_prompt"]
elif self._prompt_style == "multi_image":
sample = construct_prompt(sample, self._config)
sample_imgs = []
sample_num_tiles = []
img_indices = re.findall(r"<image (\d+)", sample["final_input_prompt"])
# If there are multiple input images, we need to avoid the number of image embeddings getting too large.
adjusted_max_num_tiles = max(1, self._max_num_tiles // len(img_indices))
for img_idx in img_indices:
img_key = f"image_{img_idx}"
img_str = f"<image {img_idx}>"
img = sample[img_key]
assert img is not None, f"{img_str} is in prompt but not in sample images"
# Note: Only replace the current image tag.
sample["final_input_prompt"] = sample["final_input_prompt"].replace(
img_str, "<image>", 1
)
imgs = get_visual_transform(
img,
self._img_h,
self._img_w,
self._use_tiling,
adjusted_max_num_tiles,
self._use_thumbnail,
augment=False,
vision_model_type=self._vision_model_type,
) # List of tiles.
sample_imgs.extend(imgs)
sample_num_tiles.append(len(imgs))
# Sanity check.
for i in range(1, 8):
assert (
f"<image {i}>" not in sample["final_input_prompt"]
), "prompt contains unhandled image tags"
else:
raise ValueError(f"unknown prompt style {self._prompt_style}")
# MMMU specific metadata.
metadata = {"question_type": sample["question_type"]}
if sample["question_type"] == "multiple-choice":
metadata["index2ans"] = sample["index2ans"]
metadata["all_choices"] = sample["all_choices"]
prompt = sample['final_input_prompt']
tile_count = torch.tensor(sample_num_tiles, dtype=torch.int)
return (
torch.stack(sample_imgs),
tile_count,
sample["id"],
prompt,
sample["answer"],
metadata,
)
class VideoMMMEDataset(torch.utils.data.Dataset):
"Video MME evaluation dataset."
def __init__(
self,
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
num_frames,
vision_model_type,
):
ground_truth_original = json.load(open(gt_path))
ground_truth = []
for gt in ground_truth_original:
video_path = gt["url"]
video_path = video_path.replace("https://www.youtube.com/watch?v=", "")
video_path = video_path.replace("https://m.youtube.com/watch?v=", "")
video_path = os.path.join(input_image_path, video_path + ".mp4")
if not os.path.exists(video_path):
continue
gt["video_path"] = video_path
ground_truth.append(gt)
ground_truth = sorted(ground_truth, key=lambda gt: gt["video_path"])
print_rank_0(f"Found {len(ground_truth)} videos to process.")
if num_partitions > 0:
start_idx, end_idx = _get_partition_bounds(
len(ground_truth), num_samples_per_partition, num_partitions, partition_id
)
ground_truth = ground_truth[start_idx:end_idx]
self._ground_truth = ground_truth
self._img_h = img_h
self._img_w = img_w
self._use_tiling = use_tiling
self._max_num_tiles = max_num_tiles
self._use_thumbnail = use_thumbnail
self._num_frames = num_frames
self._vision_model_type = vision_model_type
def __len__(self):
return len(self._ground_truth)
def __getitem__(self, idx):
from torchvision.io import read_video
gt = self._ground_truth[idx]
video, _, _ = read_video(gt["video_path"], start_pts=0, end_pts=None, pts_unit='sec')
video = video.numpy()
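        # Sample self._num_frames frames evenly across the clip; e.g. a 100-frame video with
        # num_frames=4 yields frame indices [0, 33, 66, 99].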
selected_frames = torch.linspace(0, video.shape[0] - 1, self._num_frames).long()
video_frames = video[selected_frames]
if self._num_frames == 1:
video_frames = video_frames[None]
imgs = list(
itertools.chain.from_iterable(
get_visual_transform(
img,
self._img_h,
self._img_w,
self._use_tiling,
self._max_num_tiles,
self._use_thumbnail,
augment=False,
vision_model_type=self._vision_model_type,
)
for img in video_frames
)
)
for question in gt["questions"]:
# Very hacky, but we essentially re-create gt holding only the
            # question of interest. This is to make this generation script
# compatible with the Video MME evaluation script.
question_dict = {
"video_id": gt["video_id"],
"duration_category": gt["duration_category"],
"video_category": gt["video_category"],
"video_subcategory": gt["video_subcategory"],
"url": gt["url"],
"questions": [question],
}
num_tiles = torch.tensor([len(imgs)], dtype=torch.int)
answer = ""
metadata = ""
return (
torch.stack(imgs),
num_tiles,
question["question_id"],
question_dict,
answer,
metadata,
)
class OCRBenchDataset(torch.utils.data.Dataset):
"""OCRBench evaluation dataset."""
def __init__(
self,
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
vision_model_type,
):
gt = json.load(open(gt_path, encoding='utf-8'))
if num_partitions > 0:
start_idx, end_idx = _get_partition_bounds(
len(gt), num_samples_per_partition, num_partitions, partition_id
)
gt = gt[start_idx:end_idx]
self._input_image_path = input_image_path
self._gt = gt
self._img_h = img_h
self._img_w = img_w
self._use_tiling = use_tiling
self._max_num_tiles = max_num_tiles
self._use_thumbnail = use_thumbnail
self._vision_model_type = vision_model_type
def __len__(self):
return len(self._gt)
def __getitem__(self, idx):
img_path = os.path.join(self._input_image_path, self._gt[idx]['image_path'])
img = Image.open(img_path)
imgs = get_visual_transform(
img,
self._img_h,
self._img_w,
self._use_tiling,
self._max_num_tiles,
self._use_thumbnail,
augment=False,
vision_model_type=self._vision_model_type,
)
tile_count = torch.tensor([len(imgs)], dtype=torch.int)
metadata = {
"dataset_name": self._gt[idx]["dataset_name"],
"data_type": self._gt[idx]["type"],
}
return (
torch.stack(imgs),
tile_count,
idx,
self._gt[idx]["question"],
self._gt[idx]["answers"],
metadata,
)
class MathVistaDataset(torch.utils.data.Dataset):
"""MathVista evaluation dataset."""
def __init__(
self,
input_image_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
vision_model_type,
):
import datasets
hf_datasets_cache = os.environ["HF_DATASETS_CACHE"]
assert hf_datasets_cache != "", "Please set the environment variable HF_DATASETS_CACHE."
if os.path.exists(input_image_path):
dataset = datasets.load_dataset(
input_image_path, cache_dir=hf_datasets_cache, verification_mode="no_checks"
)
else:
dataset = datasets.load_dataset(
"AI4Math/MathVista", split="testmini", cache_dir=hf_datasets_cache
)
if num_partitions > 0:
start_idx, end_idx = _get_partition_bounds(
len(dataset), num_samples_per_partition, num_partitions, partition_id
)
dataset = dataset[start_idx:end_idx]
self._dataset = dataset
self._img_h = img_h
self._img_w = img_w
self._use_tiling = use_tiling
self._max_num_tiles = max_num_tiles
self._use_thumbnail = use_thumbnail
self._vision_model_type = vision_model_type
def __len__(self):
return len(self._dataset["pid"])
def __getitem__(self, idx):
# Already a PIL object.
img = self._dataset['decoded_image'][idx]
imgs = get_visual_transform(
img,
self._img_h,
self._img_w,
self._use_tiling,
self._max_num_tiles,
self._use_thumbnail,
augment=False,
vision_model_type=self._vision_model_type,
)
tile_count = torch.tensor([len(imgs)], dtype=torch.int)
question_id = self._dataset["pid"][idx]
question = self._dataset["question"][idx]
question_type = self._dataset["question_type"][idx] # free_form or multi_choice
query = self._dataset["query"][idx]
choices = self._dataset["choices"][idx]
answer = self._dataset["answer"][idx]
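        # For multi_choice questions, the choices are relabeled with letters; e.g. choices
        # ["2", "4", "8"] with answer "4" become "A. 2\nB. 4\nC. 8" and the answer "B".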
if question_type == 'multi_choice':
start_chr = 'A'
choices_str = ''
index2ans = {}
all_choices = []
for choice in choices:
all_choices.append(start_chr)
index2ans[start_chr] = choice
choices_str += f"{start_chr}. {choice}\n"
start_chr = chr(ord(start_chr) + 1)
question = question + '\n' + choices_str
question = question + "Answer with the option's letter from the given choices directly."
answer = chr(ord('A') + choices.index(answer))
else:
question = query.replace("Hint: ", "")
index2ans = {}
all_choices = []
metadata = {
"question_type": question_type,
"index2ans": index2ans,
"all_choices": all_choices,
}
return torch.stack(imgs), tile_count, question_id, question, answer, metadata
class AI2DDataset(torch.utils.data.Dataset):
"""AI2D evaluation dataset."""
def __init__(
self,
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
no_mask,
vision_model_type,
):
with open(gt_path, 'r') as f:
jsonl = list(f)
gt = [json.loads(json_str) for json_str in jsonl]
if num_partitions > 0:
start_idx, end_idx = _get_partition_bounds(
len(gt), num_samples_per_partition, num_partitions, partition_id
)
gt = gt[start_idx:end_idx]
self._gt = gt
self._input_image_path = input_image_path
self._img_h = img_h
self._img_w = img_w
self._use_tiling = use_tiling
self._max_num_tiles = max_num_tiles
self._use_thumbnail = use_thumbnail
self._no_mask = no_mask
self._vision_model_type = vision_model_type
def __len__(self):
return len(self._gt)
def __getitem__(self, idx):
img_path = os.path.join(self._input_image_path, self._gt[idx]['image'])
        if self._no_mask:
            img_path = img_path.replace("AI2D_TEST", "AI2D_TEST_NO_MASK_IMAGES")
img = Image.open(img_path)
imgs = get_visual_transform(
img,
self._img_h,
self._img_w,
self._use_tiling,
self._max_num_tiles,
self._use_thumbnail,
augment=False,
vision_model_type=self._vision_model_type,
)
tile_count = torch.tensor([len(imgs)], dtype=torch.int)
metadata = "" # Not used.
return (
torch.stack(imgs),
tile_count,
self._gt[idx]["question_id"],
self._gt[idx]["question"],
self._gt[idx]["answer"],
metadata,
)
def get_evaluation_dataset(
task,
input_image_path,
gt_path,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
num_samples_per_partition,
num_partitions,
partition_id,
num_frames,
vision_model_type,
):
"""Get an evaluation dataset."""
if task == "TextVQA":
keys = {
"image_id": "image_id",
"sample_id": "question_id",
"question": "question",
"answer": "answers",
}
dataset = VQADataset(
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
keys,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
vision_model_type,
)
elif task == "VQAv2":
keys = {
"image_id": "image",
"sample_id": "question_id",
"question": "question",
"answer": "answer",
}
dataset = VQADataset(
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
keys,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
vision_model_type,
)
elif task == "ChartQA":
keys = {"image_id": "imgname", "question": "query", "answer": "label"}
dataset = VQADataset(
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
keys,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
vision_model_type,
)
elif task == "captioning":
dataset = CaptioningDataset(
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
vision_model_type,
)
elif task == 'MMMU':
# Note:
# - prompt_style="single_image" uses only one image like in the MMMU repo example.
# - prompt_style="multi_image" uses multiple input images.
# - prompt_style="vlmevalkit" is similar to https://github.com/open-compass/VLMEvalKit/blob/5d3cebcf18ef4bfbadc3bd3ef80bdc7aad2c6557/vlmeval/vlm/internvl_chat.py#L499
dataset = MMMUDataset(
input_image_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
prompt_style="single_image",
vision_model_type=vision_model_type,
)
elif task == "VideoMME":
dataset = VideoMMMEDataset(
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
num_frames,
vision_model_type,
)
elif task == "OCRBench":
dataset = OCRBenchDataset(
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
vision_model_type,
)
elif task == "MathVista":
dataset = MathVistaDataset(
input_image_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
vision_model_type,
)
elif task == "AI2D":
dataset = AI2DDataset(
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
no_mask=False,
vision_model_type=vision_model_type,
)
else:
raise NotImplementedError(f"unsupported task {task}")
return dataset
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
"""Evaluation datasets."""
import glob
import itertools
import json
import os
import re
from collections import defaultdict
import numpy as np
import torch
from image_processing import get_visual_transform
from PIL import Image
from megatron.training import print_rank_0
def _get_partition_bounds(
total_num_samples, num_samples_per_partition, num_partitions, partition_id
):
if num_samples_per_partition == 0:
samples_per_partition = [
int(x) for x in np.linspace(0, total_num_samples, num_partitions + 1)
]
return samples_per_partition[partition_id], samples_per_partition[partition_id + 1]
return num_samples_per_partition * partition_id, num_samples_per_partition * (partition_id + 1)
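# _get_partition_bounds example: with total_num_samples=100, num_partitions=4 and
# num_samples_per_partition=0, partition_id=1 covers the half-open range [25, 50); with
# num_samples_per_partition=10, partition_id=1 covers [10, 20) regardless of the total.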
class VQADataset(torch.utils.data.Dataset):
"""VQA evaluation dataset."""
def __init__(
self,
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
keys,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
vision_model_type,
):
samples = json.load(open(gt_path, encoding='utf-8'))
if "data" in samples:
samples = samples["data"]
# Optionally, process only a subset of the input files.
if num_partitions > 0:
lb, ub = _get_partition_bounds(
len(samples), num_samples_per_partition, num_partitions, partition_id
)
samples = samples[lb:ub]
self._keys = keys
self._samples = samples
self._input_image_path = input_image_path
self._img_h = img_h
self._img_w = img_w
self._use_tiling = use_tiling
self._max_num_tiles = max_num_tiles
self._use_thumbnail = use_thumbnail
self._vision_model_type = vision_model_type
def __len__(self):
return len(self._samples)
def __getitem__(self, idx):
sample = self._samples[idx]
img_file = "{}/{}".format(self._input_image_path, sample[self._keys["image_id"]])
if not os.path.exists(img_file):
img_file += ".jpg"
if not os.path.exists(img_file):
img_file = img_file.replace('.jpg', '.png')
img = Image.open(img_file)
imgs = get_visual_transform(
img,
self._img_h,
self._img_w,
self._use_tiling,
self._max_num_tiles,
self._use_thumbnail,
augment=False,
vision_model_type=self._vision_model_type,
)
tile_count = torch.tensor([len(imgs)], dtype=torch.int)
sample_id = idx
if "sample_id" in self._keys:
sample_id = sample[self._keys["sample_id"]]
metadata = "" # Not used.
return (
torch.stack(imgs),
tile_count,
sample_id,
sample[self._keys["question"]],
sample[self._keys["answer"]],
metadata,
)
class CaptioningDataset(torch.utils.data.Dataset):
"""Captioning evaluation dataset."""
def __init__(
self,
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
vision_model_type,
):
image_files = sorted(glob.glob(input_image_path + "/*"))
# Optionally, process only a subset of the input files.
if num_partitions > 0:
lb, ub = _get_partition_bounds(
len(image_files), num_samples_per_partition, num_partitions, partition_id
)
image_files = image_files[lb:ub]
gts = json.load(open(gt_path))
answers = defaultdict(list)
for gt in gts["annotations"]:
answers[gt["image_id"]].append(gt['caption'])
self._image_files = image_files
self._answers = answers
self._img_h = img_h
self._img_w = img_w
self._use_tiling = use_tiling
self._max_num_tiles = max_num_tiles
self._use_thumbnail = use_thumbnail
self._vision_model_type = vision_model_type
def __len__(self):
return len(self._image_files)
def __getitem__(self, idx):
img_file = self._image_files[idx]
image_id = int(img_file.split("_")[-1].split(".")[0])
img = Image.open(img_file)
imgs = get_visual_transform(
img,
self._img_h,
self._img_w,
self._use_tiling,
self._max_num_tiles,
self._use_thumbnail,
augment=False,
vision_model_type=self._vision_model_type,
)
tile_count = torch.tensor([len(imgs)], dtype=torch.int)
question = "" # Fixed for all samples.
metadata = "" # Not used.
return torch.stack(imgs), tile_count, image_id, question, self._answers[image_id], metadata
class MMMUDataset(torch.utils.data.Dataset):
"""MMMU evaluation dataset."""
def __init__(
self,
input_image_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
prompt_style,
vision_model_type,
):
import datasets
from MMMU.mmmu.utils.data_utils import CAT_SHORT2LONG, load_yaml
# The following downloads the MMMU dataset from HuggingFace and uses the API from the MMMU github repo to run MMMU evaluation.
all_mmmu_datasets = []
hf_datasets_cache = os.environ["HF_DATASETS_CACHE"]
assert hf_datasets_cache != "", "Please set the environment variable HF_DATASETS_CACHE."
for subject in CAT_SHORT2LONG.values():
            # Use a local copy of the dataset if it exists (can be faster); otherwise, use the HF one.
if os.path.exists(input_image_path):
subject_dataset = datasets.load_dataset(
os.path.join(input_image_path, subject),
split=datasets.Split.VALIDATION,
cache_dir=hf_datasets_cache,
verification_mode="no_checks",
)
else:
subject_dataset = datasets.load_dataset(
"MMMU/MMMU",
subject,
split=datasets.Split.VALIDATION,
cache_dir=hf_datasets_cache,
)
all_mmmu_datasets.append(subject_dataset)
dataset = datasets.concatenate_datasets(all_mmmu_datasets)
dataset = [s for s in dataset if s['id'].startswith("val")]
# Optionally, process only a subset of the input files.
if num_partitions > 0:
lb, ub = _get_partition_bounds(
len(dataset), num_samples_per_partition, num_partitions, partition_id
)
dataset = dataset[lb:ub]
# Using the LLaVA config from the MMMU repo.
config = load_yaml("examples/multimodal/MMMU/mmmu/configs/llava1.5.yaml")
for k, v in config.items():
if isinstance(v, list):
assert len(v) == 1, "only one value supported."
config[k] = v[0]
self._config = config
self._dataset = dataset
self._img_h = img_h
self._img_w = img_w
self._use_tiling = use_tiling
self._max_num_tiles = max_num_tiles
self._use_thumbnail = use_thumbnail
self._prompt_style = prompt_style
self._vision_model_type = vision_model_type
def __len__(self):
return len(self._dataset)
def __getitem__(self, idx):
from MMMU.mmmu.utils.data_utils import construct_prompt, process_single_sample
sample = self._dataset[idx]
# Use the single image approach from the MMMU repo.
if self._prompt_style == "single_image":
sample = process_single_sample(sample)
sample = construct_prompt(sample, self._config)
img = sample["image"]
sample_imgs = get_visual_transform(
img,
self._img_h,
self._img_w,
self._use_tiling,
self._max_num_tiles,
self._use_thumbnail,
augment=False,
vision_model_type=self._vision_model_type,
)
sample_num_tiles = [len(sample_imgs)]
prompt = sample["final_input_prompt"]
for i in range(8):
prompt = prompt.replace(f"<image {i}>", "")
sample["final_input_prompt"] = f"<image>\n{prompt}"
elif self._prompt_style == "vlmevalkit":
sample = construct_prompt(sample, self._config)
if sample["question_type"] == "multiple-choice":
question = sample["question"]
options = ""
for k, v in sample["index2ans"].items():
options += f"{k}. {v}\n"
final_prompt = f"{question}\n"
if "hint" in sample:
final_prompt += f"Hint: {sample['hint']}\n"
if "task_instructions" in sample:
final_prompt += f"Task instructions: {sample['task_instructions']}\n"
final_prompt += options
final_prompt += "Answer with the option's letter from the given choices directly."
sample["final_input_prompt"] = final_prompt.rstrip()
else:
question = sample["question"]
final_prompt = f"{question}\n"
final_prompt += "Answer the question directly."
sample["final_input_prompt"] = final_prompt.rstrip()
sample_imgs = []
sample_num_tiles = []
img_indices = sorted(list(set(re.findall(r"<image (\d+)", sample["final_input_prompt"]))))
# If there are multiple input images, we need to avoid the number of image embeddings getting too large.
adjusted_max_num_tiles = max(1, self._max_num_tiles // len(img_indices))
adjusted_max_num_tiles = min(adjusted_max_num_tiles, self._max_num_tiles)
for img_idx in img_indices:
img_key = f"image_{img_idx}"
img_str = f"<image {img_idx}>"
img = sample[img_key]
assert img is not None, f"{img_str} is in prompt but not in sample images"
imgs = get_visual_transform(
img,
self._img_h,
self._img_w,
self._use_tiling,
adjusted_max_num_tiles,
self._use_thumbnail,
augment=False,
vision_model_type=self._vision_model_type,
) # List of tiles.
sample_imgs.extend(imgs)
sample_num_tiles.append(len(imgs))
sample["final_input_prompt"] = " ".join([f'<image {i + 1}><image>' for i in range(len(img_indices))]) + "\n" + sample["final_input_prompt"]
elif self._prompt_style == "multi_image":
sample = construct_prompt(sample, self._config)
sample_imgs = []
sample_num_tiles = []
img_indices = re.findall(r"<image (\d+)", sample["final_input_prompt"])
# If there are multiple input images, we need to avoid the number of image embeddings getting too large.
adjusted_max_num_tiles = max(1, self._max_num_tiles // len(img_indices))
for img_idx in img_indices:
img_key = f"image_{img_idx}"
img_str = f"<image {img_idx}>"
img = sample[img_key]
assert img is not None, f"{img_str} is in prompt but not in sample images"
# Note: Only replace the current image tag.
sample["final_input_prompt"] = sample["final_input_prompt"].replace(
img_str, "<image>", 1
)
imgs = get_visual_transform(
img,
self._img_h,
self._img_w,
self._use_tiling,
adjusted_max_num_tiles,
self._use_thumbnail,
augment=False,
vision_model_type=self._vision_model_type,
) # List of tiles.
sample_imgs.extend(imgs)
sample_num_tiles.append(len(imgs))
# Sanity check.
for i in range(1, 8):
assert (
f"<image {i}>" not in sample["final_input_prompt"]
), "prompt contains unhandled image tags"
else:
raise ValueError(f"unknown prompt style {self._prompt_style}")
# MMMU specific metadata.
metadata = {"question_type": sample["question_type"]}
if sample["question_type"] == "multiple-choice":
metadata["index2ans"] = sample["index2ans"]
metadata["all_choices"] = sample["all_choices"]
prompt = sample['final_input_prompt']
tile_count = torch.tensor(sample_num_tiles, dtype=torch.int)
return (
torch.stack(sample_imgs),
tile_count,
sample["id"],
prompt,
sample["answer"],
metadata,
)
class VideoMMEDataset(torch.utils.data.Dataset):
"Video MME evaluation dataset."
def __init__(
self,
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
num_frames,
vision_model_type,
):
ground_truth_original = json.load(open(gt_path))
ground_truth = []
for gt in ground_truth_original:
video_path = gt["url"]
video_path = video_path.replace("https://www.youtube.com/watch?v=", "")
video_path = video_path.replace("https://m.youtube.com/watch?v=", "")
video_path = os.path.join(input_image_path, video_path + ".mp4")
if not os.path.exists(video_path):
continue
gt["video_path"] = video_path
ground_truth.append(gt)
ground_truth = sorted(ground_truth, key=lambda gt: gt["video_path"])
print_rank_0(f"Found {len(ground_truth)} videos to process.")
if num_partitions > 0:
start_idx, end_idx = _get_partition_bounds(
len(ground_truth), num_samples_per_partition, num_partitions, partition_id
)
ground_truth = ground_truth[start_idx:end_idx]
self._ground_truth = ground_truth
self._img_h = img_h
self._img_w = img_w
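        # Note: tiling is disabled for video frames below, regardless of the use_tiling argument.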
self._use_tiling = False
self._max_num_tiles = max_num_tiles
self._use_thumbnail = use_thumbnail
self._num_frames = num_frames
self._vision_model_type = vision_model_type
def __len__(self):
return len(self._ground_truth)
def __getitem__(self, idx):
from torchvision.io import read_video
gt = self._ground_truth[idx]
video, _, _ = read_video(gt["video_path"], start_pts=0, end_pts=None, pts_unit='sec')
video = video.numpy()
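        # Sample self._num_frames frames evenly across the clip; e.g. a 100-frame video with
        # num_frames=4 yields frame indices [0, 33, 66, 99].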
selected_frames = torch.linspace(0, video.shape[0] - 1, self._num_frames).long()
video_frames = video[selected_frames]
if self._num_frames == 1:
video_frames = video_frames[None]
imgs = []
for img in video_frames:
from torchvision.transforms import ToPILImage
to_pil = ToPILImage()
img = to_pil(img)
imgs += get_visual_transform(
img, self._img_h, self._img_w, self._use_tiling, self._max_num_tiles,
self._use_thumbnail, augment=False, vision_model_type=self._vision_model_type
)
for question in gt["questions"]:
# Very hacky, but we essentially re-create gt holding only the
            # question of interest. This is to make this generation script
# compatible with the Video MME evaluation script.
question_dict = {
"video_id": gt["video_id"],
"duration_category": gt["duration_category"],
"video_category": gt["video_category"],
"video_subcategory": gt["video_subcategory"],
"url": gt["url"],
"questions": [question],
}
num_tiles = torch.tensor([len(imgs)], dtype=torch.int)
answer = ""
metadata = ""
return (
torch.stack(imgs),
num_tiles,
question["question_id"],
question_dict,
answer,
metadata,
)
class OCRBenchDataset(torch.utils.data.Dataset):
"""OCRBench evaluation dataset."""
def __init__(
self,
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
vision_model_type,
):
gt = json.load(open(gt_path, encoding='utf-8'))
if num_partitions > 0:
start_idx, end_idx = _get_partition_bounds(
len(gt), num_samples_per_partition, num_partitions, partition_id
)
gt = gt[start_idx:end_idx]
self._input_image_path = input_image_path
self._gt = gt
self._img_h = img_h
self._img_w = img_w
self._use_tiling = use_tiling
self._max_num_tiles = max_num_tiles
self._use_thumbnail = use_thumbnail
self._vision_model_type = vision_model_type
def __len__(self):
return len(self._gt)
def __getitem__(self, idx):
img_path = os.path.join(self._input_image_path, self._gt[idx]['image_path'])
img = Image.open(img_path)
imgs = get_visual_transform(
img,
self._img_h,
self._img_w,
self._use_tiling,
self._max_num_tiles,
self._use_thumbnail,
augment=False,
vision_model_type=self._vision_model_type,
)
tile_count = torch.tensor([len(imgs)], dtype=torch.int)
metadata = {
"dataset_name": self._gt[idx]["dataset_name"],
"data_type": self._gt[idx]["type"],
}
return (
torch.stack(imgs),
tile_count,
idx,
self._gt[idx]["question"],
self._gt[idx]["answers"],
metadata,
)
class MathVistaDataset(torch.utils.data.Dataset):
"""MathVista evaluation dataset."""
def __init__(
self,
input_image_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
vision_model_type,
):
import datasets
hf_datasets_cache = os.environ["HF_DATASETS_CACHE"]
assert hf_datasets_cache != "", "Please set the environment variable HF_DATASETS_CACHE."
if os.path.exists(input_image_path):
dataset = datasets.load_dataset(
input_image_path, cache_dir=hf_datasets_cache, verification_mode="no_checks"
)
else:
dataset = datasets.load_dataset(
"AI4Math/MathVista", split="testmini", cache_dir=hf_datasets_cache
)
if num_partitions > 0:
start_idx, end_idx = _get_partition_bounds(
len(dataset), num_samples_per_partition, num_partitions, partition_id
)
dataset = dataset[start_idx:end_idx]
self._dataset = dataset
self._img_h = img_h
self._img_w = img_w
self._use_tiling = use_tiling
self._max_num_tiles = max_num_tiles
self._use_thumbnail = use_thumbnail
self._vision_model_type = vision_model_type
def __len__(self):
return len(self._dataset["pid"])
def __getitem__(self, idx):
# Already a PIL object.
img = self._dataset['decoded_image'][idx]
imgs = get_visual_transform(
img,
self._img_h,
self._img_w,
self._use_tiling,
self._max_num_tiles,
self._use_thumbnail,
augment=False,
vision_model_type=self._vision_model_type,
)
tile_count = torch.tensor([len(imgs)], dtype=torch.int)
question_id = self._dataset["pid"][idx]
question = self._dataset["question"][idx]
question_type = self._dataset["question_type"][idx] # free_form or multi_choice
query = self._dataset["query"][idx]
choices = self._dataset["choices"][idx]
answer = self._dataset["answer"][idx]
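        # For multi_choice questions, the choices are relabeled with letters; e.g. choices
        # ["2", "4", "8"] with answer "4" become "A. 2\nB. 4\nC. 8" and the answer "B".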
if question_type == 'multi_choice':
start_chr = 'A'
choices_str = ''
index2ans = {}
all_choices = []
for choice in choices:
all_choices.append(start_chr)
index2ans[start_chr] = choice
choices_str += f"{start_chr}. {choice}\n"
start_chr = chr(ord(start_chr) + 1)
question = question + '\n' + choices_str
question = question + "Answer with the option's letter from the given choices directly."
answer = chr(ord('A') + choices.index(answer))
else:
question = query.replace("Hint: ", "")
index2ans = {}
all_choices = []
metadata = {
"question_type": question_type,
"index2ans": index2ans,
"all_choices": all_choices,
}
return torch.stack(imgs), tile_count, question_id, question, answer, metadata
class AI2DDataset(torch.utils.data.Dataset):
"""AI2D evaluation dataset."""
def __init__(
self,
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
no_mask,
vision_model_type,
):
with open(gt_path, 'r') as f:
jsonl = list(f)
gt = [json.loads(json_str) for json_str in jsonl]
if num_partitions > 0:
start_idx, end_idx = _get_partition_bounds(
len(gt), num_samples_per_partition, num_partitions, partition_id
)
gt = gt[start_idx:end_idx]
self._gt = gt
self._input_image_path = input_image_path
self._img_h = img_h
self._img_w = img_w
self._use_tiling = use_tiling
self._max_num_tiles = max_num_tiles
self._use_thumbnail = use_thumbnail
self._no_mask = no_mask
self._vision_model_type = vision_model_type
def __len__(self):
return len(self._gt)
def __getitem__(self, idx):
img_path = os.path.join(self._input_image_path, self._gt[idx]['image'])
        if self._no_mask:
            img_path = img_path.replace("AI2D_TEST", "AI2D_TEST_NO_MASK_IMAGES")
img = Image.open(img_path)
imgs = get_visual_transform(
img,
self._img_h,
self._img_w,
self._use_tiling,
self._max_num_tiles,
self._use_thumbnail,
augment=False,
vision_model_type=self._vision_model_type,
)
tile_count = torch.tensor([len(imgs)], dtype=torch.int)
metadata = "" # Not used.
return (
torch.stack(imgs),
tile_count,
self._gt[idx]["question_id"],
self._gt[idx]["question"],
self._gt[idx]["answer"],
metadata,
)
def get_evaluation_dataset(
task,
input_image_path,
gt_path,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
num_samples_per_partition,
num_partitions,
partition_id,
num_frames,
vision_model_type,
):
"""Get an evaluation dataset."""
if task == "TextVQA":
keys = {
"image_id": "image_id",
"sample_id": "question_id",
"question": "question",
"answer": "answers",
}
dataset = VQADataset(
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
keys,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
vision_model_type,
)
elif task == "VQAv2":
keys = {
"image_id": "image",
"sample_id": "question_id",
"question": "question",
"answer": "answer",
}
dataset = VQADataset(
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
keys,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
vision_model_type,
)
elif task == "ChartQA":
keys = {"image_id": "imgname", "question": "query", "answer": "label"}
dataset = VQADataset(
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
keys,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
vision_model_type,
)
elif task == "captioning":
dataset = CaptioningDataset(
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
vision_model_type,
)
elif task == 'MMMU':
# Note:
# - prompt_style="single_image" uses only one image like in the MMMU repo example.
# - prompt_style="multi_image" uses multiple input images.
# - prompt_style="vlmevalkit" is similar to https://github.com/open-compass/VLMEvalKit/blob/5d3cebcf18ef4bfbadc3bd3ef80bdc7aad2c6557/vlmeval/vlm/internvl_chat.py#L499
dataset = MMMUDataset(
input_image_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
prompt_style="single_image",
vision_model_type=vision_model_type,
)
elif task == "VideoMME":
dataset = VideoMMEDataset(
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
num_frames,
vision_model_type,
)
elif task == "OCRBench":
dataset = OCRBenchDataset(
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
vision_model_type,
)
elif task == "MathVista":
dataset = MathVistaDataset(
input_image_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
vision_model_type,
)
elif task == "AI2D":
dataset = AI2DDataset(
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
no_mask=False,
vision_model_type=vision_model_type,
)
elif task == "SPDocVQA":
keys = {"sample_id": "questionId", "image_id": "image", "question": "question", "answer": "answers"}
dataset = VQADataset(
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
keys,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
vision_model_type,
)
elif task == "InfoVQA":
keys = {"sample_id": "questionId", "image_id": "image_local_name", "question": "question", "answer": "answers"}
dataset = VQADataset(
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
keys,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
vision_model_type,
)
else:
raise NotImplementedError(f"unsupported task {task}")
return dataset
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. Except portions as noted which are Copyright (c) 2023 OpenGVLab and licensed under the MIT license found in LICENSE.
from torchvision import transforms as T
from torchvision.transforms import Compose
from torchvision.transforms.functional import InterpolationMode
IMAGENET_PIXEL_MEAN = [0.485, 0.456, 0.406]
IMAGENET_PIXEL_STD = [0.229, 0.224, 0.225]
SIGLIP_PIXEL_MEAN = [0.5, 0.5, 0.5]
SIGLIP_PIXEL_STD = [0.5, 0.5, 0.5]
CLIP_PIXEL_MEAN = [0.48145466, 0.4578275, 0.40821073]
CLIP_PIXEL_STD = [0.26862954, 0.26130258, 0.27577711]
pixel_statistics = {
"clip": (CLIP_PIXEL_MEAN, CLIP_PIXEL_STD),
"siglip": (SIGLIP_PIXEL_MEAN, SIGLIP_PIXEL_STD),
"internvit": (IMAGENET_PIXEL_MEAN, IMAGENET_PIXEL_STD),
}
def get_visual_transform(img, img_h, img_w, use_tiling=False, max_num_tiles=1, use_thumbnail=False, augment=False, vision_model_type="clip"):
pixel_mean, pixel_std = pixel_statistics[vision_model_type]
assert not augment, "Image augmentation not implemented."
transform = build_transform(img_h, pixel_mean, pixel_std, vision_model_type)
if use_tiling:
assert img_h == img_w, "dynamic tiling expects equal tile height and width"
imgs = dynamic_preprocess(img, min_num=1, max_num=max_num_tiles, image_size=img_h, use_thumbnail=use_thumbnail)
imgs = [transform(img) for img in imgs]
else:
imgs = [transform(img)]
return imgs
# From https://github.com/OpenGVLab/InternVL/blob/c62fa4f7c850165d7386bdc48ac6bc5a6fab0864/internvl_chat/internvl/train/dataset.py#L685
# Copyright (c) 2023 OpenGVLab.
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
best_ratio_diff = float('inf')
best_ratio = (1, 1)
area = width * height
for ratio in target_ratios:
target_aspect_ratio = ratio[0] / ratio[1]
ratio_diff = abs(aspect_ratio - target_aspect_ratio)
if ratio_diff < best_ratio_diff:
best_ratio_diff = ratio_diff
best_ratio = ratio
elif ratio_diff == best_ratio_diff:
if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
best_ratio = ratio
# print(f'width: {width}, height: {height}, best_ratio: {best_ratio}')
return best_ratio
# From https://github.com/OpenGVLab/InternVL/blob/c62fa4f7c850165d7386bdc48ac6bc5a6fab0864/internvl_chat/internvl/train/dataset.py#L702
# Copyright (c) 2023 OpenGVLab.
def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False):
orig_width, orig_height = image.size
aspect_ratio = orig_width / orig_height
# calculate the existing image aspect ratio
target_ratios = set(
(i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
i * j <= max_num and i * j >= min_num)
target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
# find the closest aspect ratio to the target
target_aspect_ratio = find_closest_aspect_ratio(
aspect_ratio, target_ratios, orig_width, orig_height, image_size)
# calculate the target width and height
target_width = image_size * target_aspect_ratio[0]
target_height = image_size * target_aspect_ratio[1]
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
# resize the image
resized_img = image.resize((target_width, target_height))
processed_images = []
for i in range(blocks):
box = (
(i % (target_width // image_size)) * image_size,
(i // (target_width // image_size)) * image_size,
((i % (target_width // image_size)) + 1) * image_size,
((i // (target_width // image_size)) + 1) * image_size
)
# split the image
split_img = resized_img.crop(box)
processed_images.append(split_img)
assert len(processed_images) == blocks
if use_thumbnail and len(processed_images) != 1:
thumbnail_img = image.resize((image_size, image_size))
processed_images.append(thumbnail_img)
return processed_images
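# dynamic_preprocess example: a 1024x512 image with image_size=448 and max_num=6 maps to the
# target ratio (2, 1), i.e. it is resized to 896x448 and split into 2 tiles; with
# use_thumbnail=True a third 448x448 thumbnail of the full image is appended.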
# Based on https://github.com/openai/CLIP/blob/dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1/clip/clip.py#L79
# and https://github.com/OpenGVLab/InternVL/blob/aa521e6eb1df4cf153aa4118fcf13e673c055d46/internvl_chat/internvl/train/dataset.py#L276
def build_transform(input_size, pixel_mean, pixel_std, vision_model_type):
if vision_model_type in ("siglip", "internvit"):
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=pixel_mean, std=pixel_std)
])
elif vision_model_type == "clip":
transform = Compose([
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.ToTensor(),
T.Normalize(mean=pixel_mean, std=pixel_std),
])
else:
raise NotImplementedError(f"image processing not defined for vision model {vision_model_type}")
return transform
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. Except portions as noted which are Copyright (c) 2023 OpenGVLab and licensed under the MIT license found in LICENSE.
from torchvision import transforms as T
from torchvision.transforms import Compose
from torchvision.transforms.functional import InterpolationMode
IMAGENET_PIXEL_MEAN = [0.485, 0.456, 0.406]
IMAGENET_PIXEL_STD = [0.229, 0.224, 0.225]
SIGLIP_PIXEL_MEAN = [0.5, 0.5, 0.5]
SIGLIP_PIXEL_STD = [0.5, 0.5, 0.5]
CLIP_PIXEL_MEAN = [0.48145466, 0.4578275, 0.40821073]
CLIP_PIXEL_STD = [0.26862954, 0.26130258, 0.27577711]
pixel_statistics = {
"clip": (CLIP_PIXEL_MEAN, CLIP_PIXEL_STD),
"siglip": (SIGLIP_PIXEL_MEAN, SIGLIP_PIXEL_STD),
"internvit": (IMAGENET_PIXEL_MEAN, IMAGENET_PIXEL_STD),
"radio": (CLIP_PIXEL_MEAN, CLIP_PIXEL_STD),
"huggingface": (SIGLIP_PIXEL_MEAN, SIGLIP_PIXEL_STD),
}
# From https://github.com/OpenGVLab/InternVL/blob/c62fa4f7c850165d7386bdc48ac6bc5a6fab0864/internvl_chat/internvl/train/dataset.py#L685
# Copyright (c) 2023 OpenGVLab.
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
best_ratio_diff = float('inf')
best_ratio = (1, 1)
area = width * height
for ratio in target_ratios:
target_aspect_ratio = ratio[0] / ratio[1]
ratio_diff = abs(aspect_ratio - target_aspect_ratio)
if ratio_diff < best_ratio_diff:
best_ratio_diff = ratio_diff
best_ratio = ratio
elif ratio_diff == best_ratio_diff:
if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
best_ratio = ratio
return best_ratio
def find_closest_area_weighted_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
"""
Find the best number of tiles based on the aspect ratio and the area covered by the tiles.
"""
best_factor = float('-inf')
best_ratio = (1, 1)
area = width * height
for ratio in target_ratios:
target_aspect_ratio = ratio[0] / ratio[1]
factor_based_on_area_n_ratio = (
min((ratio[0]*ratio[1]*image_size*image_size)/ area, 0.6) *
min(target_aspect_ratio/aspect_ratio, aspect_ratio/target_aspect_ratio))
if factor_based_on_area_n_ratio > best_factor:
best_factor = factor_based_on_area_n_ratio
best_ratio = ratio
return best_ratio
def get_visual_transform(
img, img_h, img_w, use_tiling=False, max_num_tiles=1, use_thumbnail=False, augment=False,
vision_model_type="clip", find_closest_aspect_ratio_fn=find_closest_aspect_ratio):
pixel_mean, pixel_std = pixel_statistics[vision_model_type]
assert not augment, "Image augmentation not implemented."
transform = build_transform(img_h, pixel_mean, pixel_std, vision_model_type)
if use_tiling:
assert img_h == img_w, "dynamic tiling expects equal tile height and width"
imgs = dynamic_preprocess(
img, min_num=1, max_num=max_num_tiles, image_size=img_h, use_thumbnail=use_thumbnail,
find_closest_aspect_ratio_fn=find_closest_aspect_ratio_fn)
imgs = [transform(img) for img in imgs]
else:
imgs = [transform(img)]
return imgs
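# Illustrative usage sketch (not part of this module; assumes a local image file "example.jpg"):
#
#     from PIL import Image
#
#     img = Image.open("example.jpg")
#     tiles = get_visual_transform(
#         img, img_h=448, img_w=448, use_tiling=True, max_num_tiles=6,
#         use_thumbnail=True, augment=False, vision_model_type="siglip",
#     )
#     # `tiles` is a list of [3, 448, 448] tensors, one per tile plus an optional thumbnail.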
# From https://github.com/OpenGVLab/InternVL/blob/c62fa4f7c850165d7386bdc48ac6bc5a6fab0864/internvl_chat/internvl/train/dataset.py#L702
# Copyright (c) 2023 OpenGVLab.
def dynamic_preprocess(
image, min_num=1, max_num=6, image_size=448, use_thumbnail=False,
find_closest_aspect_ratio_fn=find_closest_aspect_ratio):
orig_width, orig_height = image.size
aspect_ratio = orig_width / orig_height
# calculate the existing image aspect ratio
target_ratios = set(
(i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
i * j <= max_num and i * j >= min_num)
target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
# find the closest aspect ratio to the target
target_aspect_ratio = find_closest_aspect_ratio_fn(
aspect_ratio, target_ratios, orig_width, orig_height, image_size)
# calculate the target width and height
target_width = image_size * target_aspect_ratio[0]
target_height = image_size * target_aspect_ratio[1]
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
# resize the image
resized_img = image.resize((target_width, target_height))
processed_images = []
for i in range(blocks):
box = (
(i % (target_width // image_size)) * image_size,
(i // (target_width // image_size)) * image_size,
((i % (target_width // image_size)) + 1) * image_size,
((i // (target_width // image_size)) + 1) * image_size
)
# split the image
split_img = resized_img.crop(box)
processed_images.append(split_img)
assert len(processed_images) == blocks
if use_thumbnail and len(processed_images) != 1:
thumbnail_img = image.resize((image_size, image_size))
processed_images.append(thumbnail_img)
return processed_images
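# dynamic_preprocess example (with the default find_closest_aspect_ratio_fn): a 1024x512 image
# with image_size=448 and max_num=6 maps to the target ratio (2, 1), i.e. it is resized to
# 896x448 and split into 2 tiles; with use_thumbnail=True a 448x448 thumbnail is appended.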
# Based on https://github.com/openai/CLIP/blob/dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1/clip/clip.py#L79
# and https://github.com/OpenGVLab/InternVL/blob/aa521e6eb1df4cf153aa4118fcf13e673c055d46/internvl_chat/internvl/train/dataset.py#L276
def build_transform(input_size, pixel_mean, pixel_std, vision_model_type):
if vision_model_type in ("siglip", "internvit", "radio", "huggingface"):
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=pixel_mean, std=pixel_std)
])
elif vision_model_type == "clip":
transform = Compose([
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.ToTensor(),
T.Normalize(mean=pixel_mean, std=pixel_std),
])
else:
raise NotImplementedError(f"image processing not defined for vision model {vision_model_type}")
return transform
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import warnings

import torch
from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
from megatron.core.transformer.dot_product_attention import DotProductAttention
from megatron.core.transformer.enums import AttnMaskType
from megatron.core.transformer.identity_op import IdentityOp
from megatron.core.transformer.mlp import MLP, MLPSubmodules
from megatron.core.transformer.spec_utils import ModuleSpec
from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules
try:
from megatron.core.extensions.transformer_engine import (
TEColumnParallelLinear,
TEDotProductAttention,
TELayerNormColumnParallelLinear,
TENorm,
TERowParallelLinear,
)
HAVE_TE = True
except ImportError:
HAVE_TE = False
try:
import apex
from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
from megatron.core.transformer.torch_norm import WrappedTorchNorm
HAVE_APEX = True
LNImpl = FusedLayerNorm
except ImportError:
    from megatron.core.transformer.torch_norm import WrappedTorchNorm
    warnings.warn('Apex is not installed. Falling back to Torch Norm.')
LNImpl = WrappedTorchNorm
def get_layer_spec(is_vit, normalization) -> ModuleSpec:
attn_mask_type = AttnMaskType.no_mask if is_vit else AttnMaskType.causal
if normalization == "LayerNorm":
norm = LNImpl
elif normalization == "RMSNorm":
if HAVE_TE:
norm = TENorm
else:
            version = torch.__version__.split('.')
            version_geq_2_4 = int(version[0]) > 2 or (
                int(version[0]) == 2 and int(version[1]) >= 4
            )
            assert version_geq_2_4, "Torch version >= 2.4.0 is required for RMSNorm"
            if HAVE_APEX:
                warnings.warn('Apex does not support RMSNorm. Falling back to Torch Norm.')
norm = WrappedTorchNorm
else:
raise RuntimeError("unknown normalization", normalization)
mlp = get_mlp_module_spec(use_te=False) # doesn't include norm.
return ModuleSpec(
module=TransformerLayer,
submodules=TransformerLayerSubmodules(
input_layernorm=norm,
self_attention=ModuleSpec(
module=SelfAttention,
params={"attn_mask_type": attn_mask_type},
submodules=SelfAttentionSubmodules(
linear_qkv=ColumnParallelLinear,
core_attention=DotProductAttention,
linear_proj=RowParallelLinear,
q_layernorm=IdentityOp,
k_layernorm=IdentityOp,
),
),
self_attn_bda=get_bias_dropout_add,
pre_mlp_layernorm=norm,
mlp=mlp,
mlp_bda=get_bias_dropout_add,
),
)
def get_layer_spec_te(is_vit=False) -> ModuleSpec:
attn_mask_type = AttnMaskType.no_mask if is_vit else AttnMaskType.causal
mlp = get_norm_mlp_module_spec_te()
return ModuleSpec(
module=TransformerLayer,
submodules=TransformerLayerSubmodules(
self_attention=ModuleSpec(
module=SelfAttention,
params={"attn_mask_type": attn_mask_type},
submodules=SelfAttentionSubmodules(
linear_qkv=TELayerNormColumnParallelLinear,
core_attention=TEDotProductAttention,
linear_proj=TERowParallelLinear,
q_layernorm=IdentityOp,
k_layernorm=IdentityOp,
),
),
self_attn_bda=get_bias_dropout_add,
pre_mlp_layernorm=IdentityOp,
mlp=mlp,
mlp_bda=get_bias_dropout_add,
),
)
def get_mlp_module_spec(use_te: bool = True) -> ModuleSpec:
# Dense MLP w/ or w/o TE modules.
return ModuleSpec(
module=MLP,
submodules=MLPSubmodules(
linear_fc1=TEColumnParallelLinear if use_te else ColumnParallelLinear,
linear_fc2=TERowParallelLinear if use_te else RowParallelLinear,
),
)
def get_norm_mlp_module_spec_te() -> ModuleSpec:
return ModuleSpec(
module=MLP,
submodules=MLPSubmodules(
linear_fc1=TELayerNormColumnParallelLinear, linear_fc2=TERowParallelLinear
),
)
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import warnings

import torch
from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
from megatron.core.transformer.dot_product_attention import DotProductAttention
from megatron.core.transformer.enums import AttnMaskType
from megatron.core.transformer.identity_op import IdentityOp
from megatron.core.transformer.mlp import MLP, MLPSubmodules
from megatron.core.transformer.spec_utils import ModuleSpec
from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules
try:
from megatron.core.extensions.transformer_engine import (
TEColumnParallelLinear,
TEDotProductAttention,
TELayerNormColumnParallelLinear,
TENorm,
TERowParallelLinear,
)
HAVE_TE = True
except ImportError:
HAVE_TE = False
try:
import apex
from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
from megatron.core.transformer.torch_norm import WrappedTorchNorm
HAVE_APEX = True
LNImpl = FusedLayerNorm
except ImportError:
    from megatron.core.transformer.torch_norm import WrappedTorchNorm
    warnings.warn('Apex is not installed. Falling back to Torch Norm.')
LNImpl = WrappedTorchNorm
def get_layer_spec(is_vit, normalization) -> ModuleSpec:
attn_mask_type = AttnMaskType.no_mask if is_vit else AttnMaskType.causal
if normalization == "LayerNorm":
norm = LNImpl
elif normalization == "RMSNorm":
if HAVE_TE:
norm = TENorm
else:
            version = torch.__version__.split('.')
            version_geq_2_4 = int(version[0]) > 2 or (
                int(version[0]) == 2 and int(version[1]) >= 4
            )
            assert version_geq_2_4, "Torch version >= 2.4.0 is required for RMSNorm"
            if HAVE_APEX:
                warnings.warn('Apex does not support RMSNorm. Falling back to Torch Norm.')
norm = WrappedTorchNorm
else:
raise RuntimeError("unknown normalization", normalization)
mlp = get_mlp_module_spec(use_te=False) # doesn't include norm.
return ModuleSpec(
module=TransformerLayer,
submodules=TransformerLayerSubmodules(
input_layernorm=norm,
self_attention=ModuleSpec(
module=SelfAttention,
params={"attn_mask_type": attn_mask_type},
submodules=SelfAttentionSubmodules(
linear_qkv=ColumnParallelLinear,
core_attention=DotProductAttention,
linear_proj=RowParallelLinear,
q_layernorm=IdentityOp,
k_layernorm=IdentityOp,
),
),
self_attn_bda=get_bias_dropout_add,
pre_mlp_layernorm=norm,
mlp=mlp,
mlp_bda=get_bias_dropout_add,
),
)
def get_layer_spec_te(is_vit=False, padding=False) -> ModuleSpec:
attn_mask_type = AttnMaskType.no_mask if is_vit else AttnMaskType.causal
# Padding mask is needed for e.g. Context Parallel.
if padding:
assert not is_vit, "padding_causal mask not used with ViT"
attn_mask_type = AttnMaskType.padding_causal
mlp = get_norm_mlp_module_spec_te()
return ModuleSpec(
module=TransformerLayer,
submodules=TransformerLayerSubmodules(
self_attention=ModuleSpec(
module=SelfAttention,
params={"attn_mask_type": attn_mask_type},
submodules=SelfAttentionSubmodules(
linear_qkv=TELayerNormColumnParallelLinear,
core_attention=TEDotProductAttention,
linear_proj=TERowParallelLinear,
q_layernorm=IdentityOp,
k_layernorm=IdentityOp,
),
),
self_attn_bda=get_bias_dropout_add,
pre_mlp_layernorm=IdentityOp,
mlp=mlp,
mlp_bda=get_bias_dropout_add,
),
)
def get_mlp_module_spec(use_te: bool = True) -> ModuleSpec:
# Dense MLP w/ or w/o TE modules.
return ModuleSpec(
module=MLP,
submodules=MLPSubmodules(
linear_fc1=TEColumnParallelLinear if use_te else ColumnParallelLinear,
linear_fc2=TERowParallelLinear if use_te else RowParallelLinear,
),
)
def get_norm_mlp_module_spec_te() -> ModuleSpec:
return ModuleSpec(
module=MLP,
submodules=MLPSubmodules(
linear_fc1=TELayerNormColumnParallelLinear, linear_fc2=TERowParallelLinear
),
)
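# Illustrative spec selection (mirrors model_provider below; `use_te` and `config` stand in for
# the caller's flag and transformer config, and Megatron-LM core is assumed to be installed):
#
#     if use_te:
#         layer_spec = get_layer_spec_te(is_vit=False, padding=False)
#     else:
#         layer_spec = get_layer_spec(is_vit=False, normalization=config.normalization)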
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import warnings
from copy import deepcopy
import torch
from config import get_language_model_config, get_vision_model_config, get_vision_projection_config
from layer_specs import get_layer_spec, get_layer_spec_te, get_mlp_module_spec, get_norm_mlp_module_spec_te
from megatron.core.models.multimodal.llava_model import IMAGE_TOKEN, LLaVAModel
from megatron.core.models.vision.clip_vit_model import get_num_image_embeddings
from megatron.training import get_args, get_tokenizer, print_rank_0
from megatron.training.arguments import core_transformer_config_from_args
def model_provider(
pre_process=True, post_process=True, add_encoder=True, add_decoder=True, parallel_output=True
) -> LLaVAModel:
"""Builds the model.
Args:
pre_process (bool): Include the embedding layer in the gpt decoder (used with pipeline parallelism). Defaults to True.
post_process (bool): Include an output layer and a layernorm in the gpt decoder (used with pipeline parallelism). Defaults to True.
add_encoder (bool): Construct the encoder module (used with pipeline parallelism). Defaults to True. When we use pipelining, the encoder
will live on only a subset of the pipeline stages (specifically, only the first stage).
add_decoder (bool): Construct the decoder module (used with pipeline parallelism). Defaults to True. When we use pipelining, the decoder
will live on only a subset of the pipeline stages (specifically, every stage after the first one).
parallel_output (bool): Enable parallel model output.
Returns:
model: A multimodal model.
"""
args = get_args()
assert args.ckpt_format == 'torch', "Only ckpt-format torch is supported for VLM training currently."
    assert args.encoder_pipeline_model_parallel_size <= 1, "LLaVA does not support pp>1 for encoder on its own pipeline rank"
use_te = args.use_te
print_rank_0('building a multimodal model ...')
num_image_embeddings = get_num_image_embeddings(
args.img_h,
args.img_w,
args.patch_dim,
args.vision_model_type,
args.disable_vision_class_token,
1,
args.pixel_shuffle,
args.use_tile_tags,
)
old_seq_length = args.seq_length
args.seq_length = args.encoder_seq_length = num_image_embeddings
if torch.distributed.get_rank() == 0 and old_seq_length != args.seq_length:
warnings.warn(
f"Changed seq_length and encoder_seq_length (vision model sequence length) from {old_seq_length} to num_image_tokens ({num_image_embeddings})"
)
max_num_image_embeddings = (args.max_num_tiles + int(args.use_thumbnail)) * num_image_embeddings
assert (
args.decoder_seq_length is not None
), "Please provide --decoder-seq-length to set the language model sequence length"
assert (
args.decoder_seq_length > max_num_image_embeddings
), "Language model sequence length must be greater than the maximum number of image embeddings"
if args.decoder_seq_length > args.max_position_embeddings:
args.max_position_embeddings = args.decoder_seq_length
warnings.warn(
f"Expanded max_position_embeddings to {args.max_position_embeddings} to accommodate the maximum language model sequence length"
)
base_config = core_transformer_config_from_args(get_args())
base_config.language_model_type = args.language_model_type
base_config.vision_model_type = args.vision_model_type
base_config.calculate_per_token_loss = True
language_config = deepcopy(base_config)
language_config = get_language_model_config(language_config)
if use_te:
language_transformer_layer_spec = get_layer_spec_te(
is_vit=False
) # TENorm detects LayerNorm/RMS automatically.
else:
language_transformer_layer_spec = get_layer_spec(
is_vit=False, normalization=language_config.normalization
)
vision_config = deepcopy(base_config)
vision_config = get_vision_model_config(
vision_config, apply_query_key_layer_scaling=args.apply_query_key_layer_scaling
)
vision_model_type = args.vision_model_type
if vision_model_type in ["clip", "siglip"]:
if use_te:
vision_transformer_layer_spec = get_layer_spec_te(
is_vit=True
) # TENorm detects LayerNorm/RMS automatically.
else:
vision_transformer_layer_spec = get_layer_spec(
is_vit=True, normalization=vision_config.normalization
)
elif vision_model_type == "internvit":
from nvlm.internvit import get_internvit_layer_spec
vision_transformer_layer_spec = get_internvit_layer_spec(use_te=use_te)
else:
raise RuntimeError("unsupported vision model type", vision_model_type)
vision_projection_config = deepcopy(base_config)
vision_projection_config = get_vision_projection_config(
vision_projection_config, language_config.hidden_size
)
# --encoder-pipeline-model-parallel-size 1 will enable a separate pipeline stage for the vision model.
if args.encoder_pipeline_model_parallel_size > 0:
assert (
args.encoder_pipeline_model_parallel_size == 1
), "vision model and projection can only live on 1 pipeline stage."
if args.encoder_tensor_model_parallel_size > 0:
vision_config.tensor_model_parallel_size = args.encoder_tensor_model_parallel_size
vision_projection_config.tensor_model_parallel_size = (
args.encoder_tensor_model_parallel_size
)
# Make sure vision model pipeline parallel size is not inherited from the language model pipeline parallel size.
    # 0 is not a valid value for this config field, hence max(1, ).
vision_config.pipeline_model_parallel_size = max(1, args.encoder_pipeline_model_parallel_size)
vision_projection_config.pipeline_model_parallel_size = vision_config.pipeline_model_parallel_size
# Make sure the vision model does not inherit first and last pipeline num layers from the language model.
vision_config.first_pipeline_num_layers = vision_config.last_pipeline_num_layers = None
if vision_projection_config.normalization:
vision_projection_layer_spec = get_norm_mlp_module_spec_te().submodules
else:
vision_projection_layer_spec = get_mlp_module_spec(use_te=use_te).submodules
# Toggle --recompute* for the vision and language model separately.
if args.recompute_vision:
if vision_config.recompute_method is not None and vision_config.recompute_granularity is not None:
vision_config.recompute_num_layers = vision_config.num_layers
else:
vision_config.recompute_granularity = None
vision_config.recompute_method = None
vision_config.recompute_num_layers = None
vision_projection_config.recompute_granularity = None
vision_projection_config.recompute_method = None
vision_projection_config.recompute_num_layers = None
tokenizer = get_tokenizer()
image_token_index = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)
tile_tags = _get_tile_tags(args, tokenizer)
model = LLaVAModel(
language_transformer_config=language_config,
language_transformer_layer_spec=language_transformer_layer_spec,
language_vocab_size=args.padded_vocab_size,
language_max_sequence_length=args.decoder_seq_length,
vision_transformer_config=vision_config,
vision_transformer_layer_spec=vision_transformer_layer_spec,
drop_vision_class_token=args.disable_vision_class_token,
vision_projection_config=vision_projection_config,
vision_projection_layer_spec=vision_projection_layer_spec,
vision_projection_type="mlp",
allow_missing_vision_projection_checkpoint=args.allow_missing_vision_projection_checkpoint,
parallel_output=parallel_output,
language_position_embedding_type=args.position_embedding_type,
language_rotary_percent=args.rotary_percent,
pre_process=pre_process,
post_process=post_process,
add_encoder=add_encoder,
add_decoder=add_decoder,
img_h=args.img_h,
img_w=args.img_w,
patch_dim=args.patch_dim,
language_rotary_base=args.rotary_base,
language_rope_scaling=args.use_rope_scaling,
image_token_index=image_token_index,
pixel_shuffle=args.pixel_shuffle,
tile_tags=tile_tags,
)
model.freeze(
freeze_language_model=args.freeze_LM,
freeze_vision_model=args.freeze_ViT,
freeze_vision_projection=False,
)
return model
def _get_tile_tags(args, tokenizer):
"""Tile tags are used in NVLM to surround image tiles with text tags."""
if not args.use_tile_tags:
return None
    # We expect the tokenized length of all tags to be the same.
thumbnail_tag_text = "<tile_global_thumbnail>"
if args.tokenizer_prompt_format == "nvlm-yi-34b":
thumbnail_tag_text = "<tile_global>"
assert args.max_num_tiles <= 6, "Up to 6 tile tags used"
tile_tags_text = [f"<tile_{i}>" for i in range(1, args.max_num_tiles + 1)] + [thumbnail_tag_text]
start_idx = 0
if tokenizer._prompt_config.has_bos:
start_idx = 1
# Convert to tokens [num_tiles, tile_seq_len].
tile_tags = [tokenizer.tokenize(t)[start_idx:] for t in tile_tags_text]
return tile_tags
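For orientation, here is a back-of-the-envelope sketch of the sequence-length bookkeeping performed above. It is illustrative only: the real count comes from get_num_image_embeddings (which also accounts for the class token, pixel shuffle and tile tags), and the 448/14 image and patch sizes are assumed example values, not settings from this repository.

# Illustrative arithmetic only; image size and patch size are assumed example values.
img_h, img_w, patch_dim = 448, 448, 14
num_image_embeddings = (img_h // patch_dim) * (img_w // patch_dim)  # 32 * 32 = 1024 vision tokens per tile

max_num_tiles, use_thumbnail = 6, True
max_num_image_embeddings = (max_num_tiles + int(use_thumbnail)) * num_image_embeddings  # 7 * 1024 = 7168

# --decoder-seq-length must be strictly greater than this worst case, and
# max_position_embeddings is expanded to decoder_seq_length when necessary.
print(num_image_embeddings, max_num_image_embeddings)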
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import warnings
from copy import deepcopy
import torch
from config import get_language_model_config, get_vision_model_config, get_vision_projection_config
from layer_specs import get_layer_spec, get_layer_spec_te, get_mlp_module_spec, get_norm_mlp_module_spec_te
from megatron.core.models.multimodal.llava_model import IMAGE_TOKEN, LLaVAModel
from megatron.core.models.vision.clip_vit_model import get_num_image_embeddings
from megatron.training import get_args, get_tokenizer, print_rank_0
from megatron.training.arguments import core_transformer_config_from_args
def model_provider(
pre_process=True, post_process=True, add_encoder=True, add_decoder=True, parallel_output=True
) -> LLaVAModel:
"""Builds the model.
Args:
pre_process (bool): Include the embedding layer in the gpt decoder (used with pipeline parallelism). Defaults to True.
post_process (bool): Include an output layer and a layernorm in the gpt decoder (used with pipeline parallelism). Defaults to True.
add_encoder (bool): Construct the encoder module (used with pipeline parallelism). Defaults to True. When we use pipelining, the encoder
will live on only a subset of the pipeline stages (specifically, only the first stage).
add_decoder (bool): Construct the decoder module (used with pipeline parallelism). Defaults to True. When we use pipelining, the decoder
will live on only a subset of the pipeline stages (specifically, every stage after the first one).
parallel_output (bool): Enable parallel model output.
Returns:
model: A multimodal model.
"""
args = get_args()
    assert args.encoder_pipeline_model_parallel_size <= 1, "LLaVA does not support pp>1 for the encoder on its own pipeline rank"
use_te = args.use_te
print_rank_0('building a multimodal model ...')
num_image_embeddings = get_num_image_embeddings(
args.img_h,
args.img_w,
args.patch_dim,
args.vision_model_type,
args.disable_vision_class_token,
1,
args.pixel_shuffle,
args.use_tile_tags,
)
old_seq_length = args.seq_length
args.seq_length = args.encoder_seq_length = num_image_embeddings
if torch.distributed.get_rank() == 0 and old_seq_length != args.seq_length:
warnings.warn(
f"Changed seq_length and encoder_seq_length (vision model sequence length) from {old_seq_length} to num_image_tokens ({num_image_embeddings})"
)
max_num_image_embeddings = (args.max_num_tiles + int(args.use_thumbnail)) * num_image_embeddings
assert (
args.decoder_seq_length is not None
), "Please provide --decoder-seq-length to set the language model sequence length"
assert (
args.decoder_seq_length > max_num_image_embeddings
), "Language model sequence length must be greater than the maximum number of image embeddings"
if args.decoder_seq_length > args.max_position_embeddings:
args.max_position_embeddings = args.decoder_seq_length
warnings.warn(
f"Expanded max_position_embeddings to {args.max_position_embeddings} to accommodate the maximum language model sequence length"
)
base_config = core_transformer_config_from_args(get_args())
base_config.language_model_type = args.language_model_type
base_config.vision_model_type = args.vision_model_type
base_config.calculate_per_token_loss = True
language_config = deepcopy(base_config)
language_config = get_language_model_config(language_config)
if use_te:
# Padding mask needed for SP/CP.
padding = args.context_parallel_size > 1 and args.sequence_parallel
language_transformer_layer_spec = get_layer_spec_te(
is_vit=False, padding=padding
) # TENorm detects LayerNorm/RMS automatically.
else:
language_transformer_layer_spec = get_layer_spec(
is_vit=False, normalization=language_config.normalization
)
vision_model_type = args.vision_model_type
vision_config = deepcopy(base_config)
vision_config = get_vision_model_config(
vision_config, apply_query_key_layer_scaling=args.apply_query_key_layer_scaling
)
if vision_model_type.startswith("huggingface"):
assert args.encoder_tensor_model_parallel_size < 2, "Huggingface vision encoders do not support --encoder-tensor-model-parallel-size > 1"
assert args.encoder_pipeline_model_parallel_size == 0, "Huggingface vision encoders do not support --encoder-pipeline-model-parallel-size > 0"
assert not args.sequence_parallel, "Huggingface models do not support --sequence-parallel"
assert args.context_parallel_size < 2, "Huggingface models do not support --context-parallel-size > 1"
assert args.vision_huggingface_model_name_or_path is not None, "Providing --vision-huggingface-model-name-or-path is necessary when using huggingface vision model"
vision_config.huggingface_model_name_or_path = args.vision_huggingface_model_name_or_path
from transformers import AutoConfig
huggingface_config = AutoConfig.from_pretrained(vision_config.huggingface_model_name_or_path)
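        # Mirror the HF encoder's hidden size so downstream configs (e.g. the vision projection input size) match the HF model.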
vision_config.hidden_size = huggingface_config.hidden_size
vision_model_type = args.vision_model_type
if vision_model_type in ["clip", "siglip", "radio"]:
if use_te:
vision_transformer_layer_spec = get_layer_spec_te(
is_vit=True
) # TENorm detects LayerNorm/RMS automatically.
else:
vision_transformer_layer_spec = get_layer_spec(
is_vit=True, normalization=vision_config.normalization
)
elif vision_model_type == "internvit":
from nvlm.internvit import get_internvit_layer_spec
vision_transformer_layer_spec = get_internvit_layer_spec(use_te=use_te)
elif vision_model_type.startswith("huggingface"):
vision_transformer_layer_spec = None
else:
raise RuntimeError("unsupported vision model type", vision_model_type)
vision_projection_config = deepcopy(base_config)
if base_config.language_model_type.startswith("huggingface"):
assert args.tensor_model_parallel_size == 1, "Huggingface models do not support --tensor-model-parallel-size > 1"
assert args.pipeline_model_parallel_size < 2, "Huggingface models do not support --pipeline-model-parallel-size > 1"
assert not args.sequence_parallel, "Huggingface models do not support --sequence-parallel"
assert args.context_parallel_size < 2, "Huggingface models do not support --context-parallel-size > 1"
assert args.language_huggingface_model_name_or_path is not None, "Providing --language-huggingface-model-name-or-path is necessary when using huggingface language model"
language_config.huggingface_model_name_or_path = args.language_huggingface_model_name_or_path
        # Pass to the vision projection config so it can choose the correct ffn hidden size.
vision_projection_config.huggingface_model_name_or_path = args.language_huggingface_model_name_or_path
vision_projection_config = get_vision_projection_config(
vision_projection_config, language_config.hidden_size
)
# --encoder-pipeline-model-parallel-size 1 will enable a separate pipeline stage for the vision model.
if args.encoder_pipeline_model_parallel_size > 0:
assert (
args.encoder_pipeline_model_parallel_size == 1
), "vision model and projection can only live on 1 pipeline stage."
if args.encoder_tensor_model_parallel_size > 0:
vision_config.tensor_model_parallel_size = args.encoder_tensor_model_parallel_size
vision_projection_config.tensor_model_parallel_size = (
args.encoder_tensor_model_parallel_size
)
# Make sure vision model pipeline parallel size is not inherited from the language model pipeline parallel size.
    # 0 is not a valid value for this config field, hence max(1, ).
vision_config.pipeline_model_parallel_size = max(1, args.encoder_pipeline_model_parallel_size)
vision_projection_config.pipeline_model_parallel_size = vision_config.pipeline_model_parallel_size
# Make sure the vision model does not inherit first and last pipeline num layers from the language model.
vision_config.first_pipeline_num_layers = vision_config.last_pipeline_num_layers = None
if vision_projection_config.normalization:
vision_projection_layer_spec = get_norm_mlp_module_spec_te().submodules
else:
vision_projection_layer_spec = get_mlp_module_spec(use_te=use_te).submodules
# Toggle --recompute* for the vision and language model separately.
if args.recompute_vision:
if vision_config.recompute_method is not None and vision_config.recompute_granularity is not None:
vision_config.recompute_num_layers = vision_config.num_layers
else:
vision_config.recompute_granularity = None
vision_config.recompute_method = None
vision_config.recompute_num_layers = None
vision_projection_config.recompute_granularity = None
vision_projection_config.recompute_method = None
vision_projection_config.recompute_num_layers = None
# TODO: Vision model and projection do not use SP/CP yet.
vision_config.sequence_parallel = False
vision_config.context_parallel_size = 1
vision_config.tp_comm_overlap = False
vision_projection_config.sequence_parallel = False
vision_projection_config.context_parallel_size = 1
vision_projection_config.tp_comm_overlap = False
tokenizer = get_tokenizer()
image_token_index = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)
assert image_token_index is not None, f"IMAGE_TOKEN={IMAGE_TOKEN} needs to be added using the --special-tokens arg."
tile_tags = _get_tile_tags(args, tokenizer)
model = LLaVAModel(
language_transformer_config=language_config,
language_transformer_layer_spec=language_transformer_layer_spec,
language_vocab_size=args.padded_vocab_size,
language_max_sequence_length=args.decoder_seq_length,
vision_transformer_config=vision_config,
vision_transformer_layer_spec=vision_transformer_layer_spec,
drop_vision_class_token=args.disable_vision_class_token,
vision_projection_config=vision_projection_config,
vision_projection_layer_spec=vision_projection_layer_spec,
vision_projection_type="mlp",
allow_missing_vision_projection_checkpoint=args.allow_missing_vision_projection_checkpoint,
parallel_output=parallel_output,
share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights,
language_position_embedding_type=args.position_embedding_type,
language_rotary_percent=args.rotary_percent,
pre_process=pre_process,
post_process=post_process,
add_encoder=add_encoder,
add_decoder=add_decoder,
img_h=args.img_h,
img_w=args.img_w,
patch_dim=args.patch_dim,
language_rotary_base=args.rotary_base,
language_rope_scaling=args.use_rope_scaling,
image_token_index=image_token_index,
pixel_shuffle=args.pixel_shuffle,
tile_tags=tile_tags,
)
model.freeze(
freeze_language_model=args.freeze_LM,
freeze_vision_model=args.freeze_ViT,
freeze_vision_projection=False,
)
return model
def _get_tile_tags(args, tokenizer):
"""Tile tags are used in NVLM to surround image tiles with text tags."""
if not args.use_tile_tags:
return None
    # We expect the tokenized length of all tags to be the same.
thumbnail_tag_text = "<tile_global_thumbnail>"
if args.tokenizer_prompt_format == "nvlm-yi-34b":
thumbnail_tag_text = "<tile_global>"
assert args.max_num_tiles <= 6, "Up to 6 tile tags used"
tile_tags_text = [f"<tile_{i}>" for i in range(1, args.max_num_tiles + 1)] + [thumbnail_tag_text]
start_idx = 0
if tokenizer._prompt_config.has_bos:
start_idx = 1
# Convert to tokens [num_tiles, tile_seq_len].
tile_tags = [tokenizer.tokenize(t)[start_idx:] for t in tile_tags_text]
return tile_tags
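As a rough sketch of how a model provider like the one above is normally handed to Megatron's training entry point: the dataset provider and forward step below are placeholders assumed to live elsewhere in the multimodal example (they are not part of this commit), and only model_provider comes from the file above.

# Hypothetical wiring sketch; train_valid_test_datasets_provider and forward_step are placeholders.
from megatron.core.enums import ModelType
from megatron.training import pretrain

if __name__ == "__main__":
    pretrain(
        train_valid_test_datasets_provider,  # placeholder dataset provider
        model_provider,                      # defined above
        ModelType.encoder_and_decoder,       # vision encoder + language decoder
        forward_step,                        # placeholder forward-step function
    )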
import argparse
import os
import torch
from transformers import AutoModel
def convert(model_name, output_path, tensor_parallel_size, use_te):
"""Convert InternViT HF checkpoint to mcore."""
hf_model = AutoModel.from_pretrained(
model_name,
trust_remote_code=True
)
hf_state_dict = hf_model.state_dict()
new_state_dicts = [{"model": dict()} for _ in range(tensor_parallel_size)]
hidden_size = 3200
num_heads = 25
dim = 128
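    # Build a row permutation: the HF checkpoint stores the fused QKV weight as
    # [all Q heads | all K heads | all V heads], while mcore's linear_qkv expects the
    # per-head interleaved layout [q_0, k_0, v_0, q_1, k_1, v_1, ...].
    # order[dest] = src, so new_tensor[order] below gathers rows into the interleaved layout.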
order = torch.ones(3 * hidden_size).long()
for j in range(num_heads):
for i in range(dim):
order[i + dim*3*j] = j*dim+i
order[dim + i + dim*3*j] = j*dim+i+num_heads*dim
order[dim*2 + i + dim*3*j] = j*dim+i+num_heads*dim*2
for name, tensor in hf_state_dict.items():
# Map parameter names to ones used in megatron.
new_name = ""
new_tensor = tensor
# This is used for chunking some tensors to target tensor parallel size.
chunk_dim = None
if "embeddings.class_embedding" in name:
new_name = "class_token"
elif "embeddings.patch_embedding.weight" in name:
new_name = "conv1.weight"
elif "embeddings.patch_embedding.bias" in name:
new_name = "conv1.bias"
elif "embeddings.position_embedding" in name:
new_name = "position_embeddings.weight"
new_tensor = new_tensor.squeeze(0)
elif "encoder.layers" in name:
layer_idx = name.split(".")[2]
base = f"decoder.layers.{layer_idx}"
head_dim = 128
if tensor_parallel_size == 1:
num_padded_heads = 25
elif tensor_parallel_size == 8:
                # Note: 25 is not divisible by 8, and we don't currently support an uneven head split with tensor parallelism,
                # so we pad with dummy all-zero heads. Prefer a number of attention heads that divides evenly across TP ranks.
num_padded_heads = 32
else:
raise NotImplementedError("invalid tensor parallel size value:", tensor_parallel_size)
if "ls1" in name:
new_name = f"{base}.ls1"
elif "ls2" in name:
new_name = f"{base}.ls2"
elif "attn.qkv.weight" in name:
new_name = f"{base}.self_attention.linear_qkv.weight"
num_tensors = 3
padded_dim = head_dim * num_padded_heads * num_tensors
padded_tensor = torch.zeros((padded_dim, new_tensor.shape[-1]), dtype=new_tensor.dtype, device=new_tensor.device)
padded_tensor[:new_tensor.shape[0], :] = new_tensor[order]
new_tensor = padded_tensor
chunk_dim = 0
elif "attn.q_norm.weight" in name:
new_name = f"{base}.self_attention.q_layernorm.weight"
num_tensors = 1
padded_dim = head_dim * num_padded_heads * num_tensors
padded_tensor = torch.zeros(padded_dim, dtype=new_tensor.dtype, device=new_tensor.device)
padded_tensor[:new_tensor.shape[0]] = new_tensor
new_tensor = padded_tensor
chunk_dim = 0
elif "attn.k_norm.weight" in name:
new_name = f"{base}.self_attention.k_layernorm.weight"
num_tensors = 1
padded_dim = head_dim * num_padded_heads * num_tensors
padded_tensor = torch.zeros(padded_dim, dtype=new_tensor.dtype, device=new_tensor.device)
padded_tensor[:new_tensor.shape[0]] = new_tensor
new_tensor = padded_tensor
chunk_dim = 0
elif "attn.proj.weight" in name:
new_name = f"{base}.self_attention.linear_proj.weight"
num_tensors = 1
padded_dim = head_dim * num_padded_heads * num_tensors
padded_tensor = torch.zeros((new_tensor.shape[0], padded_dim), dtype=new_tensor.dtype, device=new_tensor.device)
padded_tensor[:, :new_tensor.shape[-1]] = new_tensor
new_tensor = padded_tensor
chunk_dim = 1
elif "attn.proj.bias" in name:
new_name = f"{base}.self_attention.linear_proj.bias"
elif "mlp.fc1.weight" in name:
new_name = f"{base}.mlp.linear_fc1.weight"
chunk_dim = 0
elif "mlp.fc1.bias" in name:
new_name = f"{base}.mlp.linear_fc1.bias"
chunk_dim = 0
elif "mlp.fc2.weight" in name:
new_name = f"{base}.mlp.linear_fc2.weight"
chunk_dim = 1
elif "mlp.fc2.bias" in name:
new_name = f"{base}.mlp.linear_fc2.bias"
elif "norm1" in name:
new_name = f"{base}.input_layernorm.weight"
elif "norm2" in name:
new_name = f"{base}.pre_mlp_layernorm.weight"
else:
raise RuntimeError("unexpected transformer layer name", name)
else:
raise RuntimeError("unexpected layer name", name)
assert new_name != "", f"unexpected layer name {name}"
# TE sets _extra_state (for FP8 purposes), so set an empty one here for compatibility.
extra_state_layers = ("linear_qkv", "linear_proj", "linear_fc1", "linear_fc2")
is_extra_state_layer = any([l in new_name for l in extra_state_layers])
if use_te and is_extra_state_layer:
layer = new_name.split(".")[-2]
if layer in extra_state_layers:
extra_state_name = (
new_name[: new_name.rfind(".") + 1] + "_extra_state"
) # Replace the weight name.
for i in range(tensor_parallel_size):
new_state_dicts[i]["model"][extra_state_name] = None
if chunk_dim is None:
new_tensors = [new_tensor for _ in range(tensor_parallel_size)]
else:
new_tensors = torch.chunk(new_tensor, tensor_parallel_size, dim=chunk_dim)
for i in range(tensor_parallel_size):
new_state_dicts[i]["model"][new_name] = new_tensors[i].clone()
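    # Write one checkpoint per tensor-parallel rank in the layout Megatron expects:
    # <output-dir>/iter_0000001/mp_rank_0<i>/model_optim_rng.pt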
for i in range(tensor_parallel_size):
output_dir_tp = os.path.join(output_path, f"iter_0000001/mp_rank_0{i}")
os.makedirs(output_dir_tp, exist_ok=True)
output_path_tp = os.path.join(output_dir_tp, "model_optim_rng.pt")
torch.save(new_state_dicts[i], output_path_tp)
print("saved file", output_path_tp)
print("done")
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="InternViT HuggingFace to Mcore converter")
parser.add_argument("--model-name", type=str, default="OpenGVLab/InternViT-6B-448px-V1-5", help="Model name in HuggingFace")
parser.add_argument("--output-dir", type=str, required=True, help="Output directory for the mcore model.")
parser.add_argument("--use-te", action="store_true", default=True)
parser.add_argument("--tensor-parallel-size", type=int, required=True)
args = parser.parse_args()
convert(args.model_name, args.output_dir, args.tensor_parallel_size, args.use_te)
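A usage sketch for the converter above; the script filename and output path are assumptions, while the flags mirror the argparse definitions (note that --use-te already defaults to True).

# Example invocation (illustrative; only TP sizes 1 and 8 are handled by the converter above):
#   python internvit_converter.py \
#       --model-name OpenGVLab/InternViT-6B-448px-V1-5 \
#       --output-dir ./internvit_mcore_tp8 \
#       --tensor-parallel-size 8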