Commit d444a97a authored by yangzhong

Initial upload

import argparse
import json
from evaluate_mmmu import get_input_output_paths
def merge_input_files(input_path):
"""Merge input files to a format compatible with the evaluator."""
input_file_paths, output_file_path = get_input_output_paths(input_path, task="OCRBench")
results = dict()
for input_file_path in input_file_paths:
with open(input_file_path, "r") as input_file:
for line in input_file:
res = json.loads(line)
sample_id = res["sample_id"]
# Remove possible duplicates.
if sample_id in results:
continue
results[sample_id] = res
results = list(results.values())
with open(output_file_path, "w") as output_file:
json.dump(results, output_file)
return output_file_path
def compute_ocrbench_score(result_file):
"""Compute OCRBench score."""
merged_results = json.load(open(result_file))
# OCRBench score calculation is adopted from https://github.com/Yuliang-Liu/MultimodalOCR/blob/1b7713f44c91f30f64efb6d3e494c416861ef15f/example.py#L1
# MIT License. Copyright (c) 2023 Yuliang Liu
score = {
"Regular Text Recognition": 0,
"Irregular Text Recognition": 0,
"Artistic Text Recognition": 0,
"Handwriting Recognition": 0,
"Digit String Recognition": 0,
"Non-Semantic Text Recognition": 0,
"Scene Text-centric VQA": 0,
"Doc-oriented VQA": 0,
"Doc-oriented VQA": 0,
"Key Information Extraction": 0,
"Handwritten Mathematical Expression Recognition": 0,
}
for res in merged_results:
predict = res["answer"]
answers = res["gt_answer"]
dataset_name = res["dataset_name"]
ocr_type = res["data_type"]
if dataset_name == "HME100k":
if isinstance(answers, list):
for j in range(len(answers)):
answer = answers[j].strip().replace("\n", " ").replace(" ", "")
predict = predict.strip().replace("\n", " ").replace(" ", "")
if answer in predict:
score[ocr_type] += 1
else:
answers = answers.strip().replace("\n", " ").replace(" ", "")
predict = predict.strip().replace("\n", " ").replace(" ", "")
if answers in predict:
score[ocr_type] += 1
else:
if isinstance(answers, list):
for j in range(len(answers)):
answer = answers[j].lower().strip().replace("\n", " ")
predict = predict.lower().strip().replace("\n", " ")
if answer in predict:
score[ocr_type] += 1
else:
answers = answers.lower().strip().replace("\n", " ")
predict = predict.lower().strip().replace("\n", " ")
if answers in predict:
score[ocr_type] += 1
recognition_score = (
score['Regular Text Recognition']
+ score['Irregular Text Recognition']
+ score['Artistic Text Recognition']
+ score['Handwriting Recognition']
+ score['Digit String Recognition']
+ score['Non-Semantic Text Recognition']
)
final_score = (
recognition_score
+ score['Scene Text-centric VQA']
+ score['Doc-oriented VQA']
+ score['Key Information Extraction']
+ score['Handwritten Mathematical Expression Recognition']
)
result_log = f"""###########################OCRBench##############################
Text Recognition(Total 300): {recognition_score}
------------------Details of Recognition Score-------------------
Regular Text Recognition(Total 50): {score['Regular Text Recognition']}
Irregular Text Recognition(Total 50): {score['Irregular Text Recognition']}
Artistic Text Recognition(Total 50): {score['Artistic Text Recognition']}
Handwriting Recognition(Total 50): {score['Handwriting Recognition']}
Digit String Recognition(Total 50): {score['Digit String Recognition']}
Non-Semantic Text Recognition(Total 50): {score['Non-Semantic Text Recognition']}
----------------------------------------------------------------
Scene Text-centric VQA(Total 200): {score['Scene Text-centric VQA']}
----------------------------------------------------------------
Doc-oriented VQA(Total 200): {score['Doc-oriented VQA']}
----------------------------------------------------------------
Key Information Extraction(Total 200): {score['Key Information Extraction']}
----------------------------------------------------------------
Handwritten Mathematical Expression Recognition(Total 100): {score['Handwritten Mathematical Expression Recognition']}
----------------------Final Score-------------------------------
Final Score(Total 1000): {final_score}"""
return result_log, final_score
def ocrbench_eval(input_path):
"""Run OCRBench evaluation."""
result_file_path = merge_input_files(input_path)
result_log, score = compute_ocrbench_score(result_file_path)
return result_log, score
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--input-path', type=str, help="Path to input file(s)")
args = parser.parse_args()
result_log, _ = ocrbench_eval(args.input_path)
print(result_log)
import argparse
import json
from evaluate_mmmu import get_input_output_paths
from evaluate_vqav2 import compute_vqa_accuracy
def merge_input_files(input_path):
"""Merge input files to a format compatible with the evaluator."""
input_file_paths, output_file_path = get_input_output_paths(input_path, task="TextVQA")
results = dict()
for input_file_path in input_file_paths:
with open(input_file_path, "r") as input_file:
for line in input_file:
res = json.loads(line)
sample_id = res["sample_id"]
# Remove possible duplicates.
if sample_id in results:
continue
results[sample_id] = {
"question_id": sample_id,
"answer": res["answer"],
"gt_answer": res["gt_answer"],
}
results = list(results.values())
with open(output_file_path, "w") as output_file:
json.dump(results, output_file)
return output_file_path
def textvqa_eval(input_path):
"""Run TextVQA evaluation."""
result_file_path = merge_input_files(input_path)
avg_acc = compute_vqa_accuracy(result_file_path, task="TextVQA")
return avg_acc
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--input-path', type=str, help="Path to input file(s)")
args = parser.parse_args()
avg_acc = textvqa_eval(args.input_path)
print(f"===== TextVQA Accuracy {avg_acc:.2f}% =====")
import argparse
import json
from evaluate_mmmu import get_input_output_paths
from open_flamingo.eval.vqa_metric import VQAEval
def merge_input_files(input_path):
"""Merge input files to a format compatible with the evaluator."""
input_file_paths, output_file_path = get_input_output_paths(input_path, task="VQAv2")
results = dict()
for input_file_path in input_file_paths:
with open(input_file_path, "r") as input_file:
for line in input_file:
res = json.loads(line)
sample_id = res["sample_id"]
# Skip possible duplicates.
if sample_id in results:
continue
res["question_id"] = sample_id
results[sample_id] = res
results = list(results.values())
with open(output_file_path, "w") as output_file:
json.dump(results, output_file)
return output_file_path
def is_number(n: str):
"""Check if input is a number."""
try:
float(n)
return True
except ValueError:
return False
def compute_vqa_accuracy(result_file, task):
"""Compute VQA accuracy."""
merged_results = json.load(open(result_file))
vqa = VQAEval(vqa=None, vqaRes=None)
all_acc = []
for res in merged_results:
pred = res["answer"]
pred = vqa.processPunctuation(pred)
pred = vqa.processDigitArticle(pred)
gt = res["gt_answer"]
gt = [vqa.processPunctuation(ans) for ans in gt]
gt = [vqa.processDigitArticle(ans) for ans in gt]
# ChartQA uses relaxed accuracy:
# "We consider an answer to be correct if it is within 5% of the gold answer.
# For non-numeric answers, we still need an exact match to consider an answer to be correct."
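# For example (illustrative), pred="42%" and gt="40" are both stripped of a trailing "%"
# and parsed as numbers; 42.0 falls inside [38.0, 42.0] (gt within +/-5%), so acc becomes 1.0.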
if task == "ChartQA":
acc = 0.0
assert len(gt) == 1, "expected exactly one groundtruth answer."
gt = gt[0]
pred = pred.rstrip("%")
gt = gt.rstrip("%")
if is_number(pred) and is_number(gt):
pred = float(pred)
gt = float(gt)
if pred >= (gt * 0.95) and pred <= (gt * 1.05):
acc = 1.0
elif pred == gt:
acc = 1.0
all_acc.append(acc)
elif task in ("VQAv2", "TextVQA"):
num_match = sum([pred == ans for ans in gt])
acc = min(1.0, num_match / 3.0)
all_acc.append(acc)
elif task == "AI2D":
assert len(gt) == 1, f"Expected exactly 1 GT, got {gt}"
acc = pred == gt[0]
all_acc.append(acc)
else:
raise NotImplementedError(f"unknown task {task}")
acc_avg = sum(all_acc) / len(all_acc) * 100
return acc_avg
def vqav2_eval(input_path):
"""Run VQAv2 evaluation."""
result_file = merge_input_files(input_path)
avg_acc = compute_vqa_accuracy(result_file, task="VQAv2")
return avg_acc
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--input-path', type=str, help="Path to input file(s)")
args = parser.parse_args()
avg_acc = vqav2_eval(args.input_path)
print(f"===== VQAv2 Accuracy {avg_acc:.2f}% =====")
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
"""Evaluation datasets."""
import glob
import itertools
import json
import os
import re
from collections import defaultdict
import numpy as np
import torch
from image_processing import get_visual_transform
from PIL import Image
from megatron.training import print_rank_0
def _get_partition_bounds(
total_num_samples, num_samples_per_partition, num_partitions, partition_id
):
if num_samples_per_partition == 0:
samples_per_partition = [
int(x) for x in np.linspace(0, total_num_samples, num_partitions + 1)
]
return samples_per_partition[partition_id], samples_per_partition[partition_id + 1]
return num_samples_per_partition * partition_id, num_samples_per_partition * (partition_id + 1)
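# Illustrative example: with total_num_samples=10, num_partitions=2 and
# num_samples_per_partition=0, np.linspace gives bounds [0, 5, 10], so partition 0
# covers samples [0, 5) and partition 1 covers [5, 10). With a fixed
# num_samples_per_partition, each partition simply spans that many consecutive samples.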
class VQADataset(torch.utils.data.Dataset):
"""VQA evaluation dataset."""
def __init__(
self,
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
keys,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
vision_model_type,
):
samples = json.load(open(gt_path, encoding='utf-8'))
if "data" in samples:
samples = samples["data"]
# Optionally, process only a subset of the input files.
if num_partitions > 0:
lb, ub = _get_partition_bounds(
len(samples), num_samples_per_partition, num_partitions, partition_id
)
samples = samples[lb:ub]
self._keys = keys
self._samples = samples
self._input_image_path = input_image_path
self._img_h = img_h
self._img_w = img_w
self._use_tiling = use_tiling
self._max_num_tiles = max_num_tiles
self._use_thumbnail = use_thumbnail
self._vision_model_type = vision_model_type
def __len__(self):
return len(self._samples)
def __getitem__(self, idx):
sample = self._samples[idx]
img_file = "{}/{}".format(self._input_image_path, sample[self._keys["image_id"]])
if not os.path.exists(img_file):
img_file += ".jpg"
if not os.path.exists(img_file):
img_file = img_file.replace('.jpg', '.png')
img = Image.open(img_file)
imgs = get_visual_transform(
img,
self._img_h,
self._img_w,
self._use_tiling,
self._max_num_tiles,
self._use_thumbnail,
augment=False,
vision_model_type=self._vision_model_type,
)
tile_count = torch.tensor([len(imgs)], dtype=torch.int)
sample_id = idx
if "sample_id" in self._keys:
sample_id = sample[self._keys["sample_id"]]
metadata = "" # Not used.
return (
torch.stack(imgs),
tile_count,
sample_id,
sample[self._keys["question"]],
sample[self._keys["answer"]],
metadata,
)
class CaptioningDataset(torch.utils.data.Dataset):
"""Captioning evaluation dataset."""
def __init__(
self,
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
vision_model_type,
):
image_files = sorted(glob.glob(input_image_path + "/*"))
# Optionally, process only a subset of the input files.
if num_partitions > 0:
lb, ub = _get_partition_bounds(
len(image_files), num_samples_per_partition, num_partitions, partition_id
)
image_files = image_files[lb:ub]
gts = json.load(open(gt_path))
answers = defaultdict(list)
for gt in gts["annotations"]:
answers[gt["image_id"]].append(gt['caption'])
self._image_files = image_files
self._answers = answers
self._img_h = img_h
self._img_w = img_w
self._use_tiling = use_tiling
self._max_num_tiles = max_num_tiles
self._use_thumbnail = use_thumbnail
self._vision_model_type = vision_model_type
def __len__(self):
return len(self._image_files)
def __getitem__(self, idx):
img_file = self._image_files[idx]
image_id = int(img_file.split("_")[-1].split(".")[0])
img = Image.open(img_file)
imgs = get_visual_transform(
img,
self._img_h,
self._img_w,
self._use_tiling,
self._max_num_tiles,
self._use_thumbnail,
augment=False,
vision_model_type=self._vision_model_type,
)
tile_count = torch.tensor([len(imgs)], dtype=torch.int)
question = "" # Fixed for all samples.
metadata = "" # Not used.
return torch.stack(imgs), tile_count, image_id, question, self._answers[image_id], metadata
class MMMUDataset(torch.utils.data.Dataset):
"""MMMU evaluation dataset."""
def __init__(
self,
input_image_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
prompt_style,
vision_model_type,
):
import datasets
from MMMU.mmmu.utils.data_utils import CAT_SHORT2LONG, load_yaml
# The following downloads the MMMU dataset from HuggingFace and uses the API from the MMMU github repo to run MMMU evaluation.
all_mmmu_datasets = []
hf_datasets_cache = os.environ["HF_DATASETS_CACHE"]
assert hf_datasets_cache != "", "Please set the environment variable HF_DATASETS_CACHE."
for subject in CAT_SHORT2LONG.values():
# Use a local copy of the dataset if it exists (can be faster); otherwise use the HF one.
if os.path.exists(input_image_path):
subject_dataset = datasets.load_dataset(
os.path.join(input_image_path, subject),
split=datasets.Split.VALIDATION,
cache_dir=hf_datasets_cache,
verification_mode="no_checks",
)
else:
subject_dataset = datasets.load_dataset(
"MMMU/MMMU",
subject,
split=datasets.Split.VALIDATION,
cache_dir=hf_datasets_cache,
)
all_mmmu_datasets.append(subject_dataset)
dataset = datasets.concatenate_datasets(all_mmmu_datasets)
dataset = [s for s in dataset if s['id'].startswith("val")]
# Optionally, process only a subset of the input files.
if num_partitions > 0:
lb, ub = _get_partition_bounds(
len(dataset), num_samples_per_partition, num_partitions, partition_id
)
dataset = dataset[lb:ub]
# Using the LLaVA config from the MMMU repo.
config = load_yaml("examples/multimodal/MMMU/mmmu/configs/llava1.5.yaml")
for k, v in config.items():
if isinstance(v, list):
assert len(v) == 1, "only one value supported."
config[k] = v[0]
self._config = config
self._dataset = dataset
self._img_h = img_h
self._img_w = img_w
self._use_tiling = use_tiling
self._max_num_tiles = max_num_tiles
self._use_thumbnail = use_thumbnail
self._prompt_style = prompt_style
self._vision_model_type = vision_model_type
def __len__(self):
return len(self._dataset)
def __getitem__(self, idx):
from MMMU.mmmu.utils.data_utils import construct_prompt, process_single_sample
sample = self._dataset[idx]
# Use the single image approach from the MMMU repo.
if self._prompt_style == "single_image":
sample = process_single_sample(sample)
sample = construct_prompt(sample, self._config)
img = sample["image"]
sample_imgs = get_visual_transform(
img,
self._img_h,
self._img_w,
self._use_tiling,
self._max_num_tiles,
self._use_thumbnail,
augment=False,
vision_model_type=self._vision_model_type,
)
sample_num_tiles = [len(sample_imgs)]
prompt = sample["final_input_prompt"]
for i in range(8):
prompt = prompt.replace(f"<image {i}>", "")
sample["final_input_prompt"] = f"<image>\n{prompt}"
elif self._prompt_style == "vlmevalkit":
sample = construct_prompt(sample, self._config)
if sample["question_type"] == "multiple-choice":
question = sample["question"]
options = ""
for k, v in sample["index2ans"].items():
options += f"{k}. {v}\n"
final_prompt = f"{question}\n"
if "hint" in sample:
final_prompt += f"Hint: {sample['hint']}\n"
if "task_instructions" in sample:
final_prompt += f"Task instructions: {sample['task_instructions']}\n"
final_prompt += options
final_prompt += "Answer with the option's letter from the given choices directly."
sample["final_input_prompt"] = final_prompt.rstrip()
else:
question = sample["question"]
final_prompt = f"{question}\n"
final_prompt += "Answer the question directly."
sample["final_input_prompt"] = final_prompt.rstrip()
sample_imgs = []
sample_num_tiles = []
img_indices = sorted(list(set(re.findall(r"<image (\d+)", sample["final_input_prompt"]))))
# If there are multiple input images, we need to avoid the number of image embeddings getting too large.
adjusted_max_num_tiles = max(1, self._max_num_tiles // len(img_indices))
adjusted_max_num_tiles = min(adjusted_max_num_tiles, self._max_num_tiles)
for img_idx in img_indices:
img_key = f"image_{img_idx}"
img_str = f"<image {img_idx}>"
img = sample[img_key]
assert img is not None, f"{img_str} is in prompt but not in sample images"
imgs = get_visual_transform(
img,
self._img_h,
self._img_w,
self._use_tiling,
adjusted_max_num_tiles,
self._use_thumbnail,
augment=False,
vision_model_type=self._vision_model_type,
) # List of tiles.
sample_imgs.extend(imgs)
sample_num_tiles.append(len(imgs))
sample["final_input_prompt"] = " ".join([f'<image {i + 1}><image>' for i in range(len(img_indices))]) + "\n" + sample["final_input_prompt"]
elif self._prompt_style == "multi_image":
sample = construct_prompt(sample, self._config)
sample_imgs = []
sample_num_tiles = []
img_indices = re.findall(r"<image (\d+)", sample["final_input_prompt"])
# If there are multiple input images, we need to avoid the number of image embeddings getting too large.
adjusted_max_num_tiles = max(1, self._max_num_tiles // len(img_indices))
for img_idx in img_indices:
img_key = f"image_{img_idx}"
img_str = f"<image {img_idx}>"
img = sample[img_key]
assert img is not None, f"{img_str} is in prompt but not in sample images"
# Note: Only replace the current image tag.
sample["final_input_prompt"] = sample["final_input_prompt"].replace(
img_str, "<image>", 1
)
imgs = get_visual_transform(
img,
self._img_h,
self._img_w,
self._use_tiling,
adjusted_max_num_tiles,
self._use_thumbnail,
augment=False,
vision_model_type=self._vision_model_type,
) # List of tiles.
sample_imgs.extend(imgs)
sample_num_tiles.append(len(imgs))
# Sanity check.
for i in range(1, 8):
assert (
f"<image {i}>" not in sample["final_input_prompt"]
), "prompt contains unhandled image tags"
else:
raise ValueError(f"unknown prompt style {self._prompt_style}")
# MMMU specific metadata.
metadata = {"question_type": sample["question_type"]}
if sample["question_type"] == "multiple-choice":
metadata["index2ans"] = sample["index2ans"]
metadata["all_choices"] = sample["all_choices"]
prompt = sample['final_input_prompt']
tile_count = torch.tensor(sample_num_tiles, dtype=torch.int)
return (
torch.stack(sample_imgs),
tile_count,
sample["id"],
prompt,
sample["answer"],
metadata,
)
class VideoMMMEDataset(torch.utils.data.Dataset):
"Video MME evaluation dataset."
def __init__(
self,
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
num_frames,
vision_model_type,
):
ground_truth_original = json.load(open(gt_path))
ground_truth = []
for gt in ground_truth_original:
video_path = gt["url"]
video_path = video_path.replace("https://www.youtube.com/watch?v=", "")
video_path = video_path.replace("https://m.youtube.com/watch?v=", "")
video_path = os.path.join(input_image_path, video_path + ".mp4")
if not os.path.exists(video_path):
continue
gt["video_path"] = video_path
ground_truth.append(gt)
ground_truth = sorted(ground_truth, key=lambda gt: gt["video_path"])
print_rank_0(f"Found {len(ground_truth)} videos to process.")
if num_partitions > 0:
start_idx, end_idx = _get_partition_bounds(
len(ground_truth), num_samples_per_partition, num_partitions, partition_id
)
ground_truth = ground_truth[start_idx:end_idx]
self._ground_truth = ground_truth
self._img_h = img_h
self._img_w = img_w
self._use_tiling = use_tiling
self._max_num_tiles = max_num_tiles
self._use_thumbnail = use_thumbnail
self._num_frames = num_frames
self._vision_model_type = vision_model_type
def __len__(self):
return len(self._ground_truth)
def __getitem__(self, idx):
from torchvision.io import read_video
gt = self._ground_truth[idx]
video, _, _ = read_video(gt["video_path"], start_pts=0, end_pts=None, pts_unit='sec')
video = video.numpy()
selected_frames = torch.linspace(0, video.shape[0] - 1, self._num_frames).long()
video_frames = video[selected_frames]
if self._num_frames == 1:
video_frames = video_frames[None]
imgs = list(
itertools.chain.from_iterable(
get_visual_transform(
img,
self._img_h,
self._img_w,
self._use_tiling,
self._max_num_tiles,
self._use_thumbnail,
augment=False,
vision_model_type=self._vision_model_type,
)
for img in video_frames
)
)
for question in gt["questions"]:
# Very hacky, but we essentially re-create gt holding only the
# question of interest. This is to make this generation script
# compatible with the Video MME evaluation script.
question_dict = {
"video_id": gt["video_id"],
"duration_category": gt["duration_category"],
"video_category": gt["video_category"],
"video_subcategory": gt["video_subcategory"],
"url": gt["url"],
"questions": [question],
}
num_tiles = torch.tensor([len(imgs)], dtype=torch.int)
answer = ""
metadata = ""
return (
torch.stack(imgs),
num_tiles,
question["question_id"],
question_dict,
answer,
metadata,
)
class OCRBenchDataset(torch.utils.data.Dataset):
"""OCRBench evaluation dataset."""
def __init__(
self,
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
vision_model_type,
):
gt = json.load(open(gt_path, encoding='utf-8'))
if num_partitions > 0:
start_idx, end_idx = _get_partition_bounds(
len(gt), num_samples_per_partition, num_partitions, partition_id
)
gt = gt[start_idx:end_idx]
self._input_image_path = input_image_path
self._gt = gt
self._img_h = img_h
self._img_w = img_w
self._use_tiling = use_tiling
self._max_num_tiles = max_num_tiles
self._use_thumbnail = use_thumbnail
self._vision_model_type = vision_model_type
def __len__(self):
return len(self._gt)
def __getitem__(self, idx):
img_path = os.path.join(self._input_image_path, self._gt[idx]['image_path'])
img = Image.open(img_path)
imgs = get_visual_transform(
img,
self._img_h,
self._img_w,
self._use_tiling,
self._max_num_tiles,
self._use_thumbnail,
augment=False,
vision_model_type=self._vision_model_type,
)
tile_count = torch.tensor([len(imgs)], dtype=torch.int)
metadata = {
"dataset_name": self._gt[idx]["dataset_name"],
"data_type": self._gt[idx]["type"],
}
return (
torch.stack(imgs),
tile_count,
idx,
self._gt[idx]["question"],
self._gt[idx]["answers"],
metadata,
)
class MathVistaDataset(torch.utils.data.Dataset):
"""MathVista evaluation dataset."""
def __init__(
self,
input_image_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
vision_model_type,
):
import datasets
hf_datasets_cache = os.environ["HF_DATASETS_CACHE"]
assert hf_datasets_cache != "", "Please set the environment variable HF_DATASETS_CACHE."
if os.path.exists(input_image_path):
dataset = datasets.load_dataset(
input_image_path, cache_dir=hf_datasets_cache, verification_mode="no_checks"
)
else:
dataset = datasets.load_dataset(
"AI4Math/MathVista", split="testmini", cache_dir=hf_datasets_cache
)
if num_partitions > 0:
start_idx, end_idx = _get_partition_bounds(
len(dataset), num_samples_per_partition, num_partitions, partition_id
)
dataset = dataset[start_idx:end_idx]
self._dataset = dataset
self._img_h = img_h
self._img_w = img_w
self._use_tiling = use_tiling
self._max_num_tiles = max_num_tiles
self._use_thumbnail = use_thumbnail
self._vision_model_type = vision_model_type
def __len__(self):
return len(self._dataset["pid"])
def __getitem__(self, idx):
# Already a PIL object.
img = self._dataset['decoded_image'][idx]
imgs = get_visual_transform(
img,
self._img_h,
self._img_w,
self._use_tiling,
self._max_num_tiles,
self._use_thumbnail,
augment=False,
vision_model_type=self._vision_model_type,
)
tile_count = torch.tensor([len(imgs)], dtype=torch.int)
question_id = self._dataset["pid"][idx]
question = self._dataset["question"][idx]
question_type = self._dataset["question_type"][idx] # free_form or multi_choice
query = self._dataset["query"][idx]
choices = self._dataset["choices"][idx]
answer = self._dataset["answer"][idx]
if question_type == 'multi_choice':
start_chr = 'A'
choices_str = ''
index2ans = {}
all_choices = []
for choice in choices:
all_choices.append(start_chr)
index2ans[start_chr] = choice
choices_str += f"{start_chr}. {choice}\n"
start_chr = chr(ord(start_chr) + 1)
question = question + '\n' + choices_str
question = question + "Answer with the option's letter from the given choices directly."
answer = chr(ord('A') + choices.index(answer))
else:
question = query.replace("Hint: ", "")
index2ans = {}
all_choices = []
metadata = {
"question_type": question_type,
"index2ans": index2ans,
"all_choices": all_choices,
}
return torch.stack(imgs), tile_count, question_id, question, answer, metadata
class AI2DDataset(torch.utils.data.Dataset):
"""AI2D evaluation dataset."""
def __init__(
self,
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
no_mask,
vision_model_type,
):
with open(gt_path, 'r') as f:
jsonl = list(f)
gt = [json.loads(json_str) for json_str in jsonl]
if num_partitions > 0:
start_idx, end_idx = _get_partition_bounds(
len(gt), num_samples_per_partition, num_partitions, partition_id
)
gt = gt[start_idx:end_idx]
self._gt = gt
self._input_image_path = input_image_path
self._img_h = img_h
self._img_w = img_w
self._use_tiling = use_tiling
self._max_num_tiles = max_num_tiles
self._use_thumbnail = use_thumbnail
self._no_mask = no_mask
self._vision_model_type = vision_model_type
def __len__(self):
return len(self._gt)
def __getitem__(self, idx):
img_path = os.path.join(self._input_image_path, self._gt[idx]['image'])
if self._no_mask:
img_path = img_path.replace("AI2D_TEST", "AI2D_TEST_NO_MASK_IMAGES")
img = Image.open(img_path)
imgs = get_visual_transform(
img,
self._img_h,
self._img_w,
self._use_tiling,
self._max_num_tiles,
self._use_thumbnail,
augment=False,
vision_model_type=self._vision_model_type,
)
tile_count = torch.tensor([len(imgs)], dtype=torch.int)
metadata = "" # Not used.
return (
torch.stack(imgs),
tile_count,
self._gt[idx]["question_id"],
self._gt[idx]["question"],
self._gt[idx]["answer"],
metadata,
)
def get_evaluation_dataset(
task,
input_image_path,
gt_path,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
num_samples_per_partition,
num_partitions,
partition_id,
num_frames,
vision_model_type,
):
"""Get an evaluation dataset."""
if task == "TextVQA":
keys = {
"image_id": "image_id",
"sample_id": "question_id",
"question": "question",
"answer": "answers",
}
dataset = VQADataset(
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
keys,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
vision_model_type,
)
elif task == "VQAv2":
keys = {
"image_id": "image",
"sample_id": "question_id",
"question": "question",
"answer": "answer",
}
dataset = VQADataset(
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
keys,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
vision_model_type,
)
elif task == "ChartQA":
keys = {"image_id": "imgname", "question": "query", "answer": "label"}
dataset = VQADataset(
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
keys,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
vision_model_type,
)
elif task == "captioning":
dataset = CaptioningDataset(
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
vision_model_type,
)
elif task == 'MMMU':
# Note:
# - prompt_style="single_image" uses only one image like in the MMMU repo example.
# - prompt_style="multi_image" uses multiple input images.
# - prompt_style="vlmevalkit" is similar to https://github.com/open-compass/VLMEvalKit/blob/5d3cebcf18ef4bfbadc3bd3ef80bdc7aad2c6557/vlmeval/vlm/internvl_chat.py#L499
dataset = MMMUDataset(
input_image_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
prompt_style="single_image",
vision_model_type=vision_model_type,
)
elif task == "VideoMME":
dataset = VideoMMMEDataset(
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
num_frames,
vision_model_type,
)
elif task == "OCRBench":
dataset = OCRBenchDataset(
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
vision_model_type,
)
elif task == "MathVista":
dataset = MathVistaDataset(
input_image_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
vision_model_type,
)
elif task == "AI2D":
dataset = AI2DDataset(
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
no_mask=False,
vision_model_type=vision_model_type,
)
else:
raise NotImplementedError(f"unsupported task {task}")
return dataset
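# Minimal usage sketch (illustrative only; the paths and sizes below are assumptions,
# not values required by this module):
# dataset = get_evaluation_dataset(
#     task="TextVQA",
#     input_image_path="/data/textvqa/train_images",   # hypothetical path
#     gt_path="/data/textvqa/val_annotations.json",    # hypothetical path
#     img_h=448, img_w=448, use_tiling=True, max_num_tiles=4, use_thumbnail=True,
#     num_samples_per_partition=0, num_partitions=1, partition_id=0,
#     num_frames=1, vision_model_type="internvit",
# )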
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. Except portions as noted which are Copyright (c) 2023 OpenGVLab and licensed under the MIT license found in LICENSE.
from torchvision import transforms as T
from torchvision.transforms import Compose
from torchvision.transforms.functional import InterpolationMode
IMAGENET_PIXEL_MEAN = [0.485, 0.456, 0.406]
IMAGENET_PIXEL_STD = [0.229, 0.224, 0.225]
SIGLIP_PIXEL_MEAN = [0.5, 0.5, 0.5]
SIGLIP_PIXEL_STD = [0.5, 0.5, 0.5]
CLIP_PIXEL_MEAN = [0.48145466, 0.4578275, 0.40821073]
CLIP_PIXEL_STD = [0.26862954, 0.26130258, 0.27577711]
pixel_statistics = {
"clip": (CLIP_PIXEL_MEAN, CLIP_PIXEL_STD),
"siglip": (SIGLIP_PIXEL_MEAN, SIGLIP_PIXEL_STD),
"internvit": (IMAGENET_PIXEL_MEAN, IMAGENET_PIXEL_STD),
}
def get_visual_transform(img, img_h, img_w, use_tiling=False, max_num_tiles=1, use_thumbnail=False, augment=False, vision_model_type="clip"):
pixel_mean, pixel_std = pixel_statistics[vision_model_type]
assert not augment, "Image augmentation not implemented."
transform = build_transform(img_h, pixel_mean, pixel_std, vision_model_type)
if use_tiling:
assert img_h == img_w, "dynamic tiling expects equal tile height and width"
imgs = dynamic_preprocess(img, min_num=1, max_num=max_num_tiles, image_size=img_h, use_thumbnail=use_thumbnail)
imgs = [transform(img) for img in imgs]
else:
imgs = [transform(img)]
return imgs
# From https://github.com/OpenGVLab/InternVL/blob/c62fa4f7c850165d7386bdc48ac6bc5a6fab0864/internvl_chat/internvl/train/dataset.py#L685
# Copyright (c) 2023 OpenGVLab.
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
best_ratio_diff = float('inf')
best_ratio = (1, 1)
area = width * height
for ratio in target_ratios:
target_aspect_ratio = ratio[0] / ratio[1]
ratio_diff = abs(aspect_ratio - target_aspect_ratio)
if ratio_diff < best_ratio_diff:
best_ratio_diff = ratio_diff
best_ratio = ratio
elif ratio_diff == best_ratio_diff:
if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
best_ratio = ratio
# print(f'width: {width}, height: {height}, best_ratio: {best_ratio}')
return best_ratio
# From https://github.com/OpenGVLab/InternVL/blob/c62fa4f7c850165d7386bdc48ac6bc5a6fab0864/internvl_chat/internvl/train/dataset.py#L702
# Copyright (c) 2023 OpenGVLab.
def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False):
orig_width, orig_height = image.size
aspect_ratio = orig_width / orig_height
# calculate the existing image aspect ratio
target_ratios = set(
(i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
i * j <= max_num and i * j >= min_num)
target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
# find the closest aspect ratio to the target
target_aspect_ratio = find_closest_aspect_ratio(
aspect_ratio, target_ratios, orig_width, orig_height, image_size)
# calculate the target width and height
target_width = image_size * target_aspect_ratio[0]
target_height = image_size * target_aspect_ratio[1]
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
# resize the image
resized_img = image.resize((target_width, target_height))
processed_images = []
for i in range(blocks):
box = (
(i % (target_width // image_size)) * image_size,
(i // (target_width // image_size)) * image_size,
((i % (target_width // image_size)) + 1) * image_size,
((i // (target_width // image_size)) + 1) * image_size
)
# split the image
split_img = resized_img.crop(box)
processed_images.append(split_img)
assert len(processed_images) == blocks
if use_thumbnail and len(processed_images) != 1:
thumbnail_img = image.resize((image_size, image_size))
processed_images.append(thumbnail_img)
return processed_images
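# Illustrative example: an 800x400 input with image_size=448 and max_num=6 has
# aspect ratio 2.0, so the closest grid is 2x1; the image is resized to 896x448 and
# split into two 448x448 tiles, and with use_thumbnail=True a third 448x448
# thumbnail of the whole image is appended.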
# Based on https://github.com/openai/CLIP/blob/dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1/clip/clip.py#L79
# and https://github.com/OpenGVLab/InternVL/blob/aa521e6eb1df4cf153aa4118fcf13e673c055d46/internvl_chat/internvl/train/dataset.py#L276
def build_transform(input_size, pixel_mean, pixel_std, vision_model_type):
if vision_model_type in ("siglip", "internvit"):
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=pixel_mean, std=pixel_std)
])
elif vision_model_type == "clip":
transform = Compose([
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.ToTensor(),
T.Normalize(mean=pixel_mean, std=pixel_std),
])
else:
raise NotImplementedError(f"image processing not defined for vision model {vision_model_type}")
return transform
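# Minimal usage sketch (illustrative; the image path is a placeholder):
# from PIL import Image
# img = Image.open("example.jpg")
# tiles = get_visual_transform(
#     img, 448, 448, use_tiling=True, max_num_tiles=6, use_thumbnail=True,
#     augment=False, vision_model_type="siglip",
# )
# "tiles" is a list of [3, 448, 448] tensors: the image tiles plus an optional thumbnail.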
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import warnings
import torch
from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
from megatron.core.transformer.dot_product_attention import DotProductAttention
from megatron.core.transformer.enums import AttnMaskType
from megatron.core.transformer.identity_op import IdentityOp
from megatron.core.transformer.mlp import MLP, MLPSubmodules
from megatron.core.transformer.spec_utils import ModuleSpec
from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules
try:
from megatron.core.extensions.transformer_engine import (
TEColumnParallelLinear,
TEDotProductAttention,
TELayerNormColumnParallelLinear,
TENorm,
TERowParallelLinear,
)
HAVE_TE = True
except ImportError:
HAVE_TE = False
try:
import apex
from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
from megatron.core.transformer.torch_norm import WrappedTorchNorm
HAVE_APEX = True
LNImpl = FusedLayerNorm
except ImportError:
import warnings
from megatron.core.transformer.torch_norm import WrappedTorchNorm
warnings.warn(f'Apex is not installed. Falling back to Torch Norm')
LNImpl = WrappedTorchNorm
def get_layer_spec(is_vit, normalization) -> ModuleSpec:
attn_mask_type = AttnMaskType.no_mask if is_vit else AttnMaskType.causal
if normalization == "LayerNorm":
norm = LNImpl
elif normalization == "RMSNorm":
if HAVE_TE:
norm = TENorm
else:
version = torch.__version__.split('.')
version_geq_2_4 = (
int(version[0]) > 2
or (
int(version[0]) == 2
and int(version[1]) >= 4
)
)
assert version_geq_2_4, "Torch version >= 2.4.0 is required for RMSNorm"
if HAVE_APEX:
warnings.warn(f'Apex does not support RMSNorm. Falling back to Torch Norm')
norm = WrappedTorchNorm
else:
raise RuntimeError("unknown normalization", normalization)
mlp = get_mlp_module_spec(use_te=False) # doesn't include norm.
return ModuleSpec(
module=TransformerLayer,
submodules=TransformerLayerSubmodules(
input_layernorm=norm,
self_attention=ModuleSpec(
module=SelfAttention,
params={"attn_mask_type": attn_mask_type},
submodules=SelfAttentionSubmodules(
linear_qkv=ColumnParallelLinear,
core_attention=DotProductAttention,
linear_proj=RowParallelLinear,
q_layernorm=IdentityOp,
k_layernorm=IdentityOp,
),
),
self_attn_bda=get_bias_dropout_add,
pre_mlp_layernorm=norm,
mlp=mlp,
mlp_bda=get_bias_dropout_add,
),
)
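# Illustrative usage: get_layer_spec(is_vit=True, normalization="LayerNorm") returns a
# TransformerLayer spec built from plain megatron-core modules (no Transformer Engine),
# while get_layer_spec_te below builds the Transformer Engine based equivalent.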
def get_layer_spec_te(is_vit=False) -> ModuleSpec:
attn_mask_type = AttnMaskType.no_mask if is_vit else AttnMaskType.causal
mlp = get_norm_mlp_module_spec_te()
return ModuleSpec(
module=TransformerLayer,
submodules=TransformerLayerSubmodules(
self_attention=ModuleSpec(
module=SelfAttention,
params={"attn_mask_type": attn_mask_type},
submodules=SelfAttentionSubmodules(
linear_qkv=TELayerNormColumnParallelLinear,
core_attention=TEDotProductAttention,
linear_proj=TERowParallelLinear,
q_layernorm=IdentityOp,
k_layernorm=IdentityOp,
),
),
self_attn_bda=get_bias_dropout_add,
pre_mlp_layernorm=IdentityOp,
mlp=mlp,
mlp_bda=get_bias_dropout_add,
),
)
def get_mlp_module_spec(use_te: bool = True) -> ModuleSpec:
# Dense MLP w/ or w/o TE modules.
return ModuleSpec(
module=MLP,
submodules=MLPSubmodules(
linear_fc1=TEColumnParallelLinear if use_te else ColumnParallelLinear,
linear_fc2=TERowParallelLinear if use_te else RowParallelLinear,
),
)
def get_norm_mlp_module_spec_te() -> ModuleSpec:
return ModuleSpec(
module=MLP,
submodules=MLPSubmodules(
linear_fc1=TELayerNormColumnParallelLinear, linear_fc2=TERowParallelLinear
),
)
{
"COMMENT": "Sources for these prompts include https://huggingface.co/datasets/liuhaotian/LLaVA-Pretrain/viewer and https://huggingface.co/datasets/HuggingFaceM4/M3IT",
"Captioning": {
"raw": [
"Can you briefly explain what you see in the image?",
"Describe what's happening in this image in one short sentence.",
"Write a short caption that accurately represents the content of this image.",
"Please generate a descriptive caption for the image provided.",
"How would you summarize the scene depicted in the picture in short?",
"Describe the image briefly.",
"Write a succinct description of the image, capturing its main components, the relationships between them, and any notable details.",
"Create a concise caption that accurately describes the main elements in the image provided.",
"Write a brief, yet comprehensive, description of the image.",
"Describe the image in a clear and concise manner.",
"For the given image, provide a one-sentence summary that captures the most important details.",
"Generate a short caption for the picture.",
"Write a short and informative description that highlights the primary subjects and actions occurring in the given image.",
"Provide a concise and informative caption for the image, focusing on the primary subjects.",
"Write a clear description of the image, make sure the key features are well covered.",
"Offer a succinct explanation of the picture presented."
]
},
"CaptioningPretraining": {
"raw": [
"Generate a short caption of the image.",
"Describe the image concisely.",
"Provide a brief description of the given image."
],
"llava": [
"Give a brief description of image.",
"Give a brief description of the image.",
"Provide a brief description of the given image.",
"Provide a one-sentence caption for the provided image.",
"Write a terse but informative summary of the picture.",
"Describe the image concisely.",
"Generate a clear and concise summary of the photo."
]
},
"OCR": {
"raw": [
"Can you read the text from image and output here?",
"Extract and document the text from the provided image.",
"Converting the text embedded in this image into a readable document.",
"Transcribe all the text you find.",
"Can you extract all visible text from the image here?"
]
}
}
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import warnings
from copy import deepcopy
import torch
from config import get_language_model_config, get_vision_model_config, get_vision_projection_config
from layer_specs import get_layer_spec, get_layer_spec_te, get_mlp_module_spec, get_norm_mlp_module_spec_te
from megatron.core.models.multimodal.llava_model import IMAGE_TOKEN, LLaVAModel
from megatron.core.models.vision.clip_vit_model import get_num_image_embeddings
from megatron.training import get_args, get_tokenizer, print_rank_0
from megatron.training.arguments import core_transformer_config_from_args
def model_provider(
pre_process=True, post_process=True, add_encoder=True, add_decoder=True, parallel_output=True
) -> LLaVAModel:
"""Builds the model.
Args:
pre_process (bool): Include the embedding layer in the gpt decoder (used with pipeline parallelism). Defaults to True.
post_process (bool): Include an output layer and a layernorm in the gpt decoder (used with pipeline parallelism). Defaults to True.
add_encoder (bool): Construct the encoder module (used with pipeline parallelism). Defaults to True. When we use pipelining, the encoder
will live on only a subset of the pipeline stages (specifically, only the first stage).
add_decoder (bool): Construct the decoder module (used with pipeline parallelism). Defaults to True. When we use pipelining, the decoder
will live on only a subset of the pipeline stages (specifically, every stage after the first one).
parallel_output (bool): Enable parallel model output.
Returns:
model: A multimodal model.
"""
args = get_args()
assert args.ckpt_format == 'torch', "Only ckpt-format torch is supported for VLM training currently."
assert args.encoder_pipeline_model_parallel_size <= 1, "LLaVA does not support pp>1 for encoder on its own pipeline rank"
use_te = args.use_te
print_rank_0('building a multimodal model ...')
num_image_embeddings = get_num_image_embeddings(
args.img_h,
args.img_w,
args.patch_dim,
args.vision_model_type,
args.disable_vision_class_token,
1,
args.pixel_shuffle,
args.use_tile_tags,
)
old_seq_length = args.seq_length
args.seq_length = args.encoder_seq_length = num_image_embeddings
if torch.distributed.get_rank() == 0 and old_seq_length != args.seq_length:
warnings.warn(
f"Changed seq_length and encoder_seq_length (vision model sequence length) from {old_seq_length} to num_image_tokens ({num_image_embeddings})"
)
max_num_image_embeddings = (args.max_num_tiles + int(args.use_thumbnail)) * num_image_embeddings
assert (
args.decoder_seq_length is not None
), "Please provide --decoder-seq-length to set the language model sequence length"
assert (
args.decoder_seq_length > max_num_image_embeddings
), "Language model sequence length must be greater than the maximum number of image embeddings"
if args.decoder_seq_length > args.max_position_embeddings:
args.max_position_embeddings = args.decoder_seq_length
warnings.warn(
f"Expanded max_position_embeddings to {args.max_position_embeddings} to accommodate the maximum language model sequence length"
)
base_config = core_transformer_config_from_args(get_args())
base_config.language_model_type = args.language_model_type
base_config.vision_model_type = args.vision_model_type
base_config.calculate_per_token_loss = True
language_config = deepcopy(base_config)
language_config = get_language_model_config(language_config)
if use_te:
language_transformer_layer_spec = get_layer_spec_te(
is_vit=False
) # TENorm detects LayerNorm/RMS automatically.
else:
language_transformer_layer_spec = get_layer_spec(
is_vit=False, normalization=language_config.normalization
)
vision_config = deepcopy(base_config)
vision_config = get_vision_model_config(
vision_config, apply_query_key_layer_scaling=args.apply_query_key_layer_scaling
)
vision_model_type = args.vision_model_type
if vision_model_type in ["clip", "siglip"]:
if use_te:
vision_transformer_layer_spec = get_layer_spec_te(
is_vit=True
) # TENorm detects LayerNorm/RMS automatically.
else:
vision_transformer_layer_spec = get_layer_spec(
is_vit=True, normalization=vision_config.normalization
)
elif vision_model_type == "internvit":
from nvlm.internvit import get_internvit_layer_spec
vision_transformer_layer_spec = get_internvit_layer_spec(use_te=use_te)
else:
raise RuntimeError("unsupported vision model type", vision_model_type)
vision_projection_config = deepcopy(base_config)
vision_projection_config = get_vision_projection_config(
vision_projection_config, language_config.hidden_size
)
# --encoder-pipeline-model-parallel-size 1 will enable a separate pipeline stage for the vision model.
if args.encoder_pipeline_model_parallel_size > 0:
assert (
args.encoder_pipeline_model_parallel_size == 1
), "vision model and projection can only live on 1 pipeline stage."
if args.encoder_tensor_model_parallel_size > 0:
vision_config.tensor_model_parallel_size = args.encoder_tensor_model_parallel_size
vision_projection_config.tensor_model_parallel_size = (
args.encoder_tensor_model_parallel_size
)
# Make sure vision model pipeline parallel size is not inherited from the language model pipeline parallel size.
# 0 is not a valid value for the config, hence max(1, ...).
vision_config.pipeline_model_parallel_size = max(1, args.encoder_pipeline_model_parallel_size)
vision_projection_config.pipeline_model_parallel_size = vision_config.pipeline_model_parallel_size
# Make sure the vision model does not inherit first and last pipeline num layers from the language model.
vision_config.first_pipeline_num_layers = vision_config.last_pipeline_num_layers = None
if vision_projection_config.normalization:
vision_projection_layer_spec = get_norm_mlp_module_spec_te().submodules
else:
vision_projection_layer_spec = get_mlp_module_spec(use_te=use_te).submodules
# Toggle --recompute* for the vision and language model separately.
if args.recompute_vision:
if vision_config.recompute_method is not None and vision_config.recompute_granularity is not None:
vision_config.recompute_num_layers = vision_config.num_layers
else:
vision_config.recompute_granularity = None
vision_config.recompute_method = None
vision_config.recompute_num_layers = None
vision_projection_config.recompute_granularity = None
vision_projection_config.recompute_method = None
vision_projection_config.recompute_num_layers = None
tokenizer = get_tokenizer()
image_token_index = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)
tile_tags = _get_tile_tags(args, tokenizer)
model = LLaVAModel(
language_transformer_config=language_config,
language_transformer_layer_spec=language_transformer_layer_spec,
language_vocab_size=args.padded_vocab_size,
language_max_sequence_length=args.decoder_seq_length,
vision_transformer_config=vision_config,
vision_transformer_layer_spec=vision_transformer_layer_spec,
drop_vision_class_token=args.disable_vision_class_token,
vision_projection_config=vision_projection_config,
vision_projection_layer_spec=vision_projection_layer_spec,
vision_projection_type="mlp",
allow_missing_vision_projection_checkpoint=args.allow_missing_vision_projection_checkpoint,
parallel_output=parallel_output,
language_position_embedding_type=args.position_embedding_type,
language_rotary_percent=args.rotary_percent,
pre_process=pre_process,
post_process=post_process,
add_encoder=add_encoder,
add_decoder=add_decoder,
img_h=args.img_h,
img_w=args.img_w,
patch_dim=args.patch_dim,
language_rotary_base=args.rotary_base,
language_rope_scaling=args.use_rope_scaling,
image_token_index=image_token_index,
pixel_shuffle=args.pixel_shuffle,
tile_tags=tile_tags,
)
model.freeze(
freeze_language_model=args.freeze_LM,
freeze_vision_model=args.freeze_ViT,
freeze_vision_projection=False,
)
return model
def _get_tile_tags(args, tokenizer):
"""Tile tags are used in NVLM to surround image tiles with text tags."""
if not args.use_tile_tags:
return None
# We expect all tags to have the same tokenized length.
thumbnail_tag_text = "<tile_global_thumbnail>"
if args.tokenizer_prompt_format == "nvlm-yi-34b":
thumbnail_tag_text = "<tile_global>"
assert args.max_num_tiles <= 6, "Up to 6 tile tags used"
tile_tags_text = [f"<tile_{i}>" for i in range(1, args.max_num_tiles + 1)] + [thumbnail_tag_text]
start_idx = 0
if tokenizer._prompt_config.has_bos:
start_idx = 1
# Convert to tokens [num_tiles, tile_seq_len].
tile_tags = [tokenizer.tokenize(t)[start_idx:] for t in tile_tags_text]
return tile_tags
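# For example, with --max-num-tiles 4 and the default prompt format, the tag texts are
# ["<tile_1>", "<tile_2>", "<tile_3>", "<tile_4>", "<tile_global_thumbnail>"], and each
# entry of the returned list holds that tag's token ids (with any BOS token stripped).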
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import argparse
import os
import torch
import clip
def convert(download_root, output_path, tensor_parallel_size, use_te):
device = "cuda"
model, _ = clip.load("ViT-L/14@336px", device=device, download_root=download_root)
state_dict = model.state_dict()
new_state_dicts = [{"model": dict()} for _ in range(tensor_parallel_size)]
# Indices from mapping pytorch multihead attention to megatron.
kv_channels = 64
hidden_dim = 1024
num_heads = 16
indices = []
for i in range(num_heads):
lb = i * kv_channels
ub = (i + 1) * kv_channels
indices.append(torch.arange(lb, ub, dtype=torch.int))
indices.append(torch.arange(hidden_dim + lb, hidden_dim + ub, dtype=torch.int))
indices.append(torch.arange(2 * hidden_dim + lb, 2 * hidden_dim + ub, dtype=torch.int))
indices = torch.cat(indices)
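# The concatenated index tensor permutes the rows of PyTorch's stacked [Q; K; V]
# in_proj weight/bias into the per-head interleaved [q_i, k_i, v_i] layout expected by
# megatron's fused linear_qkv.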
for name, tensor in state_dict.items():
# Skip text model.
if "visual" not in name:
continue
# Skip final layers not used in our model.
if name == "visual.proj" or "ln_post" in name:
continue
# Map parameter names to ones used in megatron.
new_name = ""
new_tensor = tensor
if new_tensor.dtype == torch.float16:
new_tensor = new_tensor.to(torch.float32)
# This is used for chunking some tensors to target tensor parallel size.
chunk_dim = None
if "class_embedding" in name:
new_name = "class_token"
# Our model uses a class token that is already expanded to the input dimensions.
new_tensor = new_tensor.expand(1, 1, -1)
elif "positional_embedding" in name:
new_name = "position_embeddings.weight"
elif "conv1" in name:
new_name = "conv1.weight"
elif "ln_pre.weight" in name:
new_name = "ln_pre.weight"
elif "ln_pre.bias" in name:
new_name = "ln_pre.bias"
elif "transformer.resblocks" in name:
layer_idx = name.split(".")[3]
base = f"decoder.layers.{layer_idx}"
if "attn.in_proj_weight" in name:
new_name = f"{base}.self_attention.linear_qkv.weight"
new_tensor = new_tensor[indices]
chunk_dim = 0
elif "attn.in_proj_bias" in name:
new_name = f"{base}.self_attention.linear_qkv.bias"
new_tensor = new_tensor[indices]
chunk_dim = 0
elif "attn.out_proj.weight" in name:
new_name = f"{base}.self_attention.linear_proj.weight"
chunk_dim = 1
elif "attn.out_proj.bias" in name:
new_name = f"{base}.self_attention.linear_proj.bias"
elif "ln_1.weight" in name:
new_name = f"{base}.input_layernorm.weight"
if use_te:
new_name = f"{base}.self_attention.linear_qkv.layer_norm_weight"
elif "ln_1.bias" in name:
new_name = f"{base}.input_layernorm.bias"
if use_te:
new_name = f"{base}.self_attention.linear_qkv.layer_norm_bias"
elif "mlp.c_fc.weight" in name:
new_name = f"{base}.mlp.linear_fc1.weight"
chunk_dim = 0
elif "mlp.c_fc.bias" in name:
new_name = f"{base}.mlp.linear_fc1.bias"
chunk_dim = 0
elif "mlp.c_proj.weight" in name:
new_name = f"{base}.mlp.linear_fc2.weight"
chunk_dim = 1
elif "mlp.c_proj.bias" in name:
new_name = f"{base}.mlp.linear_fc2.bias"
elif "ln_2.weight" in name:
new_name = f"{base}.pre_mlp_layernorm.weight"
if use_te:
new_name = f"{base}.mlp.linear_fc1.layer_norm_weight"
elif "ln_2.bias" in name:
new_name = f"{base}.pre_mlp_layernorm.bias"
if use_te:
new_name = f"{base}.mlp.linear_fc1.layer_norm_bias"
assert new_name != "", f"unexpected layer name {name}"
if chunk_dim is None:
new_tensors = [new_tensor for _ in range(tensor_parallel_size)]
else:
new_tensors = torch.chunk(new_tensor, tensor_parallel_size, dim=chunk_dim)
for i in range(tensor_parallel_size):
# chunk() creates a view of a bigger tensor. clone() is used here to avoid excessive storage.
new_state_dicts[i]["model"][new_name] = new_tensors[i].clone()
# TE sets _extra_state (for FP8 purposes), so set an empty one here for compatibility.
extra_state_layers = ("linear_qkv", "linear_proj", "linear_fc1", "linear_fc2")
is_extra_state_layer = any([l in new_name for l in extra_state_layers])
if use_te and is_extra_state_layer:
layer = new_name.split(".")[-2]
if layer in extra_state_layers:
extra_state_name = (
new_name[: new_name.rfind(".") + 1] + "_extra_state"
) # Replace the weight name.
new_state_dicts[i]["model"][extra_state_name] = None
for i in range(tensor_parallel_size):
output_dir_tp = os.path.join(output_path, "iter_0000001", f"mp_rank_0{i}")
os.makedirs(output_dir_tp)
output_path_tp = os.path.join(output_dir_tp, "model_optim_rng.pt")
torch.save(new_state_dicts[i], output_path_tp)
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="""
Convert OpenAI CLIP VIT weights to megatron format.
Example usage:
python clip_converter.py --download-root /some/download/folder --output /some/output/folder --tensor-parallel-size 4
""",
formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.add_argument(
"--download-root", type=str, required=True, help="Download folder for OpenAI CLIP weights"
)
parser.add_argument(
"--output", type=str, required=True, help="output directory for megatron state dict file(s)"
)
parser.add_argument(
"--tensor-parallel-size", type=int, default=1, help="model tensor parallel size"
)
parser.add_argument("--use-te", action="store_true", help="Use Transformer Engine")
args = parser.parse_args()
convert(args.download_root, args.output, args.tensor_parallel_size, args.use_te)
print("done.")
import argparse
import os
import torch
from transformers import AutoModel
def convert(model_name, output_path, tensor_parallel_size, use_te):
"""Convert InternViT HF checkpoint to mcore."""
hf_model = AutoModel.from_pretrained(
model_name,
trust_remote_code=True
)
hf_state_dict = hf_model.state_dict()
new_state_dicts = [{"model": dict()} for _ in range(tensor_parallel_size)]
hidden_size = 3200
num_heads = 25
dim = 128
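# Build a row permutation that regroups the HF checkpoint's stacked [Q; K; V] qkv weight
# (each block num_heads * dim rows) into the per-head interleaved [q_j, k_j, v_j] layout
# used by megatron's fused linear_qkv.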
order = torch.ones(3 * hidden_size).long()
for j in range(num_heads):
for i in range(dim):
order[i + dim*3*j] = j*dim+i
order[dim + i + dim*3*j] = j*dim+i+num_heads*dim
order[dim*2 + i + dim*3*j] = j*dim+i+num_heads*dim*2
for name, tensor in hf_state_dict.items():
# Map parameter names to ones used in megatron.
new_name = ""
new_tensor = tensor
# This is used for chunking some tensors to target tensor parallel size.
chunk_dim = None
if "embeddings.class_embedding" in name:
new_name = "class_token"
elif "embeddings.patch_embedding.weight" in name:
new_name = "conv1.weight"
elif "embeddings.patch_embedding.bias" in name:
new_name = "conv1.bias"
elif "embeddings.position_embedding" in name:
new_name = "position_embeddings.weight"
new_tensor = new_tensor.squeeze(0)
elif "encoder.layers" in name:
layer_idx = name.split(".")[2]
base = f"decoder.layers.{layer_idx}"
head_dim = 128
if tensor_parallel_size == 1:
num_padded_heads = 25
elif tensor_parallel_size == 8:
# Note: 25 is not divisible by 8 and we don't currently support uneven heads split with tensor parallelism.
# So we pad with dummy all-zero heads. Please use a nice even number of attention heads in your model.
num_padded_heads = 32
else:
raise NotImplementedError("invalid tensor parallel size value:", tensor_parallel_size)
if "ls1" in name:
new_name = f"{base}.ls1"
elif "ls2" in name:
new_name = f"{base}.ls2"
elif "attn.qkv.weight" in name:
new_name = f"{base}.self_attention.linear_qkv.weight"
num_tensors = 3
padded_dim = head_dim * num_padded_heads * num_tensors
padded_tensor = torch.zeros((padded_dim, new_tensor.shape[-1]), dtype=new_tensor.dtype, device=new_tensor.device)
padded_tensor[:new_tensor.shape[0], :] = new_tensor[order]
new_tensor = padded_tensor
chunk_dim = 0
elif "attn.q_norm.weight" in name:
new_name = f"{base}.self_attention.q_layernorm.weight"
num_tensors = 1
padded_dim = head_dim * num_padded_heads * num_tensors
padded_tensor = torch.zeros(padded_dim, dtype=new_tensor.dtype, device=new_tensor.device)
padded_tensor[:new_tensor.shape[0]] = new_tensor
new_tensor = padded_tensor
chunk_dim = 0
elif "attn.k_norm.weight" in name:
new_name = f"{base}.self_attention.k_layernorm.weight"
num_tensors = 1
padded_dim = head_dim * num_padded_heads * num_tensors
padded_tensor = torch.zeros(padded_dim, dtype=new_tensor.dtype, device=new_tensor.device)
padded_tensor[:new_tensor.shape[0]] = new_tensor
new_tensor = padded_tensor
chunk_dim = 0
elif "attn.proj.weight" in name:
new_name = f"{base}.self_attention.linear_proj.weight"
num_tensors = 1
padded_dim = head_dim * num_padded_heads * num_tensors
padded_tensor = torch.zeros((new_tensor.shape[0], padded_dim), dtype=new_tensor.dtype, device=new_tensor.device)
padded_tensor[:, :new_tensor.shape[-1]] = new_tensor
new_tensor = padded_tensor
chunk_dim = 1
elif "attn.proj.bias" in name:
new_name = f"{base}.self_attention.linear_proj.bias"
elif "mlp.fc1.weight" in name:
new_name = f"{base}.mlp.linear_fc1.weight"
chunk_dim = 0
elif "mlp.fc1.bias" in name:
new_name = f"{base}.mlp.linear_fc1.bias"
chunk_dim = 0
elif "mlp.fc2.weight" in name:
new_name = f"{base}.mlp.linear_fc2.weight"
chunk_dim = 1
elif "mlp.fc2.bias" in name:
new_name = f"{base}.mlp.linear_fc2.bias"
elif "norm1" in name:
new_name = f"{base}.input_layernorm.weight"
elif "norm2" in name:
new_name = f"{base}.pre_mlp_layernorm.weight"
else:
raise RuntimeError("unexpected transformer layer name", name)
else:
raise RuntimeError("unexpected layer name", name)
assert new_name != "", f"unexpected layer name {name}"
# TE sets _extra_state (for FP8 purposes), so set an empty one here for compatibility.
extra_state_layers = ("linear_qkv", "linear_proj", "linear_fc1", "linear_fc2")
is_extra_state_layer = any([l in new_name for l in extra_state_layers])
if use_te and is_extra_state_layer:
layer = new_name.split(".")[-2]
if layer in extra_state_layers:
extra_state_name = (
new_name[: new_name.rfind(".") + 1] + "_extra_state"
) # Replace the weight name.
for i in range(tensor_parallel_size):
new_state_dicts[i]["model"][extra_state_name] = None
if chunk_dim is None:
new_tensors = [new_tensor for _ in range(tensor_parallel_size)]
else:
new_tensors = torch.chunk(new_tensor, tensor_parallel_size, dim=chunk_dim)
for i in range(tensor_parallel_size):
new_state_dicts[i]["model"][new_name] = new_tensors[i].clone()
for i in range(tensor_parallel_size):
output_dir_tp = os.path.join(output_path, f"iter_0000001/mp_rank_0{i}")
os.makedirs(output_dir_tp, exist_ok=True)
output_path_tp = os.path.join(output_dir_tp, "model_optim_rng.pt")
torch.save(new_state_dicts[i], output_path_tp)
print("saved file", output_path_tp)
print("done")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="InternVIT HuggingFace to Mcore converter")
parser.add_argument("--model-name", type=str, default="OpenGVLab/InternViT-6B-448px-V1-5", help="Model name in HuggingFace")
parser.add_argument("--output-dir", type=str, required=True, help="Output directory for the mcore model.")
parser.add_argument("--use-te", action="store_true", default=True)
parser.add_argument("--tensor-parallel-size", type=int, required=True)
args = parser.parse_args()
convert(args.model_name, args.output_dir, args.tensor_parallel_size, args.use_te)
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import argparse
import os
from transformers import PaliGemmaForConditionalGeneration
import torch
def convert(output_path, tensor_parallel_size, use_te):
device = "cuda"
model_id = "google/paligemma-3b-pt-448"
model = PaliGemmaForConditionalGeneration.from_pretrained(model_id).eval()
model = model.to(device)
print(model.config)
for name, tensor in model.state_dict().items():
if "vision_model" not in name:
continue
shape_str = "(" + ", ".join([str(x) for x in tensor.shape]) + ")"
print(f"{name:<75} {shape_str:>20}")
state_dict = model.state_dict()
new_state_dicts = [{"model": dict()} for _ in range(tensor_parallel_size)]
def add_chunck_tensor(new_tensor, new_name, chunk_dim=None):
if chunk_dim is None:
new_tensors = [new_tensor for _ in range(tensor_parallel_size)]
else:
new_tensors = torch.chunk(new_tensor, tensor_parallel_size, dim=chunk_dim)
for i in range(tensor_parallel_size):
# chunk() creates a view of a bigger tensor. clone() is used here to avoid excessive storage.
new_state_dicts[i]["model"][new_name] = new_tensors[i].clone()
# TE sets _extra_state (for FP8 purposes), so set an empty one here for compatibility.
extra_state_layers = ("linear_qkv", "linear_proj", "linear_fc1", "linear_fc2")
is_extra_state_layer = any([l in new_name for l in extra_state_layers])
if use_te and is_extra_state_layer:
layer = new_name.split(".")[-2]
if layer in extra_state_layers:
extra_state_name = (
new_name[: new_name.rfind(".") + 1] + "_extra_state"
) # Replace the weight name.
new_state_dicts[i]["model"][extra_state_name] = None
for name, tensor in state_dict.items():
if tensor.dtype == torch.float16:
state_dict[name] = tensor.to(torch.float32)
add_chunck_tensor(
state_dict["vision_tower.vision_model.embeddings.position_embedding.weight"],
"position_embeddings.weight")
add_chunck_tensor(
state_dict["vision_tower.vision_model.embeddings.patch_embedding.weight"],
"conv1.weight")
add_chunck_tensor(
state_dict["vision_tower.vision_model.embeddings.patch_embedding.bias"],
"conv1.bias")
head_dim = 72
num_head = 16
for layer_idx in range(27):
origin_base = f"vision_tower.vision_model.encoder.layers.{layer_idx}"
target_base = f"decoder.layers.{layer_idx}"
for param_type in ["weight", "bias"]:
# QKV
q_proj_params = state_dict[f"{origin_base}.self_attn.q_proj.{param_type}"]
k_proj_params = state_dict[f"{origin_base}.self_attn.k_proj.{param_type}"]
v_proj_params = state_dict[f"{origin_base}.self_attn.v_proj.{param_type}"]
# Do some tensor manipulation because megatron expects a single fused QKV
# projection tensor in the order
# [(Q1, K1, V1), (Q2, K2, V2), ...] where Qi is the query projection of the
# i-th head, each slice having head_dim rows.
new_tensor = torch.concatenate([
q_proj_params.view(num_head, head_dim, -1),
k_proj_params.view(num_head, head_dim, -1),
v_proj_params.view(num_head, head_dim, -1)], axis=1).view(
3*head_dim*num_head, -1)
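# Sanity check on shapes: with num_head=16 and head_dim=72 the per-head views are (16, 72, hidden);
# concatenating Q/K/V along dim 1 and flattening yields a (3*16*72, hidden) = (3456, hidden) weight
# (the bias becomes a length-3456 vector after taking column 0 below).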
if param_type == "bias":
new_tensor = new_tensor[:, 0]
new_name = f"{target_base}.self_attention.linear_qkv.{param_type}"
add_chunck_tensor(new_tensor, new_name, chunk_dim=0)
# linear_proj
add_chunck_tensor(
state_dict[f"{origin_base}.self_attn.out_proj.{param_type}"],
f"{target_base}.self_attention.linear_proj.{param_type}",
chunk_dim=1 if param_type == "weight" else None)
# layer_norm
new_name = f"{target_base}.input_layernorm.{param_type}"
if use_te:
new_name = f"{target_base}.self_attention.linear_qkv.layer_norm_{param_type}"
add_chunck_tensor(
state_dict[f"{origin_base}.layer_norm1.{param_type}"],
new_name)
# FC 1
add_chunck_tensor(
state_dict[f"{origin_base}.mlp.fc1.{param_type}"],
f"{target_base}.mlp.linear_fc1.{param_type}",
chunk_dim=0)
# FC 2
add_chunck_tensor(
state_dict[f"{origin_base}.mlp.fc2.{param_type}"],
f"{target_base}.mlp.linear_fc2.{param_type}",
chunk_dim=1 if param_type=="weight" else None)
# layer_norm
new_name = f"{target_base}.pre_mlp_layernorm.{param_type}"
if use_te:
new_name = f"{target_base}.mlp.linear_fc1.layer_norm_{param_type}"
add_chunck_tensor(
state_dict[f"{origin_base}.layer_norm2.{param_type}"],
new_name)
add_chunck_tensor(
state_dict["vision_tower.vision_model.post_layernorm.weight"],
"ln_post.weight")
add_chunck_tensor(
state_dict["vision_tower.vision_model.post_layernorm.bias"],
"ln_post.bias")
for i in range(tensor_parallel_size):
output_dir_tp = os.path.join(output_path, "iter_0000001", f"mp_rank_0{i}")
os.makedirs(output_dir_tp, exist_ok=True)
output_path_tp = os.path.join(output_dir_tp, "model_optim_rng.pt")
torch.save(new_state_dicts[i], output_path_tp)
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="""
Convert SigLIP weights to megatron format.
Example usage:
python siglip_converter.py --tensor-parallel-size 4 --output google_paligemma_3b_pt_44_mcore_tp_4 --use-te
examples/multimodal/combine_mistral_clip.sh Mistral-7B-Instruct-v0.3-mcore-tp4 google_paligemma_3b_pt_44_mcore_tp_4 mistral_7b_instruct_v0p3_google_paligemma_3b_pt_44_mcore_tp_4
""",
formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.add_argument(
"--output", type=str, required=True, help="output directory for megatron state dict file(s)"
)
parser.add_argument(
"--tensor-parallel-size", type=int, default=1, help="model tensor parallel size"
)
parser.add_argument("--use-te", action="store_true", help="Use Transformer Engine")
args = parser.parse_args()
convert(args.output, args.tensor_parallel_size, args.use_te)
print("done.")
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import argparse
import os
import sys
# Add megatron and the multimodal example to the path.
sys.path.append(
os.path.abspath(
os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir, os.path.pardir)
)
)
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)))
import torch
from transformers import AutoModel
from examples.multimodal.model import model_provider
from examples.multimodal.multimodal_args import add_multimodal_extra_args
from megatron.training import get_model
from megatron.training.checkpointing import load_checkpoint
from megatron.training.initialize import initialize_megatron
def run_mcore_vision(model_path):
"""Run mcore vision model."""
os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
# Megatron has some mandatory flags.
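# The language-model sizes below (num-layers, hidden-size, vocab-size, etc.) are tiny placeholders:
# only the vision model is constructed and loaded here, and its architecture is selected by --vision-model-type.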
sys.argv = [
"ignore_me.py",
"--micro-batch-size=1",
"--num-layers=2",
"--vision-model-type=internvit",
"--language-model-type=mistral_7b",
"--tokenizer-prompt-format=mistral",
"--tokenizer-type=MultimodalTokenizer",
"--tokenizer-model=mistralai/Mistral-7B-Instruct-v0.3",
"--vocab-size=1024",
"--hidden-size=64",
"--num-attention-heads=8",
"--seq-length=1024",
"--decoder-seq-length=2048",
"--max-position-embeddings=2048",
"--bf16",
"--img-h=448",
"--img-w=448",
"--patch-dim=14",
"--tensor-model-parallel-size=8",
"--use-te",
f"--pretrained-checkpoint={model_path}",
]
initialize_megatron(extra_args_provider=add_multimodal_extra_args)
def wrapped_model_provider(pre_process, post_process):
return model_provider(pre_process, post_process, parallel_output=False)
# Set up model and load checkpoint.
model = get_model(wrapped_model_provider, wrap_with_ddp=False)
vision_model = model[0].module.vision_model
load_checkpoint([vision_model], None, None)
vision_model.eval()
images = torch.ones((1, 3, 448, 448), dtype=torch.bfloat16, device="cuda")
output = vision_model(images)
return output
def run_hf_vision(model_name):
"""Run HF vision model."""
model = (
AutoModel.from_pretrained(model_name, torch_dtype=torch.bfloat16, trust_remote_code=True)
.cuda()
.eval()
)
images = torch.ones((1, 3, 448, 448), dtype=torch.bfloat16, device="cuda")
outputs = model(images, return_dict=True)
return outputs
def main(mcore_model, hf_model):
"""Compare vision model outputs between mcore and HF given the same fixed input."""
mcore = run_mcore_vision(mcore_model)
if torch.distributed.get_rank() == 0:
hf = run_hf_vision(hf_model)
hf = hf["last_hidden_state"]
# Compare logits. Due to different attention implementations and other details,
# there will be numerical differences.
diff = (mcore - hf).abs()
mean_diff = diff.mean().item()
max_diff = diff.max().item()
print(f"mean diff {mean_diff}, max diff {max_diff}")
assert mean_diff < 0.1, "mean output difference is greater than expected"
assert max_diff < 50, "max output difference is greater than expected"
print("lgtm")
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Check mcore vision model output vs. HF numerically.",
formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.add_argument(
"--mcore-model", type=str, required=True, help="directory for mcore model weights"
)
parser.add_argument("--hf-model", type=str, required=True, help="Model name in HF")
args = parser.parse_args()
main(args.mcore_model, args.hf_model)
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
from megatron.core.models.multimodal.llava_model import IMAGE_TOKEN
def add_multimodal_extra_args(parser):
"""Extra arguments."""
group = parser.add_argument_group(title='multimodal arguments')
group.add_argument('--dataset-config', type=str, default=None)
group.add_argument("--prompt-path", type=str, default=None)
group.add_argument('--freeze-LM', action='store_true', default=False)
group.add_argument('--freeze-ViT', action='store_true', default=False)
group.add_argument('--language-model-type', type=str, required=True)
group.add_argument('--vision-model-type', type=str, default="clip")
group.add_argument("--disable-vision-class-token", action="store_true", default=False)
group.add_argument(
"--allow-missing-vision-projection-checkpoint", action="store_true", default=False
)
group.add_argument("--use-te", action="store_true", default=False)
group.add_argument(
"--dataloader-save", type=str, default=None, help="Energon dataloader state save path"
)
group.add_argument(
"--use-tiling", action="store_true", default=False, help="Use input image tiling"
)
group.add_argument("--max-num-tiles", type=int, default=1, help="Maximum number of image tiles")
group.add_argument(
"--use-thumbnail", action="store_true", default=False, help="Add image thumbnail as a tile"
)
group.add_argument(
"--dataloader-seq-length",
type=int,
help="Make dataloader to produce sequences of specific length.",
)
group.add_argument(
"--num-frames",
type=int,
default=1,
help="Number of frames to regularly sample from the video as input to the model.",
)
group.add_argument(
"--online-evaluation-config", type=str, help="Config file for online evaluation."
)
group.add_argument(
"--special-tokens",
nargs="*",
default=[IMAGE_TOKEN],
help="Special tokens used in the multimodal model",
)
group.add_argument(
"--tokenizer-prompt-format",
type=str,
choices=["mistral", "llama3", "chatml", "nvlm-yi-34b", "qwen2p0", "qwen2p5"],
required=True,
help="Prompt format to use with the tokenizer.",
)
group.add_argument("--pixel-shuffle", action="store_true", default=False)
group.add_argument(
"--image-tag-type",
type=str,
choices=["nvlm", "internvl", ""],
default="", # Default: Image tag not used.
help="Surround image tokens with tags.",
)
group.add_argument("--use-tile-tags", action="store_true", default=False, help="Use tile tags")
group.add_argument(
"--packing-buffer-size",
type=int,
default=None, # Packing is disabled by default.
help="Enable sample packing by setting the buffer size to > 0",
)
group.add_argument(
"--packing-seq-length", type=int, default=0, help="Packing sequence length. Must be > 0 if using packing."
)
group.add_argument(
"--recompute-vision", action="store_true", default=False, help="Enable activation checkpointing in the vision model"
)
return parser
NVLM
====
Please refer to the [NVLM paper](https://arxiv.org/pdf/2409.11402) for details.
*NOTE: VLMs in Megatron are under active development and are expected to change.*
# Checkpoints
NVLM 1.0 model weights are publicly available in HuggingFace and Megatron format.
- NVLM-1.0-D 72B [HuggingFace version](https://huggingface.co/nvidia/NVLM-D-72B)
- NVLM-1.0-D 72B [Megatron-Core version](https://huggingface.co/nvidia/NVLM-D-72B-mcore)
# Setup
## Docker image
Please use `examples/multimodal/Dockerfile`.
## Dataset preparation
Please refer to Tables 4 and 6 in the [NVLM paper](https://arxiv.org/pdf/2409.11402) for the full list of pretraining and SFT datasets.
Please refer to https://nvidia.github.io/Megatron-Energon/data_prep.html for instructions on preparing datasets in the Megatron Energon format.
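For example, assuming the `energon` CLI that ships with the Megatron-Energon package, a dataset stored as WebDataset shards can typically be prepared interactively with:
```
energon prepare <path to dataset>
```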
## Model conversion
### Vision model
NVLM 1.0 models use [OpenGVLab/InternViT-6B-448px-V1-5](https://huggingface.co/OpenGVLab/InternViT-6B-448px-V1-5) from HuggingFace.
Please download it and run the following command to convert it to Megatron format.
```
python examples/multimodal/model_converter/internvit_converter.py --output-dir <some output dir> --use-te --tensor-parallel-size 8
```
### 34B Language model
NVLM 1.0 34B starts from [NousResearch/Nous-Hermes-2-Yi-34B](https://huggingface.co/NousResearch/Nous-Hermes-2-Yi-34B) from HuggingFace.
Please download it and run the following command to convert it to Megatron format.
```
python tools/checkpoint/convert.py --bf16 --model-type GPT --loader llama_mistral --saver mcore --target-tensor-parallel-size 8 --checkpoint-type hf \
--load-dir <hf model directory> --save-dir <output dir> --tokenizer-model <hf model name/directory> \
--saver-transformer-impl transformer_engine --model-size yi-34B --make-vocab-size-divisible-by 1
```
### 72B Language model
NVLM 1.0 72B starts from [Qwen/Qwen2-72B-Instruct](https://huggingface.co/Qwen/Qwen2-72B-Instruct) from HuggingFace.
Please download it and run the following command to convert it to Megatron format.
```
python tools/checkpoint/convert.py --bf16 --model-type GPT --loader llama_mistral --saver mcore --target-tensor-parallel-size 8 --checkpoint-type hf \
--load-dir <hf model directory> --save-dir <output directory> --tokenizer-model <hf model name/directory> \
--saver-transformer-impl transformer_engine --model-size qwen2.5-72Bf
```
### Combined checkpoint
Combine the vision model checkpoint from [InternViT](#vision-model) with the [34B](#34b-language-model) or [72B](#72b-language-model) language model by running:
```
examples/multimodal/combine_lm_vision_checkpoints.sh <language model directory> <vision model directory> <output directory> nvlm
```
# Training
## 34B
1. Pretraining: please run `examples/multimodal/nvlm/pretrain_yi_34b_internvit_6b.sh`. Please use the InternViT + 34B [combined checkpoint](#combined-checkpoint) and tokenizer from HuggingFace.
2. SFT: please run `examples/multimodal/nvlm/sft_34b_internvit.sh` using the checkpoint from 1.
## 72B
1. Pretraining: please run `examples/multimodal/nvlm/pretrain_qwen20_72b_internvit_6b.sh`. Please use the InternViT + 72B [combined checkpoint](#combined-checkpoint) and tokenizer from HuggingFace.
2. Convert the pretraining checkpoint from 1. to have pipeline parallel size = 4 for SFT. Please run
```
python examples/multimodal/nvlm/pp_checkpoint_converter.py --input <pretrained checkpoint directory> \
--input-pipeline-parallel 1 --output <some output dir> --output-pipeline-parallel 4 \
--tensor-parallel 8
```
3. SFT: please run `examples/multimodal/nvlm/sft_qwen20_72b_internvit_6b.sh` using the checkpoint from 2.
4. To convert the checkpoint with pipeline parallel size = 4 back to 1 for evaluation, please run
```
python examples/multimodal/nvlm/pp_checkpoint_converter.py --input <sft checkpoint directory> \
--input-pipeline-parallel 4 --output <some output dir> --output-pipeline-parallel 1 \
--tensor-parallel 8
```
# Evaluation
Run the text generation script.
- 34B
```
examples/multimodal/nvlm/run_text_generation_yi_34b_internvit_6b.sh --input-image-path /path/to/input/images --output-path /some/output/directory \
--model-path /path/to/model.pt --gt-path /path/to/groundtruth/file --task generation-task-name --use-tiling
```
- 72B
```
examples/multimodal/nvlm/run_text_generation_qwen20_72b_internvit_6b.sh --input-image-path /path/to/input/images --output-path /some/output/directory \
--model-path /path/to/model.pt --gt-path /path/to/groundtruth/file --task generation-task-name --use-tiling
```
where `--task generation-task-name` is the name of the evaluation benchmark such as `captioning`, `MMMU` or `TextVQA`.
Then, run one of the evaluation scripts from `examples/multimodal`. For example
```
python examples/multimodal/evaluate_mmmu.py --input-path /output/directory/from/generation
```
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
""""
NOTE: NVLM uses InternViT with tensor parallel (TP) size = 8.
Since InternViT has 25 attention heads and Megatron currently requires the number of attention heads
to be divisible by the TP size, we add 7 dummy zero attention heads to have 32 attention heads.
This workaround requires some changes to how we compute RMSNorm, Attention etc.
Additionally, InternViT introduces some unique features like Layer Scaling.
Those code changes are gathered here.
"""
from functools import partial
from typing import Dict
import torch
from megatron.core.dist_checkpointing.mapping import ShardedStateDict
from megatron.core.extensions.transformer_engine import (
TEColumnParallelLinear,
TEDotProductAttention,
TERowParallelLinear,
)
from megatron.core.parallel_state import (
get_tensor_model_parallel_group,
get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size,
)
from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
from megatron.core.transformer.dot_product_attention import DotProductAttention
from megatron.core.transformer.enums import AttnMaskType
from megatron.core.transformer.mlp import MLP, MLPSubmodules
from megatron.core.transformer.module import MegatronModule
from megatron.core.transformer.spec_utils import ModuleSpec, build_module
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules
from megatron.core.transformer.utils import make_sharded_tensors_for_checkpoint
class InternViTRMSNorm(MegatronModule):
def __init__(
self,
config,
hidden_size: int,
eps: float = 1e-6,
sequence_parallel: bool = False,
compute_var: bool = False,
):
"""Custom RMSNorm for InternViT.
Args:
config (TransformerConfig): Config.
hidden_size (int): Input hidden size.
eps (float): epsilon to use for the norm, default to 1e-6
sequence_parallel (bool): Set to true if sequence parallelism is being used,
this marks the weights as needing to be allreduced.
compute_var (bool): If True, compute the variance statistic manually (gathered across tensor-parallel ranks while excluding the dummy heads).
"""
super().__init__(config=config)
self.config = config
self.eps = eps
self.weight = torch.nn.Parameter(torch.ones(hidden_size))
self._compute_var = compute_var
assert not sequence_parallel, "Sequence parallelism is not supported with InternViT."
setattr(self.weight, 'sequence_parallel', sequence_parallel)
def _norm(self, x, var):
if var is None:
var = x.pow(2).mean(-1, keepdim=True)
return x * torch.rsqrt(var + self.eps)
def forward(self, x):
"""Run RMSNorm with an option to compute custom statistic."""
var = None
if self._compute_var:
unpadded_hidden_size = self.config.hidden_size # 3200
max_dim = x.shape[-1] # 128
x = x.reshape(x.size(0), x.size(1), -1)
var = self._gather_var(x.float().pow(2), max_dim) / unpadded_hidden_size
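# _gather_var sums the squared values over the real (non-dummy) channels across TP ranks;
# dividing by the unpadded hidden size (3200) gives the variance over real channels only,
# so the zero-padded dummy heads do not dilute the statistic.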
output = self._norm(x.float(), var).type_as(x)
output = output * self.weight
if self._compute_var:
output = output.reshape(output.size(0), output.size(1), -1, max_dim)
return output
def _gather_var(self, input_, max_dim, valid_ranks=6):
"""Compute statistic across the non-dummy heads."""
world_size = get_tensor_model_parallel_world_size()
assert world_size == 8, "tested only with TP=8"
# Size and dimension.
last_dim = input_.dim() - 1
rank = get_tensor_model_parallel_rank()
if rank < valid_ranks: # Ranks 0-5 have 24 non-dummy attention heads.
var = input_.sum(-1, keepdim=True)
elif rank == valid_ranks: # Rank 6 has 1 non-dummy attention head.
var = input_[..., :max_dim].sum(-1, keepdim=True)
else:
var = input_.sum(-1, keepdim=True) * 0.0 # Zero-out the dummy heads.
tensor_list = [torch.empty_like(var) for _ in range(world_size)]
tensor_list[rank] = var
torch.distributed.all_gather(tensor_list, var, group=get_tensor_model_parallel_group())
output = torch.cat(tensor_list, dim=last_dim).contiguous()
return output.sum(-1, keepdim=True)
def sharded_state_dict(self, prefix='', sharded_offsets=(), metadata={}):
# in InternVitSelfAttention the q_layernorm and k_layernorm weights
# are tensor-parallel so must be converted to sharded tensors
if 'q_layernorm' in prefix or 'k_layernorm' in prefix:
state_dict = self.state_dict(prefix='', keep_vars=True)
return make_sharded_tensors_for_checkpoint(
state_dict, prefix, {'weight': 0}, sharded_offsets
)
else:
return super().sharded_state_dict(prefix, sharded_offsets, metadata)
def get_mlp_module_spec(use_te: bool = True) -> ModuleSpec:
# Dense MLP w/ or w/o TE modules.
return ModuleSpec(
module=MLP,
submodules=MLPSubmodules(
linear_fc1=TEColumnParallelLinear if use_te else ColumnParallelLinear,
linear_fc2=TERowParallelLinear if use_te else RowParallelLinear,
),
)
# Handle InternViT's layer scaling.
def _bias_dropout_add_func_internvit(ls, x_with_bias, residual, prob, training):
x, bias = x_with_bias # unpack
residual = residual if residual.dtype == x.dtype else residual.to(x.dtype)
if bias is not None:
x = x + bias
out = torch.nn.functional.dropout(x, p=prob, training=training)
out = residual + out * ls
return out
else:
out = torch.nn.functional.dropout(x, p=prob, training=training)
out = residual + out * ls
return out
def bias_dropout_add_unfused_internvit(ls, training):
"""Bias-dropout-add as in Megatron but with added LayerScaling handling."""
def _bias_dropout_add(x_with_bias, residual, prob):
return _bias_dropout_add_func_internvit(ls, x_with_bias, residual, prob, training)
return _bias_dropout_add
def get_bias_dropout_add_internvit(ls, training, fused):
"""Bias-dropout-add as in Megatron but with added LayerScaling handling."""
assert not fused, "Fused bias-dropout-add not implemented for InternViT."
return bias_dropout_add_unfused_internvit(ls, training)
# Add InternViT specialties to our default TransformerLayer.
class InternViTTransformerLayer(TransformerLayer):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.ls1 = torch.nn.Parameter(torch.ones(self.config.hidden_size))
self.ls2 = torch.nn.Parameter(torch.ones(self.config.hidden_size))
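# Bind the learnable per-channel layer-scale vectors as the first argument of the
# bias-dropout-add functions; get_bias_dropout_add_internvit below receives them as `ls`
# and scales the attention/MLP branch output before it is added to the residual.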
self.self_attn_bda = partial(self.self_attn_bda, self.ls1)
self.mlp_bda = partial(self.mlp_bda, self.ls2)
# Override a few things that are special in InternViT and not supported by the SelfAttention class.
class InternViTSelfAttention(SelfAttention):
def __init__(
self, config: TransformerConfig, submodules: SelfAttentionSubmodules, *args, **kwargs
):
super().__init__(config=config, submodules=submodules, *args, **kwargs)
# Need to override linear_qkv, q_layernorm and k_layernorm.
qkv_bias = False
self.linear_qkv = build_module(
submodules.linear_qkv,
self.config.hidden_size,
self.query_projection_size + 2 * self.kv_projection_size,
config=self.config,
init_method=self.config.init_method,
gather_output=False,
bias=qkv_bias,
skip_bias_add=False,
is_expert=False,
tp_comm_buffer_name='qkv',
)
qk_layernorm_hidden_size = (
self.hidden_size_per_attention_head * self.num_attention_heads_per_partition
) # 512 for internvit
self.q_layernorm = build_module(
submodules.q_layernorm,
hidden_size=qk_layernorm_hidden_size,
config=self.config,
eps=self.config.layernorm_epsilon,
compute_var=True,
)
self.k_layernorm = build_module(
submodules.k_layernorm,
hidden_size=qk_layernorm_hidden_size,
config=self.config,
eps=self.config.layernorm_epsilon,
compute_var=True,
)
class InternViTTEDotProductAttention(TEDotProductAttention):
"""Adjusted Attention for InternViT"""
def forward(self, *args, **kwargs):
"""Regular TEDotProductAttention + zero-out dummy attention heads."""
out = super().forward(*args, **kwargs)
# This makes sure the dummy attention heads are zeroed out.
mask = torch.ones_like(out, dtype=out.dtype, device=out.device)
rank = get_tensor_model_parallel_rank()
max_dim = out.shape[-1] # 128
valid_ranks = 6
if rank == valid_ranks:
mask[..., max_dim:] *= 0.0
elif rank > valid_ranks:
mask *= 0.0
out *= mask
return out
def get_internvit_layer_spec(use_te) -> ModuleSpec:
mlp = get_mlp_module_spec(use_te) # no norm
return ModuleSpec(
module=InternViTTransformerLayer,
submodules=TransformerLayerSubmodules(
input_layernorm=InternViTRMSNorm,
self_attention=ModuleSpec(
module=InternViTSelfAttention,
params={"attn_mask_type": AttnMaskType.no_mask},
submodules=SelfAttentionSubmodules(
linear_qkv=TEColumnParallelLinear if use_te else ColumnParallelLinear,
core_attention=TEDotProductAttention if use_te else DotProductAttention,
linear_proj=TERowParallelLinear if use_te else RowParallelLinear,
q_layernorm=InternViTRMSNorm,
k_layernorm=InternViTRMSNorm,
),
),
self_attn_bda=get_bias_dropout_add_internvit,
pre_mlp_layernorm=InternViTRMSNorm,
mlp=mlp,
mlp_bda=get_bias_dropout_add_internvit,
),
)
{
"COMMENT": "Mixture of our own custom prompts and some prompts from https://huggingface.co/datasets/liuhaotian/LLaVA-Pretrain/viewer and https://huggingface.co/datasets/HuggingFaceM4/M3IT",
"Captioning": {
"raw": [
"Can you briefly explain what you see in the image?",
"Describe what's happening in this image in one short sentence.",
"Write a short caption that accurately represents the content of this image.",
"Please generate a descriptive caption for the image provided.",
"How would you summarize the scene depicted in the picture in short?",
"Describe the image briefly.",
"Write a succinct description of the image, capturing its main components, the relationships between them, and any notable details.",
"Create a concise caption that accurately describes the main elements in the image provided.",
"Write a brief, yet comprehensive, description of the image.",
"Describe the image in a clear and concise manner.",
"For the given image, provide a one-sentence summary that captures the most important details.",
"Generate a short caption for the picture.",
"Write a short and informative description that highlights the primary subjects and actions occurring in the given image.",
"Provide a concise and informative caption for the image, focusing on the primary subjects.",
"Write a clear description of the image, make sure the key features are well covered.",
"Offer a succinct explanation of the picture presented."
]
},
"CaptioningPretraining": {
"raw": [
"Give a brief description of image.",
"Give a brief description of the image.",
"Provide a brief description of the given image.",
"Provide a one-sentence caption for the provided image.",
"Write a terse but informative summary of the picture.",
"Describe the image concisely.",
"Generate a clear and concise summary of the photo."
]
},
"CaptioningSFT": {
"raw": [
"Give a brief description of the image.",
"Give a short and clear explanation of the subsequent image.",
"Present a compact description of the photo's key features.",
"Provide a brief description of the given image.",
"Provide a one-sentence caption for the provided image.",
"Render a clear and concise summary of the photo.",
"Share a concise interpretation of the image provided.",
"Summarize the visual content of the image.",
"Write a terse but informative summary of the picture.",
"Describe the image concisely."
]
},
"VQAPretraining": {
"raw": [
"Question: {} Short answer:",
"Question: {} Answer:"
]
},
"VQASFT": {
"raw": [
"{}",
"{}\nAnswer the question using a single word or phrase."
],
"docvqa": [
"{}",
"{}\nAnswer this question using the text in the image directly."
]
},
"DocPretraining": {
"raw": [
"Retrieve the text from the given pdf image.",
"Extract the text from the provided document.",
"Transcribe the text displayed in the image."
],
"ocr_multi": [
"Apply grounded Optical Character Recognition (OCR) to the provided image.",
"Extract all texts and their bounding boxes from the given image using grounded OCR.",
"Extract and transcribe all visible text from the provided image, ensuring accurate spatial recognition.",
"Conduct a detailed optical character recognition analysis on this image, maintaining the text's original layout and positioning.",
"Execute a thorough text recognition procedure on this visual input, ensuring that the spatial arrangement of the text is accurately represented.",
"Perform an in-depth OCR scan of the image, capturing both the content and contextual positioning of all textual information.",
"OCR with grounding:"
],
"md": [
"Extract the text from the given image and format it in Markdown.",
"Convert the text from the provided image into Markdown format.",
"Transform the text from the given image into Markdown syntax.",
"Extract and convert the text from the image to Markdown.",
"Retrieve the text from the image and present it in Markdown format."
],
"grounded_ocr": [
"{}. Text:",
"Recognize the text in this region: {}.",
"Identify the text in this area: {}.",
"Detect the text within this section: {}."
],
"referring_grounding": [
"Region of \"{}\" is:",
"Locate the text \"{}\" in the image.",
"Identify the text \"{}\" in the image and provide the coordinates."
]
},
"CaptioningDetailed": {
"raw": [
"Create a comprehensive paragraph that captures the essence of the image while weaving a cohesive narrative around its elements.",
"Compose a paragraph that thoroughly describes the image's content, providing context and connections between different aspects of the scene.",
"Provide a detailed, paragraph-length description of the image that paints a vivid picture and tells a coherent story.",
"Write a rich and engaging paragraph that delves into the image's components, describing not only what is seen but also how the elements relate to one another.",
"Give a well-rounded, paragraph-length explanation of the image, describing the scene and its components while forming a complete and engaging narrative.",
"Produce a paragraph that not only describes the individual elements in the image but also weaves them together to form a cohesive, connected account.",
"Construct a paragraph that captures the image's details and context, offering a more in-depth and engaging story than a simple caption.",
"Compose a descriptive paragraph that brings the image to life through detailed storytelling, connecting the various visual elements into a unified narrative.",
"Create a paragraph that provides an extensive and interconnected description of the image, ensuring that the narrative is both detailed and cohesive.",
"Write a compelling and detailed paragraph that delves into the image's components, linking them together to create a unified and engaging story."
]
},
"OCR": {
"raw": [
"Can you read the text from image and output here?",
"Extract and document the text from the provided image.",
"Converting the text embedded in this image into a readable document.",
"Transcribe all the text you find.",
"Can you extract all visible text from the image here?"
],
"markdown": [
"Can you extract all visible text from the provided image?",
"Converting the text embedded in this image into a readable markdown document.",
"Can you read the text in the document as markdown?",
"Transcribe the document as markdown.",
"Extract and document the text from the provided image."
],
"table_markdown": [
"Can you extract all visible text from the provided table?",
"Can you read the text in the provided table as markdown?",
"Transcribe the table as markdown.",
"Extract and document the text from the provided table image."
],
"plain": [
"Transcribe the document as plain text.",
"Extract and document the text from the provided image.",
"Converting the text embedded in this image into a readable document.",
"Transcribe all the text you find.",
"Can you extract all visible text from the image here?"
],
"bbox_plain": [
"Transcribe the document as plain text along with bounding boxes.",
"Extract and document the text from the provided image along with bounding boxes.",
"Converting the text embedded in this image into a readable documen along with bounding boxes.",
"Can you extract all visible text with bounding boxes from the image here?"
]
},
"VQA": {
"raw": [
"Given the image, answer the following question with few words.",
"Answer the following question: ",
"What is the answer to this question?",
"Write the answer: ",
"Please answer this question: "
]
},
"Embedded": {
"raw": [
"Given the image, answer the following question with few words.",
"Answer the following question: ",
"What is the answer to this question?",
"Write the answer: ",
"Please answer this question: "
]
}
}
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import argparse
import os
import sys
import torch
# Add megatron to the path.
sys.path.append(
os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir, os.path.pardir))
)
def split(input_dir, base_output_dir, input_pp, output_pp, num_tp, num_layers_per_pp_rank):
"""Split pipeline parallel size = 1 checkpoint to pipeline parallel size N."""
for tp in range(num_tp):
path = os.path.join(input_dir, f"mp_rank_0{tp}", "model_optim_rng.pt")
sd = torch.load(path)
if num_layers_per_pp_rank is None:
num_layers = sd["args"].num_layers
assert num_layers % output_pp == 0, "specify --num-layers-per-pp-rank for an uneven split"
num_layers_per_pp_rank = [num_layers // output_pp] * output_pp
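# Example: 80 decoder layers with output_pp = 4 gives [20, 20, 20, 20]; pp rank 0 keeps layers 0-19,
# pp rank 1 gets layers 20-39 renumbered to start from 0, and so on.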
layer_lb = 0
for pp in range(output_pp):
assert num_layers_per_pp_rank[pp] > 0, "each pp rank must have at least 1 layer"
layer_ub = layer_lb + num_layers_per_pp_rank[pp]
new_sd = sd.copy()
new_sd["model"] = dict()
for k, v in sd["model"].items():
# First pp rank has vision model.
if pp == 0 and ("vision_model" in k or "vision_projection" in k):
new_sd["model"][k] = v
continue
# Only the first pp rank has the word embeddings.
if "language_model.embedding.word_embeddings" in k and pp == 0:
new_sd["model"][k] = v
# Only the last pp rank has the output layer.
if "language_model.output_layer" in k and pp == output_pp - 1:
new_sd["model"][k] = v
# Only the last pp rank has final layer norm.
if "language_model.decoder.final_layernorm" in k and pp == output_pp - 1:
new_sd["model"][k] = v
if "language_model.decoder.layers" in k:
layer_num = int(k.split(".")[3])
if layer_lb <= layer_num and layer_num < layer_ub:
# On all pp ranks, megatron starts layer nums from 0!
new_layer_num = int(layer_num - layer_lb)
k_splitted = k.split(".")
k_splitted[3] = str(new_layer_num)
new_k = ".".join(k_splitted)
new_sd["model"][new_k] = v
output_dir = os.path.join(base_output_dir, f"iter_0000001/mp_rank_0{tp}_00{pp}")
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, "model_optim_rng.pt")
torch.save(new_sd, output_path)
print(f"processed tp rank: {tp}/{num_tp - 1} and pp rank: {pp}/{output_pp - 1}")
layer_lb = layer_ub
# This is needed for megatron checkpoint loading.
with open(os.path.join(base_output_dir, "latest_checkpointed_iteration.txt"), "w") as f:
f.write("1")
def combine(input_dir, base_output_dir, input_pp, output_pp, num_tp, num_layers_per_pp_rank):
"""Combine pipeline parallel size = N checkpoint to pipeline parallel size 1."""
for tp in range(num_tp):
new_sd = None
layer_num_offset = 0
max_layer_num = 0
for pp in range(input_pp):
path = os.path.join(input_dir, f"mp_rank_0{tp}_00{pp}", "model_optim_rng.pt")
sd = torch.load(path)
if pp == 0:
new_sd = sd.copy()
new_sd["model"] = dict()
new_sd["args"].pipeline_model_parallel_size = 1
assert new_sd is not None
for k, v in sd["model"].items():
# First pp rank has vision model.
if pp == 0 and ("vision_model" in k or "vision_projection" in k):
new_sd["model"][k] = v
continue
# Only the first pp rank has the word embeddings.
if "language_model.embedding.word_embeddings" in k and pp == 0:
new_sd["model"][k] = v
# Only the last pp rank has the output layer.
if "language_model.output_layer" in k and pp == input_pp - 1:
new_sd["model"][k] = v
# Only the last pp rank has final layer norm.
if "language_model.decoder.final_layernorm" in k and pp == input_pp - 1:
new_sd["model"][k] = v
if "language_model.decoder.layers" in k:
layer_num = int(k.split(".")[3])
# On all pp ranks, megatron starts layer nums from 0!
new_layer_num = layer_num_offset + layer_num
if new_layer_num > max_layer_num:
max_layer_num = new_layer_num
k_splitted = k.split(".")
k_splitted[3] = str(new_layer_num)
new_k = ".".join(k_splitted)
new_sd["model"][new_k] = v
print(f"processed tp rank: {tp}/{num_tp - 1} and pp rank: {pp}/{input_pp - 1}")
layer_num_offset = max_layer_num + 1
output_dir = os.path.join(base_output_dir, f"iter_0000001/mp_rank_0{tp}")
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, "model_optim_rng.pt")
torch.save(new_sd, output_path)
# This is needed for megatron checkpoint loading.
with open(os.path.join(base_output_dir, "latest_checkpointed_iteration.txt"), "w") as f:
f.write("1")
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Change pipeline parallelism for a model",
formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.add_argument(
"--input", type=str, required=True, help="Input model directory"
)
parser.add_argument(
"--input-pipeline-parallel", type=int, required=True, help="Input model pipeline parallelism"
)
parser.add_argument(
"--output", type=str, required=True, help="Output model directory"
)
parser.add_argument(
"--output-pipeline-parallel", type=int, required=True, help="Output model pipeline parallelism"
)
parser.add_argument(
"--tensor-parallel", type=int, required=True, help="Model tensor parallel size",
)
parser.add_argument(
"--num-layers-per-pp-rank", type=int, default=None, nargs="*", help="Specify this for uneven pipeline parallel split",
)
args = parser.parse_args()
f = None
if args.input_pipeline_parallel == 1 and args.output_pipeline_parallel > 1:
f = split
elif args.input_pipeline_parallel > 1 and args.output_pipeline_parallel == 1:
f = combine
else:
raise NotImplementedError("Only pipeline parallel 1 to N and N to 1 are supported")
f(args.input, args.output, args.input_pipeline_parallel, args.output_pipeline_parallel, args.tensor_parallel, args.num_layers_per_pp_rank)
print("done.")
__module__: megatron.energon
__class__: Metadataset
splits:
train:
datasets:
- weight: 0.579 # Datasets are weighted according to their size. Weights sum up to 1.
path: <path to laion dataset>
subflavors:
augmentation: False
- weight: 0.02
path: <path to coco>
subflavors:
augmentation: False
- weight: 0.01
path: <path to vqav2 dataset>
subflavors:
augmentation: False
# Please refer to Table 4 in https://arxiv.org/pdf/2409.11402 for full list of pretrain datasets.
# Please refer to https://nvidia.github.io/Megatron-Energon/data_prep.html on preparing datasets in the Megatron Energon format.
val:
datasets:
- weight: 1.
path: <path to validation dataset>
subflavors:
augmentation: False
#!/bin/bash
# Your SBATCH commands here if using SLURM.
# Please launch this script from megatron-lm root.
# Train a multimodal model.
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export TOKENIZERS_PARALLELISM="false"
DEBUG=0
if [[ $BATCH -eq 0 ]]; then
DATETIME=`date +'%y-%m-%d-%H-%M-%S'`
MODEL_NAME="mcore-qwen20-72b-internvit-${DATETIME}"
else
MODEL_NAME="mcore-qwen20-72b-internvit"
fi
WORKSPACE="<some dir>"
SOURCE=`pwd`
OUTPUT_BASE="${WORKSPACE}/output"
OUTPUT="${OUTPUT_BASE}/${MODEL_NAME}"
FINETUNE_DIR=${OUTPUT}/checkpoints
LOGS_DIR="${OUTPUT}/logs"
TENSORBOARD_DIR="${OUTPUT}/tensorboard"
CHECKPOINT_DIR="${WORKSPACE}/combined-qwen2.0-72b-instruct-internvit-6b-448px-1.5-tp8-te"
DATA_TRAIN="${SOURCE}/examples/multimodal/nvlm/pretrain_blend.yaml"
if [[ $DEBUG -eq 1 ]]; then
MBZ=1
BZ=1
NW=0
AD=0.0
HD=0.0
LI=1
EXTRA_ARGS=""
ALLOW_NONDETERMINISTIC=1
else
MBZ=1
BZ=2048
NW=8
AD=0.1
HD=0.1
LI=5
EXTRA_ARGS=""
ALLOW_NONDETERMINISTIC=1
fi
SEQ_LEN=256 # Image embeddings sequence length.
DECODER_SEQ_LEN=512 # Language model sequence length.
MAX_POS_EMBED=512
OPTIONS=" \
--use-checkpoint-args \
--exit-duration-in-mins 230 \
--disable-bias-linear \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model Qwen/Qwen2-72B-Instruct \
--tokenizer-prompt-format qwen2p0 \
--transformer-impl transformer_engine \
--normalization RMSNorm \
--norm-epsilon 1e-06 \
--group-query-attention \
--num-query-groups 8 \
--no-masked-softmax-fusion \
--attention-softmax-in-fp32 \
--attention-dropout ${AD} \
--hidden-dropout ${HD} \
--untie-embeddings-and-output-weights \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--swiglu \
--tensor-model-parallel-size 8 \
--pipeline-model-parallel-size 1 \
--num-layers 80 \
--hidden-size 8192 \
--ffn-hidden-size 29568 \
--add-qkv-bias \
--num-attention-heads 64 \
--use-distributed-optimizer \
--use-te \
--num-workers ${NW} \
--seq-length ${SEQ_LEN} \
--decoder-seq-length ${DECODER_SEQ_LEN} \
--max-position-embeddings 32768 \
--train-samples 122880000 \
--lr-decay-samples 25600000 \
--lr-warmup-samples 83200 \
--micro-batch-size ${MBZ} \
--global-batch-size ${BZ} \
--lr 1e-4 \
--min-lr 2.5e-5 \
--lr-decay-style cosine \
--log-interval ${LI} \
--eval-iters 10 \
--eval-interval 500 \
--data-path ${DATA_TRAIN} \
--prompt-path ${SOURCE}/examples/multimodal/nvlm/nvlm_prompts.json \
--save-interval 5000 \
--save ${FINETUNE_DIR} \
--load ${FINETUNE_DIR} \
--dataloader-save ${FINETUNE_DIR}/dataloader \
--pretrained-checkpoint ${CHECKPOINT_DIR} \
--split 100,0,0 \
--clip-grad 10.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.014 \
--bf16 \
--eod-mask-loss \
--freeze-ViT \
--freeze-LM \
--patch-dim 14 \
--img-h 448 \
--img-w 448 \
--dataloader-type external \
--tensorboard-dir ${TENSORBOARD_DIR} \
--language-model-type qwen2.0_72B \
${EXTRA_ARGS} \
--allow-missing-vision-projection-checkpoint \
--vision-model-type internvit \
--disable-vision-class-token \
--log-params-norm \
--log-num-zeros-in-grad \
--ckpt-format torch \
--pixel-shuffle \
--image-tag-type nvlm
"
export NVTE_APPLY_QK_LAYER_SCALING=0
export NVTE_ALLOW_NONDETERMINISTIC_ALGO=${ALLOW_NONDETERMINISTIC}
# Interactive or batch mode
if [[ $BATCH -eq 0 ]]; then
torchrun --nproc_per_node 8 examples/multimodal/train.py ${OPTIONS}
else
run_cmd="python -u ${SOURCE}/examples/multimodal/train.py ${OPTIONS}"
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
srun -l --verbose \
--container-image <path to docker image> \
--container-mounts "<some mount>" \
--output=${LOGS_DIR}/%x_%j_$DATETIME.log \
sh -c "${run_cmd}"
set +x
fi
#!/bin/bash
# Your SBATCH commands here if using SLURM.
# Please launch this script from megatron-lm root.
# Train a multimodal model.
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export TOKENIZERS_PARALLELISM="false"
DEBUG=0
if [[ $BATCH -eq 0 ]]; then
DATETIME=`date +'%y-%m-%d-%H-%M-%S'`
MODEL_NAME="mcore-nous-yi34b-internvit-mlp-${DATETIME}"
else
MODEL_NAME="mcore-nous-yi34b-internvit-mlp"
fi
WORKSPACE="<some dir>"
SOURCE=`pwd`
OUTPUT_BASE="${WORKSPACE}/output"
OUTPUT="${OUTPUT_BASE}/${MODEL_NAME}"
FINETUNE_DIR=${OUTPUT}/checkpoints
LOGS_DIR="${OUTPUT}/logs"
TENSORBOARD_DIR="${OUTPUT}/tensorboard"
LOAD_NAME="combined-yi-34b-internvit-tp8-mcore"
CHECKPOINT_DIR="${WORKSPACE}/${LOAD_NAME}"
DATA_TRAIN="${SOURCE}/examples/multimodal/nvlm/pretrain_blend.yaml"
if [[ $DEBUG -eq 1 ]]; then
MBZ=1
BZ=1
NW=0
LI=1
AD=0.0
HD=0.0
EXTRA_ARGS=""
ALLOW_NONDETERMINISTIC=1
else
MBZ=1
BZ=2048
NW=8
LI=5
AD=0.1
HD=0.1
EXTRA_ARGS=""
ALLOW_NONDETERMINISTIC=1
fi
SEQ_LEN=256 # Image embeddings sequence length.
DECODER_SEQ_LEN=512 # Language model sequence length.
MAX_POS_EMBED=512
OPTIONS=" \
--swiglu \
--use-distributed-optimizer \
--num-workers ${NW} \
--num-layers 60 \
--hidden-size 7168 \
--normalization RMSNorm \
--num-attention-heads 56 \
--exit-duration-in-mins 230 \
--group-query-attention \
--num-query-groups 8 \
--ffn-hidden-size 20480 \
--seq-length ${SEQ_LEN} \
--decoder-seq-length ${DECODER_SEQ_LEN} \
--max-position-embeddings ${MAX_POS_EMBED} \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model NousResearch/Nous-Hermes-2-Yi-34B \
--tokenizer-prompt-format nvlm-yi-34b \
--vocab-size 64000 \
--make-vocab-size-divisible-by 1 \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 5000000 \
--disable-bias-linear \
--tensor-model-parallel-size 8 \
--language-model-type yi-34b \
--vision-model-type internvit \
--micro-batch-size ${MBZ} \
--global-batch-size ${BZ} \
--train-samples 122880000 \
--lr-decay-samples 25600000 \
--lr-warmup-samples 83200 \
--lr 1e-4 \
--min-lr 2.5e-5 \
--lr-decay-style cosine \
--clip-grad 10.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.014 \
--attention-dropout ${AD} \
--hidden-dropout ${HD} \
--eod-mask-loss \
--bf16 \
--tensorboard-dir=${TENSORBOARD_DIR} \
--freeze-LM \
--freeze-ViT \
--img-h 448 \
--img-w 448 \
--patch-dim 14 \
--data-path ${DATA_TRAIN} \
--dataloader-type external \
--split 100,0,0 \
--prompt-path ${SOURCE}/examples/multimodal/nvlm/nvlm_prompts.json \
--log-interval ${LI} \
--save-interval 2000 \
--eval-interval 500 \
--eval-iters 10 \
--log-params-norm \
--log-num-zeros-in-grad \
${EXTRA_ARGS} \
--save ${FINETUNE_DIR} \
--load ${FINETUNE_DIR} \
--dataloader-save ${FINETUNE_DIR}/dataloader \
--pretrained-checkpoint ${CHECKPOINT_DIR} \
--allow-missing-vision-projection-checkpoint \
--disable-vision-class-token \
--use-te \
--use-checkpoint-args \
--ckpt-format torch \
--pixel-shuffle \
--image-tag-type nvlm
"
export NVTE_ALLOW_NONDETERMINISTIC_ALGO=${ALLOW_NONDETERMINISTIC}
export NVTE_APPLY_QK_LAYER_SCALING=0
# Interactive or batch mode
if [[ $BATCH -eq 0 ]]; then
torchrun --nproc_per_node 8 examples/multimodal/train.py ${OPTIONS}
else
run_cmd="python -u ${SOURCE}/examples/multimodal/train.py ${OPTIONS}"
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
srun -l --verbose \
--container-image <path to docker image> \
--container-mounts "<some mount>" \
--output=${LOGS_DIR}/%x_%j_$DATETIME.log \
sh -c "${run_cmd}"
set +x
fi