Commit 0d99ae1f authored by silencealiang

add

parent c271aaae
@@ -9,15 +9,22 @@ def merge_input_files(input_path):
     """Merge input files to a format compatible with the evaluator."""
     input_file_paths, output_file_path = get_input_output_paths(input_path, task="VQAv2")
-    results = []
+    results = dict()
     for input_file_path in input_file_paths:
         with open(input_file_path, "r") as input_file:
             for line in input_file:
                 res = json.loads(line)
-                res["question_id"] = res["sample_id"]
+                sample_id = res["sample_id"]
-                results.append(res)
+                # Skip possible duplicates.
+                if sample_id in results:
+                    continue
+                res["question_id"] = sample_id
+                results[sample_id] = res
+    results = list(results.values())
     with open(output_file_path, "w") as output_file:
         json.dump(results, output_file)
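For illustration, a minimal standalone sketch of the deduplicating merge introduced above. The records are made up; only `sample_id` and `question_id` come from the diff, the `answer` field is a placeholder.

```python
import json

# Hypothetical per-rank result lines; sample_id 1 appears twice (a duplicate).
lines = [
    '{"sample_id": 1, "answer": "cat"}',
    '{"sample_id": 2, "answer": "dog"}',
    '{"sample_id": 1, "answer": "cat"}',
]

results = dict()
for line in lines:
    res = json.loads(line)
    sample_id = res["sample_id"]
    # Skip possible duplicates, keeping the first occurrence.
    if sample_id in results:
        continue
    res["question_id"] = sample_id
    results[sample_id] = res

merged = list(results.values())
print(len(merged))  # 2 -> the duplicate sample_id was dropped
```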
@@ -57,6 +64,9 @@ def compute_vqa_accuracy(result_file, task):
             assert len(gt) == 1, "expected exactly one groundtruth answer."
             gt = gt[0]
+            pred = pred.rstrip("%")
+            gt = gt.rstrip("%")
             if is_number(pred) and is_number(gt):
                 pred = float(pred)
                 gt = float(gt)
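A small sketch (not part of the commit) of why the added `rstrip("%")` calls matter: a predicted `"50%"` and a groundtruth `"50"` now normalize to the same float before the numeric comparison. The `is_number` helper below is a hypothetical stand-in for the one used by the evaluator.

```python
def is_number(s: str) -> bool:
    """Hypothetical stand-in for the evaluator's helper."""
    try:
        float(s)
        return True
    except ValueError:
        return False

pred, gt = "50%", "50"
pred = pred.rstrip("%")
gt = gt.rstrip("%")

if is_number(pred) and is_number(gt):
    pred, gt = float(pred), float(gt)
    # With the "%" stripped, "50%" vs "50" compares as 50.0 vs 50.0.
    print(pred == gt)  # True
```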
......
@@ -188,7 +188,7 @@ class MMMUDataset(torch.utils.data.Dataset):
         use_tiling,
         max_num_tiles,
         use_thumbnail,
-        single_image,
+        prompt_style,
         vision_model_type,
     ):
         import datasets
@@ -246,7 +246,7 @@ class MMMUDataset(torch.utils.data.Dataset):
         self._use_tiling = use_tiling
         self._max_num_tiles = max_num_tiles
         self._use_thumbnail = use_thumbnail
-        self._single_image = single_image
+        self._prompt_style = prompt_style
         self._vision_model_type = vision_model_type
     def __len__(self):
@@ -258,7 +258,7 @@ class MMMUDataset(torch.utils.data.Dataset):
         sample = self._dataset[idx]
         # Use the single image approach from the MMMU repo.
-        if self._single_image:
+        if self._prompt_style == "single_image":
             sample = process_single_sample(sample)
             sample = construct_prompt(sample, self._config)
@@ -274,7 +274,69 @@ class MMMUDataset(torch.utils.data.Dataset):
                 vision_model_type=self._vision_model_type,
             )
             sample_num_tiles = [len(sample_imgs)]
-        else:
+            prompt = sample["final_input_prompt"]
+            for i in range(8):
+                prompt = prompt.replace(f"<image {i}>", "")
+            sample["final_input_prompt"] = f"<image>\n{prompt}"
+        elif self._prompt_style == "vlmevalkit":
+            sample = construct_prompt(sample, self._config)
+            if sample["question_type"] == "multiple-choice":
+                question = sample["question"]
+                options = ""
+                for k, v in sample["index2ans"].items():
+                    options += f"{k}. {v}\n"
+                final_prompt = f"{question}\n"
+                if "hint" in sample:
+                    final_prompt += f"Hint: {sample['hint']}\n"
+                if "task_instructions" in sample:
+                    final_prompt += f"Task instructions: {sample['task_instructions']}\n"
+                final_prompt += options
+                final_prompt += "Answer with the option's letter from the given choices directly."
+                sample["final_input_prompt"] = final_prompt.rstrip()
+            else:
+                question = sample["question"]
+                final_prompt = f"{question}\n"
+                final_prompt += "Answer the question directly."
+                sample["final_input_prompt"] = final_prompt.rstrip()
+            sample_imgs = []
+            sample_num_tiles = []
+            img_indices = sorted(list(set(re.findall(r"<image (\d+)", sample["final_input_prompt"]))))
+            # If there are multiple input images, we need to avoid the number of image embeddings getting too large.
+            adjusted_max_num_tiles = max(1, self._max_num_tiles // len(img_indices))
+            adjusted_max_num_tiles = min(adjusted_max_num_tiles, self._max_num_tiles)
+            for img_idx in img_indices:
+                img_key = f"image_{img_idx}"
+                img_str = f"<image {img_idx}>"
+                img = sample[img_key]
+                assert img is not None, f"{img_str} is in prompt but not in sample images"
+                imgs = get_visual_transform(
+                    img,
+                    self._img_h,
+                    self._img_w,
+                    self._use_tiling,
+                    adjusted_max_num_tiles,
+                    self._use_thumbnail,
+                    augment=False,
+                    vision_model_type=self._vision_model_type,
+                ) # List of tiles.
+                sample_imgs.extend(imgs)
+                sample_num_tiles.append(len(imgs))
+            sample["final_input_prompt"] = " ".join([f'<image {i + 1}><image>' for i in range(len(img_indices))]) + "\n" + sample["final_input_prompt"]
+        elif self._prompt_style == "multi_image":
             sample = construct_prompt(sample, self._config)
             sample_imgs = []
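A worked example (not from the commit) of the tile-budget adjustment in the `vlmevalkit` branch above: with `--max-num-tiles 12` and a prompt that references three images, each image is capped at 4 tiles so the total number of image embeddings stays roughly constant.

```python
max_num_tiles = 12             # e.g. the value passed via --max-num-tiles
img_indices = ["1", "2", "3"]  # image tags found in the prompt

adjusted_max_num_tiles = max(1, max_num_tiles // len(img_indices))
adjusted_max_num_tiles = min(adjusted_max_num_tiles, max_num_tiles)
print(adjusted_max_num_tiles)  # 4 -> at most 4 tiles per image, ~12 tiles total
```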
@@ -315,6 +377,8 @@ class MMMUDataset(torch.utils.data.Dataset):
                 assert (
                     f"<image {i}>" not in sample["final_input_prompt"]
                 ), "prompt contains unhandled image tags"
+        else:
+            raise ValueError(f"unknown prompt style {self._prompt_style}")
         # MMMU specific metadata.
         metadata = {"question_type": sample["question_type"]}
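To make the new `vlmevalkit` prompt style concrete, here is a toy multiple-choice sample (invented for illustration) pushed through the same string-building steps as the branch added above:

```python
# Toy sample; field names mirror the MMMU sample dict used in the diff.
sample = {
    "question_type": "multiple-choice",
    "question": "Which animal is shown in the picture?",
    "index2ans": {"A": "Cat", "B": "Dog"},
    "hint": "Look at the ears.",
}

question = sample["question"]
options = ""
for k, v in sample["index2ans"].items():
    options += f"{k}. {v}\n"

final_prompt = f"{question}\n"
if "hint" in sample:
    final_prompt += f"Hint: {sample['hint']}\n"
if "task_instructions" in sample:
    final_prompt += f"Task instructions: {sample['task_instructions']}\n"
final_prompt += options
final_prompt += "Answer with the option's letter from the given choices directly."

print(final_prompt.rstrip())
# Which animal is shown in the picture?
# Hint: Look at the ears.
# A. Cat
# B. Dog
# Answer with the option's letter from the given choices directly.
```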
@@ -323,10 +387,6 @@ class MMMUDataset(torch.utils.data.Dataset):
             metadata["all_choices"] = sample["all_choices"]
         prompt = sample['final_input_prompt']
-        if self._single_image:
-            for i in range(8):
-                prompt = prompt.replace(f"<image {i}>", "")
-            prompt = f"<image>\n{prompt}"
         tile_count = torch.tensor(sample_num_tiles, dtype=torch.int)
@@ -780,8 +840,10 @@ def get_evaluation_dataset(
             vision_model_type,
         )
     elif task == 'MMMU':
-        # Note: single_image=True uses only one image like in the MMMU repo example.
-        # single_image=False uses all images in the sample.
+        # Note:
+        # - prompt_style="single_image" uses only one image like in the MMMU repo example.
+        # - prompt_style="multi_image" uses multiple input images.
+        # - prompt_style="vlmevalkit" is similar to https://github.com/open-compass/VLMEvalKit/blob/5d3cebcf18ef4bfbadc3bd3ef80bdc7aad2c6557/vlmeval/vlm/internvl_chat.py#L499
         dataset = MMMUDataset(
             input_image_path,
             num_samples_per_partition,
@@ -792,7 +854,7 @@ def get_evaluation_dataset(
             use_tiling,
             max_num_tiles,
             use_thumbnail,
-            single_image=True,
+            prompt_style="single_image",
             vision_model_type=vision_model_type,
         )
     elif task == "VideoMME":
......
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
@@ -136,6 +136,20 @@ def model_provider(
     else:
         vision_projection_layer_spec = get_mlp_module_spec(use_te=use_te).submodules
+    # Toggle --recompute* for the vision and language model separately.
+    if args.recompute_vision:
+        if vision_config.recompute_method is not None and vision_config.recompute_granularity is not None:
+            vision_config.recompute_num_layers = vision_config.num_layers
+    else:
+        vision_config.recompute_granularity = None
+        vision_config.recompute_method = None
+        vision_config.recompute_num_layers = None
+    vision_projection_config.recompute_granularity = None
+    vision_projection_config.recompute_method = None
+    vision_projection_config.recompute_num_layers = None
     tokenizer = get_tokenizer()
     image_token_index = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)
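A minimal sketch (not from the commit) of the toggle above, using hypothetical config objects, to spell out the behavior: with `--recompute-vision` and a recompute method/granularity configured, every vision layer is checkpointed; without the flag, recompute is disabled for the vision model. The reconstructed indentation above is read as clearing the vision projection settings unconditionally, which this sketch assumes.

```python
from types import SimpleNamespace

def toggle_vision_recompute(recompute_vision, vision_config, vision_projection_config):
    """Hypothetical helper mirroring the branch added in model_provider."""
    if recompute_vision:
        if vision_config.recompute_method is not None and vision_config.recompute_granularity is not None:
            vision_config.recompute_num_layers = vision_config.num_layers
    else:
        vision_config.recompute_granularity = None
        vision_config.recompute_method = None
        vision_config.recompute_num_layers = None

    # Assumed: the vision projection never uses activation recompute.
    vision_projection_config.recompute_granularity = None
    vision_projection_config.recompute_method = None
    vision_projection_config.recompute_num_layers = None

vision_config = SimpleNamespace(
    num_layers=24, recompute_method="uniform", recompute_granularity="full", recompute_num_layers=1
)
vision_projection_config = SimpleNamespace(
    recompute_method="uniform", recompute_granularity="full", recompute_num_layers=1
)
toggle_vision_recompute(True, vision_config, vision_projection_config)
print(vision_config.recompute_num_layers)  # 24 -> checkpoint every vision layer
```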
......
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
@@ -49,7 +49,7 @@ def add_multimodal_extra_args(parser):
     group.add_argument(
         "--tokenizer-prompt-format",
         type=str,
-        choices=["mistral", "llama3", "chatml", "nvlm-yi-34b", "qwen2p0"],
+        choices=["mistral", "llama3", "chatml", "nvlm-yi-34b", "qwen2p0", "qwen2p5"],
         required=True,
         help="Prompt format to use with the tokenizer.",
     )
@@ -71,5 +71,9 @@ def add_multimodal_extra_args(parser):
     group.add_argument(
         "--packing-seq-length", type=int, default=0, help="Packing sequence length. Must be > 0 if using packing."
     )
+    group.add_argument(
+        "--recompute-vision", action="store_true", default=False, help="Enable activation checkpointing in the vision model"
+    )
     return parser
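A quick parse of the two additions above (the `qwen2p5` prompt format and the `--recompute-vision` switch), shown with a bare `argparse` parser rather than Megatron's argument groups; the standalone parser here is only for illustration.

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--tokenizer-prompt-format",
    type=str,
    choices=["mistral", "llama3", "chatml", "nvlm-yi-34b", "qwen2p0", "qwen2p5"],
    required=True,
)
parser.add_argument("--recompute-vision", action="store_true", default=False)

args = parser.parse_args(["--tokenizer-prompt-format", "qwen2p5", "--recompute-vision"])
print(args.tokenizer_prompt_format, args.recompute_vision)  # qwen2p5 True
```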
@@ -5,6 +5,13 @@ Please refer to the [NVLM paper](https://arxiv.org/pdf/2409.11402) for details.
 *NOTE: VLMs in Megatron are under active development and are expected to change.*
+# Checkpoints
+NVLM 1.0 model weights are publicly available in HuggingFace and Megatron format.
+- NVLM-1.0-D 72B [HuggingFace version](https://huggingface.co/nvidia/NVLM-D-72B)
+- NVLM-1.0-D 72B [Megatron-Core version](https://huggingface.co/nvidia/NVLM-D-72B-mcore)
 # Setup
 ## Docker image
@@ -32,7 +39,7 @@ NVLM 1.0 34B starts from [NousResearch/Nous-Hermes-2-Yi-34B](https://huggingface
 Please download it and run the following command to convert it to Megatron format.
 ```
 python tools/checkpoint/convert.py --bf16 --model-type GPT --loader llama_mistral --saver mcore --target-tensor-parallel-size 8 --checkpoint-type hf \
-    --load-dir <hf model directory> --save-dir <output dir> --tokenizer-model <hf model directory> \
+    --load-dir <hf model directory> --save-dir <output dir> --tokenizer-model <hf model name/directory> \
     --saver-transformer-impl transformer_engine --model-size yi-34B --make-vocab-size-divisible-by 1
 ```
@@ -42,7 +49,7 @@ NVLM 1.0 72B starts from [Qwen/Qwen2-72B-Instruct](https://huggingface.co/Qwen/Q
 Please download it and run the following command to convert it to Megatron format.
 ```
 python tools/checkpoint/convert.py --bf16 --model-type GPT --loader llama_mistral --saver mcore --target-tensor-parallel-size 8 --checkpoint-type hf \
-    --load-dir <hf model directory> --save-dir <output directory> --tokenizer-model <hf model directory> \
+    --load-dir <hf model directory> --save-dir <output directory> --tokenizer-model <hf model name/directory> \
     --saver-transformer-impl transformer_engine --model-size qwen2.5-72Bf
 ```
......
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
@@ -62,7 +62,7 @@ OPTIONS=" \
     --exit-duration-in-mins 230 \
     --disable-bias-linear \
     --tokenizer-type MultimodalTokenizer \
-    --tokenizer-model ${WORKSPACE}/<path to tokenizer model> \
+    --tokenizer-model Qwen/Qwen2-72B-Instruct \
     --tokenizer-prompt-format qwen2p0 \
     --transformer-impl transformer_engine \
     --normalization RMSNorm \
......
@@ -75,7 +75,7 @@ OPTIONS=" \
     --decoder-seq-length ${DECODER_SEQ_LEN} \
     --max-position-embeddings ${MAX_POS_EMBED} \
     --tokenizer-type MultimodalTokenizer \
-    --tokenizer-model ${WORKSPACE}/<path to tokenizer> \
+    --tokenizer-model NousResearch/Nous-Hermes-2-Yi-34B \
     --tokenizer-prompt-format nvlm-yi-34b \
     --vocab-size 64000 \
     --make-vocab-size-divisible-by 1 \
......
@@ -97,7 +97,7 @@ do
         --decoder-seq-length ${DECODER_SEQ_LEN} \
         --max-position-embeddings ${MAX_POS_EMBED} \
         --tokenizer-type MultimodalTokenizer \
-        --tokenizer-model <tokenizer model path> \
+        --tokenizer-model Qwen/Qwen2-72B-Instruct \
        --tokenizer-prompt-format qwen2p0 \
         --position-embedding-type rope \
         --rotary-percent 1.0 \
......
#!/bin/bash
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NVTE_APPLY_QK_LAYER_SCALING=0
export TOKENIZERS_PARALLELISM="false"
INPUT_IMAGE_PATH="placeholder"
GROUNDTRUTH_PATH="placeholder"
while [[ $# -gt 0 ]]; do
    case $1 in
        -i|--input-image-path)
            INPUT_IMAGE_PATH="$2"
            shift
            shift
            ;;
        -o|--output-path)
            OUTPUT_PATH="$2"
            shift
            shift
            ;;
        -m|--model-path)
            MODEL_PATH="$2"
            shift
            shift
            ;;
        -t|--task)
            TASK="$2"
            shift
            shift
            ;;
        -g|--gt-path)
            GROUNDTRUTH_PATH="$2"
            shift
            shift
            ;;
        -*|--*)
            echo "Invalid option $1"
            exit 1
            ;;
    esac
done
# Please modify these as needed.
NUM_PARTITIONS=0
START=0
END=0
SEQ_LEN=256
DECODER_SEQ_LEN=8192
EXTRA_ARGS=" --pixel-shuffle --use-tiling --max-num-tiles 12 --use-thumbnail"
for PARTITION_ID in $( eval echo {$START..$END} )
do
torchrun --nproc_per_node 8 examples/multimodal/run_text_generation.py \
--attention-softmax-in-fp32 \
--transformer-impl transformer_engine \
--use-te \
--use-checkpoint-args \
--normalization RMSNorm \
--norm-epsilon 1e-06 \
--language-model-type=qwen2.5_7B \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--swiglu \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--tensor-model-parallel-size 4 \
--pipeline-model-parallel-size 1 \
--group-query-attention \
--num-query-groups 4 \
--num-layers 28 \
--hidden-size 3584 \
--ffn-hidden-size 18944 \
--add-qkv-bias \
--num-attention-heads 28 \
--max-position-embeddings 32768 \
--no-masked-softmax-fusion \
--load ${MODEL_PATH} \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model Qwen/Qwen2.5-7B-Instruct \
--tokenizer-prompt-format qwen2p5 \
--bf16 \
--micro-batch-size 1 \
--seq-length ${SEQ_LEN} \
--decoder-seq-length ${DECODER_SEQ_LEN} \
--out-seq-length 128 \
--temperature 1.0 \
--img-h 448 \
--img-w 448 \
--patch-dim 14 \
--seed 153 \
--top_k 1 \
--no-load-rng \
--no-load-optim \
--input-image-path ${INPUT_IMAGE_PATH} \
--num-partitions ${NUM_PARTITIONS} \
--partition-id ${PARTITION_ID} \
--output-path ${OUTPUT_PATH} \
--gt-path ${GROUNDTRUTH_PATH} \
--task ${TASK} \
${EXTRA_ARGS} \
--special-tokens "<image>" "<img>" "</img>" \
--vision-model-type siglip \
--ckpt-format torch
done