Merge tag 'v0.7.2' into v0.7.2-dev

66b809cc · zhuwenwen · 37b63c24 · 0408efc6 · 66b809cc · 66b809cc
Commit 66b809cc authored Feb 08, 2025 by zhuwenwen
20 changed files
--- a/tests/models/decoder_only/vision_language/vlm_utils/types.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/types.py
+# SPDX-License-Identifier: Apache-2.0
 """Types for writing multimodal model tests."""
 from enum import Enum
 from pathlib import PosixPath

--- a/tests/models/embedding/language/test_cls_models.py
+++ b/tests/models/embedding/language/test_cls_models.py
+# SPDX-License-Identifier: Apache-2.0
 """Compare the classification outputs of HF and vLLM models.
 Run `pytest tests/models/embedding/language/test_cls_models.py`.

--- a/tests/models/embedding/language/test_embedding.py
+++ b/tests/models/embedding/language/test_embedding.py
+# SPDX-License-Identifier: Apache-2.0
 """Compare the embedding outputs of HF and vLLM models.
 Run `pytest tests/models/embedding/language/test_embedding.py`.

--- a/tests/models/embedding/language/test_gritlm.py
+++ b/tests/models/embedding/language/test_gritlm.py
+# SPDX-License-Identifier: Apache-2.0
 import importlib.util
 import math
 from array import array

--- a/tests/models/embedding/language/test_scoring.py
+++ b/tests/models/embedding/language/test_scoring.py
+# SPDX-License-Identifier: Apache-2.0
 """Compare the scoring outputs of HF and vLLM models.
 Run `pytest tests/models/embedding/language/test_scoring.py`.

--- a/tests/models/embedding/utils.py
+++ b/tests/models/embedding/utils.py
+# SPDX-License-Identifier: Apache-2.0
 from typing import List, Sequence
 import torch

--- a/tests/models/embedding/vision_language/test_dse_qwen2_vl.py
+++ b/tests/models/embedding/vision_language/test_dse_qwen2_vl.py
+# SPDX-License-Identifier: Apache-2.0
 from functools import partial
 from typing import Callable, Dict, List, Type

--- a/tests/models/embedding/vision_language/test_llava_next.py
+++ b/tests/models/embedding/vision_language/test_llava_next.py
+# SPDX-License-Identifier: Apache-2.0
 from typing import List, Type
 import os
 import pytest
 import torch.nn.functional as F
-import transformers
 from transformers import AutoModelForVision2Seq
 from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
@@ -56,6 +57,10 @@ def _run_test(
    with hf_runner(model, dtype=dtype,
                   auto_cls=AutoModelForVision2Seq) as hf_model:
+        # Patch the issue where generation_config.json is missing
+        hf_model.processor.patch_size = \
+            hf_model.model.config.vision_config.patch_size
        # Patch the issue where image_token_id
        # exceeds the maximum allowed vocab size
        hf_model.model.resize_token_embeddings(
@@ -87,8 +92,6 @@ def _run_test(
    )
-@pytest.mark.skipif(transformers.__version__ >= "4.46",
-                    reason="Model broken with changes in transformers 4.46")
 @pytest.mark.core_model
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half"])

--- a/tests/models/embedding/vision_language/test_phi3v.py
+++ b/tests/models/embedding/vision_language/test_phi3v.py
+# SPDX-License-Identifier: Apache-2.0
 from typing import List, Type
 import os

--- a/tests/models/encoder_decoder/audio_language/test_whisper.py
+++ b/tests/models/encoder_decoder/audio_language/test_whisper.py
+# SPDX-License-Identifier: Apache-2.0
 """Compare the outputs of HF and vLLM for Whisper models using greedy sampling.
 Run `pytest tests/models/encoder_decoder/audio_language/test_whisper.py`.

--- a/tests/models/encoder_decoder/language/test_bart.py
+++ b/tests/models/encoder_decoder/language/test_bart.py
+# SPDX-License-Identifier: Apache-2.0
 """Compare the outputs of HF and vLLM for BART models using greedy sampling.
 Run `pytest tests/models/encoder_decoder/language/test_bart.py`.

--- a/tests/models/encoder_decoder/vision_language/test_broadcast.py
+++ b/tests/models/encoder_decoder/vision_language/test_broadcast.py
+# SPDX-License-Identifier: Apache-2.0
 import pytest
 import os

--- a/tests/models/encoder_decoder/vision_language/test_florence2.py
+++ b/tests/models/encoder_decoder/vision_language/test_florence2.py
+# SPDX-License-Identifier: Apache-2.0
 from functools import partial
 from typing import List, Optional, Tuple, Type

--- a/tests/models/encoder_decoder/vision_language/test_mllama.py
+++ b/tests/models/encoder_decoder/vision_language/test_mllama.py
+# SPDX-License-Identifier: Apache-2.0
 from typing import List, Optional, Tuple, Type, overload
 import os

--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
+# SPDX-License-Identifier: Apache-2.0
 from functools import partial
 import numpy as np
@@ -139,13 +141,15 @@ def _test_processing_correctness(
 # yapf: disable
-# True if the model supports multiple data items of the modality per request
 @pytest.mark.parametrize("model_id", [
    "rhymes-ai/Aria",
    "Salesforce/blip2-opt-2.7b",
    "facebook/chameleon-7b",
    "deepseek-ai/deepseek-vl2-tiny",
    "adept/fuyu-8b",
+    "h2oai/h2ovl-mississippi-800m",
+    "OpenGVLab/InternVL2-1B",
+    "HuggingFaceM4/Idefics3-8B-Llama3",
    "llava-hf/llava-1.5-7b-hf",
    "llava-hf/llava-v1.6-mistral-7b-hf",
    "llava-hf/LLaVA-NeXT-Video-7B-hf",
@@ -154,8 +158,10 @@ def _test_processing_correctness(
    "mistral-community/pixtral-12b",
    "openbmb/MiniCPM-o-2_6",
    "openbmb/MiniCPM-V-2_6",
+    "nvidia/NVLM-D-72B",
    "Qwen/Qwen-VL-Chat",
    "Qwen/Qwen2-VL-2B-Instruct",
+    "Qwen/Qwen2.5-VL-3B-Instruct",
    "Qwen/Qwen2-Audio-7B-Instruct",
    "fixie-ai/ultravox-v0_3",
 ])

--- a/tests/models/multimodal/processing/test_h2ovl.py
+++ b/tests/models/multimodal/processing/test_h2ovl.py
+# SPDX-License-Identifier: Apache-2.0
+"""Tests for H2OVL's multimodal preprocessing kwargs."""
+from typing import Optional
+import pytest
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.image import rescale_image_size
+from vllm.multimodal.utils import cached_get_tokenizer
+from ....conftest import _ImageAssets
+from ...utils import build_model_context
+# Checks that the H2OVL multimodal processor honours `max_dynamic_patch` and
+# `dynamic_image_size` passed as per-call mm_processor_kwargs: for every image
+# asset and size factor, the first dimension of `pixel_values_flat` must equal
+# the independently computed number of patches per image times `num_imgs`.
+@pytest.mark.parametrize("model_id", [
+    "h2oai/h2ovl-mississippi-800m",
+    "h2oai/h2ovl-mississippi-2b",
+])
+@pytest.mark.parametrize(
+    "size_factors",
+    [
+        # Single-scale
+        [1.0],
+        # Single-scale, batched
+        [1.0, 1.0, 1.0],
+        # Multi-scale
+        [0.25, 0.5, 1.0],
+    ],
+)
+@pytest.mark.parametrize("max_dynamic_patch", [1, 2, 4, 8])
+@pytest.mark.parametrize("dynamic_image_size", [True, False])
+@pytest.mark.parametrize("num_imgs", [1, 2])
+def test_processor_override(
+    model_id: str,
+    image_assets: _ImageAssets,
+    # NOTE(review): annotated as list[int], but the parametrized values above
+    # are floats (e.g. 0.25) -- list[float] looks like the intended type.
+    size_factors: list[int],
+    max_dynamic_patch: int,
+    # NOTE(review): only True/False are parametrized above, so None never
+    # reaches this test even though the annotation allows it.
+    dynamic_image_size: Optional[bool],
+    num_imgs: int,
+):
+    # Imported inside the test rather than at module level; presumably to
+    # avoid loading model code during test collection -- TODO confirm.
+    from vllm.model_executor.models.h2ovl import (calculate_h2ovl_targets,
+                                                  get_h2ovl_target_ratios)
+    # The context is built without mm_processor_kwargs; the overrides are
+    # passed per call to processor.apply() further down.
+    ctx = build_model_context(
+        model_name=model_id,
+        tokenizer_name=model_id,
+        trust_remote_code=True,
+        mm_processor_kwargs=None,
+        limit_mm_per_prompt={"image": num_imgs},
+    )
+    tokenizer = cached_get_tokenizer(
+        ctx.model_config.tokenizer,
+        trust_remote_code=ctx.model_config.trust_remote_code,
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(
+        ctx.model_config,
+        tokenizer=tokenizer,
+    )
+    config = processor.info.get_hf_config()
+    # Selects between the two-pass and single-pass expectation below.
+    use_msac = config.use_msac
+    mm_processor_kwargs = {
+        "max_dynamic_patch": max_dynamic_patch,
+    }
+    if dynamic_image_size is not None:
+        mm_processor_kwargs["dynamic_image_size"] = dynamic_image_size
+    min_num = config.min_dynamic_patch
+    # With dynamic image size disabled, the expectation is capped at one
+    # patch regardless of max_dynamic_patch.
+    max_num = max_dynamic_patch if dynamic_image_size else 1
+    # Build the image str / prompt based on the number of images we pass
+    prompt = "<image>" * num_imgs
+    for asset in image_assets:
+        for factor in size_factors:
+            image = rescale_image_size(asset.pil_image, factor)
+            # The same rescaled image is repeated num_imgs times.
+            mm_data = {"image": [image] * num_imgs}
+            width, height = image.size
+            # Calculate the expected number of blocks
+            if num_imgs == 1 and use_msac:
+                # First pass
+                blocks1, _, _, aspect_ratio = calculate_h2ovl_targets(
+                    orig_width=width,
+                    orig_height=height,
+                    target_ratios=get_h2ovl_target_ratios(
+                        min_num,
+                        max_num,
+                        prior_aspect_ratio=None,
+                    ),
+                    image_size=config.vision_config.image_size,
+                    use_thumbnail=False,  # Thumbnail is handled separately
+                )
+                # Second pass: the aspect ratio returned by the first pass is
+                # fed back in as prior_aspect_ratio.
+                blocks2, _, _, _ = calculate_h2ovl_targets(
+                    orig_width=width,
+                    orig_height=height,
+                    target_ratios=get_h2ovl_target_ratios(
+                        min_num,
+                        max_num,
+                        prior_aspect_ratio=aspect_ratio,
+                    ),
+                    image_size=config.vision_config.image_size,
+                    use_thumbnail=False,
+                )
+                # Add thumbnail if use_thumbnail is True and total_blocks > 1
+                if config.use_thumbnail:
+                    blocks1 += 1 if blocks1 > 1 else 0
+                    blocks2 += 1 if blocks2 > 1 else 0
+                # Total blocks is the sum of blocks from both passes minus
+                # overlapping
+                total_blocks = blocks1 + blocks2 - 1
+                expected_num_patches = total_blocks
+            else:
+                # Single-pass expectation: used for multiple images or when
+                # use_msac is off.
+                blocks, _, _, _ = calculate_h2ovl_targets(
+                    orig_width=width,
+                    orig_height=height,
+                    target_ratios=get_h2ovl_target_ratios(
+                        min_num,
+                        max_num,
+                        prior_aspect_ratio=None,
+                    ),
+                    image_size=config.vision_config.image_size,
+                    use_thumbnail=False,
+                )
+                expected_num_patches = blocks
+                # One extra patch for the thumbnail, except when the image
+                # maps to a single block.
+                if config.use_thumbnail and expected_num_patches != 1:
+                    expected_num_patches += 1
+            processed_inputs = processor.apply(prompt, mm_data,
+                                               mm_processor_kwargs)
+            pixel_shape = (
+                processed_inputs["mm_kwargs"]["pixel_values_flat"].shape)
+            # Every image in mm_data is identical, so the per-image patch
+            # count simply scales with num_imgs.
+            assert pixel_shape[0] == expected_num_patches * num_imgs
--- a/tests/models/multimodal/processing/test_idefics3.py
+++ b/tests/models/multimodal/processing/test_idefics3.py
+# SPDX-License-Identifier: Apache-2.0
 """Tests for Idefics3's multimodal preprocessing kwargs."""
-from typing import Optional
 import os
 import pytest
-import torch
+from transformers import Idefics3Config
-from transformers import AutoImageProcessor, AutoTokenizer
-from vllm.inputs import InputContext, token_inputs
+from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal import MultiModalRegistry
+from vllm.multimodal.utils import cached_get_tokenizer
 from ....conftest import _ImageAssets
 from ...utils import build_model_context
@@ -16,163 +14,53 @@ from ....utils import models_path_prefix
 models = [os.path.join(models_path_prefix, "HuggingFaceM4/Idefics3-8B-Llama3")]
-# Wrap lazy imports to avoid initializing CUDA during test collection
-@pytest.fixture()
-def input_processor_for_idefics3():
-    from vllm.model_executor.models.idefics3 import (
-        input_processor_for_idefics3)
-    return input_processor_for_idefics3
-@pytest.fixture()
-def dummy_data_for_idefics3():
-    from vllm.model_executor.models.idefics3 import dummy_data_for_idefics3
-    return dummy_data_for_idefics3
-@pytest.fixture()
-def get_max_idefics3_image_tokens():
-    from vllm.model_executor.models.idefics3 import (
-        get_max_idefics3_image_tokens)
-    return get_max_idefics3_image_tokens
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("longest_edge", [None, 168, 336, 400, 2 * 336])
-def test_input_mapper_override(model: str, image_assets: _ImageAssets,
-                               longest_edge: Optional[int]):
-    """Ensure that the [default] input mapper handles size properly."""
-    mm_processor_kwargs = {
-        "size": {
-            "longest_edge": longest_edge
-        }
-    } if longest_edge is not None else {}
-    ctx = build_model_context(
-        model_name=model,
-        tokenizer_name=model,
-        trust_remote_code=True,
-        mm_processor_kwargs=mm_processor_kwargs,
-    )
-    hf_processor = AutoImageProcessor.from_pretrained(model,
-                                                      trust_remote_code=True,
-                                                      **mm_processor_kwargs)
-    mm_registry = MultiModalRegistry()
-    mm_registry.init_mm_limits_per_prompt(ctx.model_config)
-    image = image_assets[0].pil_image
-    hf_result = hf_processor.preprocess(
-        image,
-        return_tensors="pt",
-    )
-    vllm_result = mm_registry.map_input(
-        ctx.model_config,
-        {"image": image},
-    )
-    assert torch.all(hf_result["pixel_values"] == vllm_result["pixel_values"])
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("longest_edge, expected_max_tokens", [
-    (None, 2873),
-    (168, 169),
-    (336, 169),
-    (400, 338),
-    (672, 338),
-])
-def test_max_tokens_override(get_max_idefics3_image_tokens, model: str,
-                             longest_edge: Optional[int],
-                             expected_max_tokens: int):
-    """Ensure get_max_idefics3_image_tokens handles mm_processor_kwargs."""
-    size = {"longest_edge": longest_edge} if longest_edge is not None else None
-    ctx = build_model_context(
-        model_name=model,
-        tokenizer_name=model,
-        trust_remote_code=True,
-        mm_processor_kwargs=None,
-    )
-    actual_max_tokens = get_max_idefics3_image_tokens(
-        ctx=InputContext(ctx.model_config),
-        size=size,
-    )
-    assert expected_max_tokens == actual_max_tokens
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("longest_edge, toks_per_img, num_imgs", [
-    (168, 169, 1),
-    (168, 169, 2),
-    (400, 338, 1),
-    (400, 338, 2),
-])
-def test_dummy_data_override(dummy_data_for_idefics3, model: str,
-                             longest_edge: int, toks_per_img: int,
-                             num_imgs: int):
-    """Ensure dummy_data_for_idefics3 handles num_crops properly."""
-    # Same as the previous test - don't initialize mm_processor_kwargs
-    # in this test and assume that the kwargs will be correctly expanded by
-    # the partial when calling the dummy data func.
-    size = {"longest_edge": longest_edge} if longest_edge is not None else None
-    ctx = build_model_context(
-        model_name=model,
-        tokenizer_name=model,
-        trust_remote_code=True,
-        mm_processor_kwargs=None,
-    )
-    dummy_data = dummy_data_for_idefics3(
-        ctx=ctx,
-        seq_len=8192,  # Should be bigger than num_imgs * toks_per_img
-        mm_counts={"image": num_imgs},
-        size=size)
-    sequence_data = dummy_data.seq_data
-    # Ensure we have the right number of placeholders per size
-    image_token_id = ctx.get_hf_config().image_token_id
-    img_tok_count = sequence_data.get_token_ids().count(image_token_id)
-    assert img_tok_count == toks_per_img * num_imgs
 @pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("longest_edge,expected_toks_per_img,num_imgs", [
+# yapf: disable
-    (336, 169 * (1**2 + 1), 1),
+@pytest.mark.parametrize(
-    (336, 169 * (1**2 + 1), 2),
+    ("mm_processor_kwargs", "expected_toks_per_img"),
-    (400, 169 * (2**2 + 1), 1),
+    [
-    (400, 169 * (2**2 + 1), 2),
+        ({"size": {"longest_edge": 364}}, 169),
-])
+        ({"size": {"longest_edge": 728}}, 169 * (2**2 + 1)),
-def test_input_processor_override(input_processor_for_idefics3,
+    ])
-                                  image_assets: _ImageAssets, model: str,
+# yapf: enable
-                                  longest_edge: int,
+@pytest.mark.parametrize("num_imgs", [1, 2])
-                                  expected_toks_per_img: int, num_imgs: int):
+def test_processor_override(image_assets: _ImageAssets, model: str,
+                            mm_processor_kwargs: dict[str, object],
+                            expected_toks_per_img: int, num_imgs: int):
    """Ensure input_processor_for_idefics3 handles num_crops properly."""
    # Build the context without mm_processor_kwargs; the kwargs are instead
    # passed per call to processor.apply() below, which is expected to apply
    # them correctly.
-    size = {"longest_edge": longest_edge} if longest_edge is not None else None
    ctx = build_model_context(
        model_name=model,
        tokenizer_name=model,
        trust_remote_code=True,
        mm_processor_kwargs=None,
+        limit_mm_per_prompt={"image": num_imgs},
    )
+    tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer)
+    processor = MULTIMODAL_REGISTRY.create_processor(
+        ctx.model_config,
+        tokenizer=tokenizer,
+    )
+    hf_processor = processor.info.get_hf_processor(**mm_processor_kwargs)
    # Build the image str / prompt based on the number of images we pass
-    tokenizer = AutoTokenizer.from_pretrained(model)
    placeholders = "<image>" if num_imgs == 1 else "\n".join(
        f"Image-{i}: <image>\n" for i in range(1, num_imgs + 1))
    prompt = f"<|begin_of_text|>User:{placeholders}\n<end_of_utterance>\nAssistant:"  # noqa: E501
-    images = [image_assets[0].pil_image.resize((336 * 4, 336 * 4))] * num_imgs
-    inputs = token_inputs(prompt_token_ids=tokenizer.encode(prompt),
-                          prompt=prompt,
-                          multi_modal_data={"image": images})
-    processed_inputs = input_processor_for_idefics3(ctx, inputs, size=size)
+    # Build mm_data
+    image_size = ctx.get_hf_config(Idefics3Config).vision_config.image_size
+    dummy_image_size = (image_size * 4, image_size * 4)
+    dummy_image = image_assets[0].pil_image.resize(dummy_image_size)
+    mm_data = {"image": [dummy_image] * num_imgs}
+    processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
+    # Ensure the placeholder format is correct
+    hf_processed_inputs = hf_processor(text=prompt, images=mm_data["image"])
+    assert processed_inputs["prompt_token_ids"] == hf_processed_inputs[
+        "input_ids"][0]
    # Ensure we have the right number of placeholders per num_crops size
    image_token_id = ctx.get_hf_config().image_token_id

--- a/tests/models/multimodal/processing/test_internvl.py
+++ b/tests/models/multimodal/processing/test_internvl.py
+# SPDX-License-Identifier: Apache-2.0
 """Tests for InternVL's multimodal preprocessing kwargs."""
-from typing import Callable, Optional
+from typing import Optional
 import os
 import pytest
-from transformers import AutoTokenizer
-from vllm.inputs import InputContext, token_inputs
+from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal import MultiModalRegistry
+from vllm.multimodal.utils import cached_get_tokenizer
 from ....conftest import _ImageAssets
 from ...utils import build_model_context
 from ....utils import models_path_prefix
-models = [os.path.join(models_path_prefix, "OpenGVLab/InternVL2-2B")]
+@pytest.mark.parametrize("model_id", [os.path.join(models_path_prefix, "OpenGVLab/InternVL2-2B")])
-# Wrap lazy imports to avoid initializing CUDA during test collection
-@pytest.fixture()
-def input_processor_for_internvl():
-    from vllm.model_executor.models.internvl import InternVLInputPipeline
-    pipeline = InternVLInputPipeline('<img>', '</img>', '<IMG_CONTEXT>')
-    return pipeline.input_processor
-@pytest.fixture()
-def dummy_data_for_internvl():
-    from vllm.model_executor.models.internvl import InternVLInputPipeline
-    pipeline = InternVLInputPipeline('<img>', '</img>', '<IMG_CONTEXT>')
-    return pipeline.dummy_data
-@pytest.fixture()
-def get_max_internvl_image_tokens():
-    from vllm.model_executor.models.internvl import (
-        get_max_internvl_image_tokens)
-    return get_max_internvl_image_tokens
-@pytest.mark.parametrize("model", models)
 @pytest.mark.parametrize("max_dynamic_patch", [1, 4])
 @pytest.mark.parametrize("dynamic_image_size", [True, False, None])
-def test_input_mapper_override(
+@pytest.mark.parametrize("num_imgs", [1, 2])
-    model: str,
+def test_processor_override(
+    model_id: str,
    image_assets: _ImageAssets,
    max_dynamic_patch: int,
    dynamic_image_size: Optional[bool],
-):
-    mm_processor_kwargs = {
-        "max_dynamic_patch": max_dynamic_patch,
-    }
-    if dynamic_image_size is not None:
-        mm_processor_kwargs["dynamic_image_size"] = dynamic_image_size
-    expected_num_patches = max_dynamic_patch + 1 if max_dynamic_patch > 1 else 1
-    if dynamic_image_size is False:
-        expected_num_patches = 1
-    ctx = build_model_context(
-        model_name=model,
-        tokenizer_name=model,
-        trust_remote_code=True,
-        mm_processor_kwargs=mm_processor_kwargs,
-    )
-    mm_registry = MultiModalRegistry()
-    mm_registry.init_mm_limits_per_prompt(ctx.model_config)
-    image = image_assets[0].pil_image.resize((448 * 2, 448 * 2))
-    vllm_result = mm_registry.map_input(
-        ctx.model_config,
-        {"image": image},
-    )
-    assert vllm_result["pixel_values"].size(1) == expected_num_patches
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("max_dynamic_patch", [1, 4, None])
-@pytest.mark.parametrize("dynamic_image_size", [True, False, None])
-def test_max_tokens_override(
-    get_max_internvl_image_tokens: Callable,
-    model: str,
-    max_dynamic_patch: Optional[int],
-    dynamic_image_size: Optional[bool],
-):
-    """Ensure get_max_internvl_image_tokens handles mm_processor_kwargs."""
-    ctx = build_model_context(
-        model_name=model,
-        tokenizer_name=model,
-        trust_remote_code=True,
-        mm_processor_kwargs=None,
-    )
-    if max_dynamic_patch is None:
-        max_dynamic_patch = ctx.get_hf_config().max_dynamic_patch
-    expected_num_patches = max_dynamic_patch + 1 if max_dynamic_patch > 1 else 1
-    if dynamic_image_size is False:
-        expected_num_patches = 1
-    expected_max_tokens = 256 * expected_num_patches
-    actual_max_tokens = get_max_internvl_image_tokens(
-        ctx=InputContext(ctx.model_config),
-        max_dynamic_patch=max_dynamic_patch,
-        dynamic_image_size=dynamic_image_size,
-    )
-    assert expected_max_tokens == actual_max_tokens
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("num_imgs", [1, 2])
-@pytest.mark.parametrize("max_dynamic_patch", [1, 4, None])
-@pytest.mark.parametrize("dynamic_image_size", [True, False, None])
-def test_dummy_data_override(
-    dummy_data_for_internvl: Callable,
-    model: str,
    num_imgs: int,
-    max_dynamic_patch: Optional[int],
-    dynamic_image_size: Optional[bool],
 ):
-    """Ensure dummy_data_for_internvl handles kwargs properly."""
-    # Same as the previous test - don't initialize mm_processor_kwargs
-    # in this test and assume that the kwargs will be correctly expanded by
-    # the partial when calling the dummy data func.
    ctx = build_model_context(
-        model_name=model,
+        model_name=model_id,
-        tokenizer_name=model,
+        tokenizer_name=model_id,
        trust_remote_code=True,
        mm_processor_kwargs=None,
+        limit_mm_per_prompt={"image": num_imgs},
    )
+    tokenizer = cached_get_tokenizer(
-    if max_dynamic_patch is None:
+        ctx.model_config.tokenizer,
-        max_dynamic_patch = ctx.get_hf_config().max_dynamic_patch
+        trust_remote_code=ctx.model_config.trust_remote_code,
-    expected_num_patches = max_dynamic_patch + 1 if max_dynamic_patch > 1 else 1
+    )
-    if dynamic_image_size is False:
+    processor = MULTIMODAL_REGISTRY.create_processor(
-        expected_num_patches = 1
+        ctx.model_config,
-    expected_max_tokens = 256 * expected_num_patches
+        tokenizer=tokenizer,
-    dummy_data = dummy_data_for_internvl(
-        ctx=ctx,
-        seq_len=8192,  # Should be bigger than num_imgs * toks_per_img
-        mm_counts={"image": num_imgs},
-        max_dynamic_patch=max_dynamic_patch,
-        dynamic_image_size=dynamic_image_size,
    )
-    sequence_data = dummy_data.seq_data
-    tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
-    image_token_id = tokenizer.encode('<IMG_CONTEXT>',
-                                      add_special_tokens=False)[0]
-    # Ensure we have the right number of placeholders per size
+    mm_processor_kwargs = {
-    img_tok_count = sequence_data.get_token_ids().count(image_token_id)
+        "max_dynamic_patch": max_dynamic_patch,
-    assert img_tok_count == expected_max_tokens * num_imgs
+    }
+    if dynamic_image_size is not None:
+        mm_processor_kwargs["dynamic_image_size"] = dynamic_image_size
+    # Build the image str / prompt based on the number of images we pass
+    prompt = "<image>" * num_imgs
+    image = image_assets[0].pil_image.resize((448 * 2, 448 * 2))
+    mm_data = {"image": [image] * num_imgs}
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("max_dynamic_patch", [1, 4])
-@pytest.mark.parametrize("dynamic_image_size", [True, False, None])
-@pytest.mark.parametrize("num_imgs", [1, 2])
-def test_input_processor_override(
-    input_processor_for_internvl: Callable,
-    image_assets: _ImageAssets,
-    model: str,
-    num_imgs: int,
-    max_dynamic_patch: int,
-    dynamic_image_size: Optional[bool],
-):
-    """Ensure input_processor_for_internvl handles kwargs properly."""
-    # Same as the previous test - don't initialize mm_processor_kwargs
-    # in this test and assume that the kwargs will be correctly expanded by
-    # the partial when calling the custom input processor.
    expected_num_patches = max_dynamic_patch + 1 if max_dynamic_patch > 1 else 1
    if dynamic_image_size is False:
        expected_num_patches = 1
-    ctx = build_model_context(
+    processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
-        model_name=model,
-        tokenizer_name=model,
-        trust_remote_code=True,
-        mm_processor_kwargs=None,
-    )
-    expected_toks_per_img = 256 * expected_num_patches
-    # Build the image str / prompt based on the number of images we pass
-    tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
-    placeholders = "<image>" if num_imgs == 1 else "\n".join(
-        f"Image-{i}: <image>\n" for i in range(1, num_imgs + 1))
-    prompt = placeholders
-    images = [image_assets[0].pil_image.resize((448 * 2, 448 * 2))] * num_imgs
-    inputs = token_inputs(prompt_token_ids=tokenizer.encode(prompt),
-                          prompt=prompt,
-                          multi_modal_data={"image": images})
-    processed_inputs = input_processor_for_internvl(
-        ctx,
-        inputs,
-        max_dynamic_patch=max_dynamic_patch,
-        dynamic_image_size=dynamic_image_size,
-    )
    # Ensure we have the right number of placeholders per num_crops size
-    image_token_id = tokenizer.encode('<IMG_CONTEXT>',
+    image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
-                                      add_special_tokens=False)[0]
    img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
-    assert img_tok_count == expected_toks_per_img * num_imgs
+    pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape
+    assert img_tok_count == 256 * expected_num_patches * num_imgs
+    assert pixel_shape[0] == expected_num_patches * num_imgs
--- a/tests/models/multimodal/processing/test_llava_next.py
+++ b/tests/models/multimodal/processing/test_llava_next.py
+# SPDX-License-Identifier: Apache-2.0
 import itertools
 from functools import partial
@@ -41,7 +43,10 @@ def test_processor_max_tokens(model_id):
    )
    processor = MULTIMODAL_REGISTRY.create_processor(
        ctx.model_config,
-        tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
+        tokenizer=cached_get_tokenizer(
+            ctx.model_config.tokenizer,
+            trust_remote_code=ctx.model_config.trust_remote_code,
+        ),
    )
    info = processor.info
@@ -141,7 +146,10 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
    )
    processor = MULTIMODAL_REGISTRY.create_processor(
        ctx.model_config,
-        tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
+        tokenizer=cached_get_tokenizer(
+            ctx.model_config.tokenizer,
+            trust_remote_code=ctx.model_config.trust_remote_code,
+        ),
    )
    image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
@@ -171,7 +179,10 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
    )
    processor = MULTIMODAL_REGISTRY.create_processor(
        ctx.model_config,
-        tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
+        tokenizer=cached_get_tokenizer(
+            ctx.model_config.tokenizer,
+            trust_remote_code=ctx.model_config.trust_remote_code,
+        ),
    )
    seen_aspect_ratios = set[float]()

--- a/tests/models/multimodal/processing/test_llava_onevision.py
+++ b/tests/models/multimodal/processing/test_llava_onevision.py
+# SPDX-License-Identifier: Apache-2.0
 import itertools
 from functools import partial
@@ -42,7 +44,10 @@ def test_processor_max_tokens(model_id):
    )
    processor = MULTIMODAL_REGISTRY.create_processor(
        ctx.model_config,
-        tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
+        tokenizer=cached_get_tokenizer(
+            ctx.model_config.tokenizer,
+            trust_remote_code=ctx.model_config.trust_remote_code,
+        ),
    )
    info = processor.info
@@ -141,7 +146,10 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
    )
    processor = MULTIMODAL_REGISTRY.create_processor(
        ctx.model_config,
-        tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
+        tokenizer=cached_get_tokenizer(
+            ctx.model_config.tokenizer,
+            trust_remote_code=ctx.model_config.trust_remote_code,
+        ),
    )
    image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
@@ -172,7 +180,10 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
    )
    processor = MULTIMODAL_REGISTRY.create_processor(
        ctx.model_config,
-        tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
+        tokenizer=cached_get_tokenizer(
+            ctx.model_config.tokenizer,
+            trust_remote_code=ctx.model_config.trust_remote_code,
+        ),
    )
    seen_aspect_ratios = set[float]()