Merge tag 'v0.10.2rc1' into v0.10.2rc1-ori

d2b52805 · zhuwenwen · 9a521c23 · 5438967f · d2b52805 · d2b52805
Commit d2b52805 authored Sep 07, 2025 by zhuwenwen
20 changed files
--- a/tests/models/multimodal/generation/vlm_utils/custom_inputs.py
+++ b/tests/models/multimodal/generation/vlm_utils/custom_inputs.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Custom input builders for edge-cases in different models."""
-from io import BytesIO
 from typing import Callable
-import requests
+from vllm.assets.image import ImageAsset
-from PIL import Image
 from vllm.multimodal.image import rescale_image_size
 from vllm.multimodal.video import (rescale_video_size, resize_video,
                                   sample_frames_from_video)
@@ -118,9 +115,9 @@ def different_patch_input_cases_internvl():
 def windows_attention_image_qwen2_5_vl():
-    # image from regression issue: https://github.com/vllm-project/vllm/issues/15122
-    image_url = "https://aomediacodec.github.io/av1-avif/testFiles/Link-U/hato.jpg"
+    # image from regression issue: https://github.com/vllm-project/vllm/issues/15122 # noqa: E501
-    image = Image.open(BytesIO(requests.get(image_url).content))
+    image = ImageAsset("hato").pil_image
    question = "Describe the image."
    img_prompt = "<|vision_start|><|image_pad|><|vision_end|>"

--- a/tests/models/multimodal/generation/vlm_utils/model_utils.py
+++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py
@@ -10,6 +10,7 @@ from typing import Optional, Union
 import numpy as np
 import numpy.typing as npt
+import PIL.Image
 import pytest
 import regex as re
 import torch
@@ -19,7 +20,6 @@ from transformers import (AutoConfig, AutoTokenizer, BatchFeature,
 from transformers.video_utils import VideoMetadata
 from vllm.sequence import SampleLogprobs
-from vllm.transformers_utils.tokenizer import patch_padding_side
 from vllm.utils import is_list_of
 from .....conftest import HfRunner, ImageAsset, ImageTestAssets
@@ -343,7 +343,6 @@ def gemma3_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
 def glm4v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    """Patches and returns an instance of the HfRunner to use for GLM4V."""
    hf_processor = hf_model.processor
-    patch_padding_side(hf_processor)
    def processor(*args, text="", images=None, **kwargs):
        if images is None:
@@ -812,6 +811,63 @@ def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    return hf_model
+def ovis2_5_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
+    """Patches and returns an instance of the HfRunner to use for Ovis2."""
+    hf_model.model.get_output_embeddings = lambda: \
+        hf_model.model.llm.get_output_embeddings()
+    def processor(*args, text="", images=None, videos=None, **kwargs):
+        if images is None:
+            images = []
+        else:
+            images = [images] if isinstance(images, Image) else images
+        if videos is None:
+            videos = []
+        else:
+            videos = [videos] if isinstance(videos, np.ndarray) else videos
+            videos = [[PIL.Image.fromarray(frame) for frame in vid]
+                      for vid in videos]
+        prompt_start_and_end = {
+            "qwen2": ("<|im_start|>user\n", "<|im_end|>\n"),
+            "llama":
+            ("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
+            "gemma2": ("<start_of_turn>user\n", "<end_of_turn>\n"),
+        }
+        for start, end in prompt_start_and_end.values():
+            if start in text and end in text:
+                text = text.split(start)[1].split(end)[0]
+                break
+        images_message = [{"type": "image", "image": img} for img in images]
+        videos_message = [{"type": "video", "video": vid} for vid in videos]
+        messages = [{
+            "role":
+            "user",
+            "content": [
+                *images_message,
+                *videos_message,
+                {
+                    "type": "text",
+                    "text": text
+                },
+            ],
+        }]
+        input_ids, pixel_values, grid_thws = hf_model.model.preprocess_inputs(
+            messages=messages, enable_thinking=True)
+        inputs = {
+            "inputs": input_ids,
+            "pixel_values": pixel_values,
+            "grid_thws": grid_thws,
+        }
+        return BatchFeature(data=inputs, tensor_type="pt")
+    hf_model.processor = processor
+    return hf_model
 def qwen2_5_omni_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    """Patches and returns an instance of the HfRunner for Qwen2.5-Omni."""
    thinker = hf_model.model.thinker

--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -14,8 +14,9 @@ from PIL import Image
 from vllm.config import ModelConfig
 from vllm.inputs import InputProcessingContext
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
+from vllm.multimodal.cache import MultiModalProcessorOnlyCache
 from vllm.multimodal.inputs import MultiModalInputs
-from vllm.multimodal.processing import BaseMultiModalProcessor, ProcessingCache
+from vllm.multimodal.processing import BaseMultiModalProcessor
 from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer,
                                               cached_tokenizer_from_config,
                                               encode_tokens)
@@ -63,6 +64,8 @@ def _test_processing_correctness(
        revision=model_info.revision,
        trust_remote_code=model_info.trust_remote_code,
        hf_overrides=model_info.hf_overrides,
+        # Ensure that the cache can fit all of the data
+        mm_processor_cache_gb=2048,
    )
    model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
@@ -71,8 +74,7 @@ def _test_processing_correctness(
        model_config,
        tokenizer=cached_tokenizer_from_config(model_config),
    )
-    # Ensure that it can fit all of the data
+    cache = MultiModalProcessorOnlyCache(model_config)
-    cache = ProcessingCache(capacity_gb=2048)
    processing_info = factories.info(ctx)
    supported_mm_limits = processing_info.get_supported_mm_limits()
@@ -102,7 +104,7 @@ def _test_processing_correctness(
        partial(random_video,
                rng,
                min_frames=2,
-                max_frames=8,
+                max_frames=16,
                min_wh=128,
                max_wh=256),
        "audio":
@@ -160,8 +162,10 @@ def _test_processing_correctness(
 # incorrect token ids. So we need use `add_special_tokens=False` here
 # to leave bos_token to be added by the processor.
 _ADD_SPECIAL_TOKENS_OVERRIDES = {
+    "donut": False,
    "mllama": False,
    "ovis": False,
+    "ovis2_5": False,
    "paligemma": False,
    "ultravox": False,
    "whisper": False,
@@ -267,23 +271,30 @@ def _test_processing_correctness_one(
    "CohereForAI/aya-vision-8b",
    "Salesforce/blip2-opt-2.7b",
    "facebook/chameleon-7b",
+    "CohereLabs/command-a-vision-07-2025",
    "deepseek-ai/deepseek-vl2-tiny",
+    "naver-clova-ix/donut-base-finetuned-docvqa",
+    "baidu/ERNIE-4.5-VL-28B-A3B-PT",
    "microsoft/Florence-2-base",
    "adept/fuyu-8b",
    "google/gemma-3-4b-it",
    "google/gemma-3n-E2B-it",
    "zai-org/glm-4v-9b",
    "zai-org/GLM-4.1V-9B-Thinking",
+    "zai-org/GLM-4.5V",
    "ibm-granite/granite-speech-3.3-2b",
    "h2oai/h2ovl-mississippi-800m",
+    "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
+    "HuggingFaceM4/Idefics3-8B-Llama3",
    "internlm/Intern-S1",
    "OpenGVLab/InternVL2-1B",
    "OpenGVLab/InternVL3-1B",
-    "HuggingFaceM4/Idefics3-8B-Llama3",
+    "OpenGVLab/InternVL3_5-1B",
-    "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
+    "OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview",
+    "OpenGVLab/InternVL3_5-30B-A3B",
+    "Kwai-Keye/Keye-VL-8B-Preview",
    "moonshotai/Kimi-VL-A3B-Instruct",
    "meta-llama/Llama-4-Scout-17B-16E-Instruct",
-    "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
    "llava-hf/llava-1.5-7b-hf",
    "llava-hf/llava-v1.6-mistral-7b-hf",
    "llava-hf/LLaVA-NeXT-Video-7B-hf",
@@ -301,6 +312,7 @@ def _test_processing_correctness_one(
    "AIDC-AI/Ovis1.6-Gemma2-9B",
    "AIDC-AI/Ovis1.6-Llama3.2-3B",
    "AIDC-AI/Ovis2-1B",
+    "AIDC-AI/Ovis2.5-2B",
    "google/paligemma-3b-mix-224",
    "google/paligemma2-3b-ft-docci-448",
    "microsoft/Phi-3.5-vision-instruct",
@@ -312,11 +324,15 @@ def _test_processing_correctness_one(
    "Qwen/Qwen2.5-VL-3B-Instruct",
    "Qwen/Qwen2-Audio-7B-Instruct",
    "Qwen/Qwen2.5-Omni-3B",
+    "YannQi/R-4B",
    "Skywork/Skywork-R1V-38B",
+    "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
+    "stepfun-ai/step3",
    "fixie-ai/ultravox-v0_5-llama-3_2-1b",
    "openai/whisper-large-v3",
    "omni-research/Tarsier-7b",
    "omni-research/Tarsier2-Recap-7b",
+    "mistralai/Voxtral-Mini-3B-2507",
 ])
 @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
 @pytest.mark.parametrize("num_batches", [32])
@@ -370,10 +386,16 @@ def _assert_inputs_equal(
    if ignore_mm_keys is None:
        ignore_mm_keys = set()
-    assert "mm_kwargs" in a and "mm_kwargs" in b, msg
+    a_rest = {k: v for k, v in a.items() if k != "mm_kwargs"}
+    b_rest = {k: v for k, v in b.items() if k != "mm_kwargs"}
+    assert a_rest == b_rest, msg
+    a_data = a["mm_kwargs"].get_data()
+    b_data = b["mm_kwargs"].get_data()
    for key in ignore_mm_keys:
-        a["mm_kwargs"].pop(key, None)
+        a_data.pop(key, None)
-        b["mm_kwargs"].pop(key, None)
+        b_data.pop(key, None)
-    assert a == b, msg
+    assert a_data == b_data, msg
--- a/tests/models/multimodal/processing/test_glm4_1v.py
+++ b/tests/models/multimodal/processing/test_glm4_1v.py
@@ -45,7 +45,8 @@ def test_processor_override(
    video_token_id = tokenizer.convert_tokens_to_ids(hf_processor.video_token)
    video_tok_count = processed_inputs["prompt_token_ids"].count(
        video_token_id)
-    grid_t, _, _ = processed_inputs["mm_kwargs"]["video_grid_thw"][0]
+    grid_t, _, _ = processed_inputs["mm_kwargs"].get_data(
+    )["video_grid_thw"][0]
    assert grid_t == expected_grid_t
    assert video_tok_count == expected_toks_per_frame * grid_t
--- a/tests/models/multimodal/processing/test_h2ovl.py
+++ b/tests/models/multimodal/processing/test_h2ovl.py
@@ -108,7 +108,8 @@ def _run_check(
    # Ensure we have the right number of placeholders per num_crops size
    image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
    img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
-    pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape
+    pixel_shape = processed_inputs["mm_kwargs"].get_data(
+    )["pixel_values_flat"].shape
    assert img_tok_count == 256 * total_expected_num_patches
    assert pixel_shape[0] == total_expected_num_patches

--- a/tests/models/multimodal/processing/test_internvl.py
+++ b/tests/models/multimodal/processing/test_internvl.py
@@ -68,7 +68,8 @@ def _run_check(
    # Ensure we have the right number of placeholders per num_crops size
    image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
    img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
-    pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape
+    pixel_shape = processed_inputs["mm_kwargs"].get_data(
+    )["pixel_values_flat"].shape
    assert img_tok_count == 256 * total_expected_num_patches
    assert pixel_shape[0] == total_expected_num_patches

--- a/tests/models/multimodal/processing/test_llama4.py
+++ b/tests/models/multimodal/processing/test_llama4.py
@@ -51,14 +51,14 @@ def test_processor_override(
        prompt = encode_tokens(tokenizer, prompt)
    processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
-    mm_kwargs = processed_inputs["mm_kwargs"]
+    mm_data = processed_inputs["mm_kwargs"].get_data()
    # place holder replacements
    prompt_token_ids = processed_inputs["prompt_token_ids"]
    assert prompt_token_ids.count(config.boi_token_index) == num_imgs
    assert prompt_token_ids.count(config.eoi_token_index) == num_imgs
    assert prompt_token_ids.count(vocab[hf_processor.image_token]) == num_imgs
-    aspect_ratios = mm_kwargs["aspect_ratios"]
+    aspect_ratios = mm_data["aspect_ratios"]
    num_x_separators = num_y_separators = 0
    for tiles_y, tiles_x in aspect_ratios:
        if tiles_x * tiles_y > 1:
@@ -80,6 +80,6 @@ def test_processor_override(
    num_patches_per_chunk = processor.info.get_patch_per_chunk(
        config.vision_config)
    assert prompt_token_ids.count(config.image_token_index) \
-        == mm_kwargs["patches_per_image"].sum() * num_patches_per_chunk
+        == sum(mm_data["patches_per_image"]) * num_patches_per_chunk
-    assert mm_kwargs["pixel_values"].shape[0] \
+    assert len(mm_data["pixel_values"]) \
-        == mm_kwargs["patches_per_image"].sum()
+        == sum(mm_data["patches_per_image"])
--- a/tests/models/multimodal/processing/test_mllama.py
+++ b/tests/models/multimodal/processing/test_mllama.py
@@ -49,18 +49,18 @@ def test_profiling(
    encoder_seq_lens = [len(dummy_encoder_data.prompt_token_ids)
                        ] * max_num_seqs
-    mm_kwargs = processor.apply(
+    mm_data = processor.apply(
        prompt=dummy_mm_data.prompt,
        mm_data=dummy_mm_data.mm_data,
        hf_processor_mm_kwargs=dict(),
-    )["mm_kwargs"]
+    )["mm_kwargs"].get_data()
    # Get the actual number of encoder tokens for each sample.
    # Because attn_metadata.encoder_seq_lens only counts the last
    # group of images for each sample, which is used to cheat the
    # block manager to allocate blocks for those images only.
    # See MllamaMultiModalProcessor for more details.
-    num_tiles = [[t] for t in mm_kwargs.pop("num_tiles")]
+    num_tiles = [[t] for t in mm_data.pop("num_tiles")]
    num_tokens_per_tile = calc_token_per_chunk(image_size)
    actual_encoder_seq_lens = [
        sum(num_tile) * num_tokens_per_tile for num_tile in num_tiles

--- a/tests/models/multimodal/processing/test_mllama4.py
+++ b/tests/models/multimodal/processing/test_mllama4.py
@@ -38,21 +38,21 @@ def test_profiling(model_id: str, max_model_len: int):
    hf_config = ctx.get_hf_config(Llama4Config)
-    mm_kwargs = processor.apply(
+    mm_data = processor.apply(
        prompt=dummy_mm_data.prompt,
        mm_data=dummy_mm_data.mm_data,
        hf_processor_mm_kwargs=dict(),
-    )["mm_kwargs"]
+    )["mm_kwargs"].get_data()
    image_size = hf_config.vision_config.image_size
    patch_size = hf_config.vision_config.patch_size
    downsample_ratio = int(
        round(1.0 / (hf_config.vision_config.pixel_shuffle_ratio**2)))
    tokens_per_patch = ((image_size // patch_size)**2) // downsample_ratio
-    chunks_per_image = prod(mm_kwargs["patches_per_image"])
+    chunks_per_image = prod(mm_data["patches_per_image"])
    total_num_patches = chunks_per_image * tokens_per_patch
-    num_tiles = mm_kwargs["aspect_ratios"][0][0] * mm_kwargs["aspect_ratios"][
+    num_tiles = mm_data["aspect_ratios"][0][0] * mm_data["aspect_ratios"][0][
-        0][1]  # x-y seperator tokens
+        1]  # x-y seperator tokens
    total_tokens = total_num_patches.item() + num_tiles.item(
    ) + 3  # image start, image, image end

--- a/tests/models/multimodal/processing/test_nemotron_vl.py
+++ b/tests/models/multimodal/processing/test_nemotron_vl.py
@@ -70,7 +70,8 @@ def _run_check(
    # Ensure we have the right number of placeholders per num_crops size
    image_token_id = tokenizer.convert_tokens_to_ids("<image>")
    img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
-    pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape
+    pixel_shape = processed_inputs["mm_kwargs"].get_data(
+    )["pixel_values_flat"].shape
    print("Image token count:", img_tok_count, "Pixel shape:", pixel_shape)
    assert img_tok_count == 256 * total_expected_num_patches
    assert pixel_shape[0] == total_expected_num_patches

--- a/tests/models/multimodal/processing/test_qwen2_vl.py
+++ b/tests/models/multimodal/processing/test_qwen2_vl.py
@@ -48,7 +48,8 @@ def test_processor_override(
    hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
    image_token_id = tokenizer.convert_tokens_to_ids(hf_processor.image_token)
    img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
-    pixel_shape = processed_inputs["mm_kwargs"]["pixel_values"].shape
+    pixel_shape = processed_inputs["mm_kwargs"].get_data(
+    )["pixel_values"].shape
    assert img_tok_count == expected_toks_per_img * num_imgs
    assert pixel_shape[0] == expected_pixels_shape[0] * num_imgs

--- a/tests/models/multimodal/test_tensor_schema.py
+++ b/tests/models/multimodal/test_tensor_schema.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import tempfile
 from collections.abc import Iterable
+from contextlib import contextmanager
 from functools import partial
 from typing import Any, Union
-from unittest.mock import patch
 import numpy as np
 import pytest
+import torch.nn as nn
 from mistral_common.protocol.instruct.messages import (ImageChunk, TextChunk,
                                                       UserMessage)
 from mistral_common.protocol.instruct.request import ChatCompletionRequest
 from PIL import Image
-from vllm.config import ModelConfig
+from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
-from vllm.engine.llm_engine import LLMEngine as V0LLMEngine
+from vllm.distributed import (cleanup_dist_env_and_memory,
+                              init_distributed_environment,
+                              initialize_model_parallel)
 from vllm.inputs import InputProcessingContext
-from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
+from vllm.model_executor.model_loader.utils import set_default_torch_dtype
-                             MultiModalKwargs)
+from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensorInputs
 from vllm.multimodal.processing import BaseMultiModalProcessor
 from vllm.multimodal.utils import group_mm_kwargs_by_modality
 from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
-from vllm.utils import GiB_bytes, is_list_of, set_default_torch_num_threads
+from vllm.utils import is_list_of
-from vllm.v1.core.kv_cache_utils import get_kv_cache_config
-from vllm.v1.engine.core import EngineCore as V1EngineCore
-from ...conftest import VllmRunner
+from ...registry import _MULTIMODAL_EXAMPLE_MODELS, HF_EXAMPLE_MODELS
-from ..registry import _MULTIMODAL_EXAMPLE_MODELS, HF_EXAMPLE_MODELS
+from ...utils import dummy_hf_overrides
-from ..utils import dummy_hf_overrides
 ARCH_TO_SKIP = {
    "MolmoForCausalLM": "incompatible requirements",
-    "MiniMaxVL01ForConditionalGeneration": "broken model",
 }
 ARCH_NEEDS_EXTRAS = [
    "InternVLChatModel",
@@ -39,7 +39,12 @@ ARCH_NEEDS_EXTRAS = [
    "MiniCPMV",
    "PaliGemmaForConditionalGeneration",
 ]
-REPO_ID_TO_SKIP = {"nm-testing/pixtral-12b-FP8-dynamic": "duplicated test"}
+REPO_ID_TO_SKIP = {
+    "nm-testing/pixtral-12b-FP8-dynamic": "duplicated test",
+    # FIXME(Isotr0py): enable GPT-OSS based InternVL3.5 model
+    # after support PP for GPT-OSS
+    "OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview": "Broken model",
+}
 ImageInput = list[Image.Image]
 VideoInput = Union[list[Image.Image], list[np.ndarray],
@@ -128,11 +133,32 @@ def create_batched_mm_kwargs(
    )["mm_kwargs"]
    items = [
        item for modality in supported_mm_limits
-        for item in mm_kwargs.get_items(modality)
+        for item in mm_kwargs[modality]
    ]
    return group_mm_kwargs_by_modality(items)
+@contextmanager
+def initialize_dummy_model(model_cls: nn.Module, model_config: ModelConfig):
+    temp_file = tempfile.mkstemp()[1]
+    init_distributed_environment(
+        world_size=1,
+        rank=0,
+        distributed_init_method=f"file://{temp_file}",
+        local_rank=0,
+        backend="nccl",
+    )
+    initialize_model_parallel(tensor_model_parallel_size=1)
+    vllm_config = VllmConfig(model_config=model_config)
+    with set_current_vllm_config(vllm_config=vllm_config):
+        with set_default_torch_dtype(model_config.dtype):
+            model = model_cls(vllm_config=vllm_config)
+        yield model
+    del model
+    cleanup_dist_env_and_memory()
 def get_model_id_to_test(
        model_arch_list: Iterable[str]) -> list[tuple[str, str]]:
    filtered_results = []
@@ -148,12 +174,10 @@ def get_model_id_to_test(
    return filtered_results
-@pytest.mark.core_model
 @pytest.mark.parametrize(
    "model_arch, model_id",
    get_model_id_to_test(_MULTIMODAL_EXAMPLE_MODELS.keys()))
-def test_model_tensor_schema(model_arch: str, model_id: str,
+def test_model_tensor_schema(model_arch: str, model_id: str):
-                             vllm_runner: type[VllmRunner], monkeypatch):
    if model_arch in ARCH_TO_SKIP:
        pytest.skip(f"Skipping {model_arch} due to {ARCH_TO_SKIP[model_arch]}")
    if model_id in REPO_ID_TO_SKIP:
@@ -174,14 +198,20 @@ def test_model_tensor_schema(model_arch: str, model_id: str,
        tokenizer_mode=model_info.tokenizer_mode,
        revision=model_info.revision,
        trust_remote_code=model_info.trust_remote_code,
-        hf_overrides=model_info.hf_overrides,
+        hf_overrides=hf_overrides_fn,
    )
    model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
    factories = MULTIMODAL_REGISTRY._processor_factories[model_cls]
-    if not any(
+    inputs_parse_methods = []
-            hasattr(model_cls, f"_parse_and_validate_{m}_input")
+    for attr_name in dir(model_cls):
-            for m in ["image", "video", "audio"]):
+        attr = getattr(model_cls, attr_name)
+        if hasattr(attr, "__annotations__"):
+            return_type = attr.__annotations__.get("return", None)
+            if return_type is not None and "Input" in str(return_type):
+                inputs_parse_methods.append(attr_name)
+    if not any(inputs_parse_methods):
        pytest.skip(f"{model_arch} does not support tensor schema validation.")
    ctx = InputProcessingContext(
@@ -194,68 +224,13 @@ def test_model_tensor_schema(model_arch: str, model_id: str,
        modality: 3 if limit is None else limit
        for modality, limit in supported_mm_limits.items()
    }
+    model_config.get_multimodal_config().limit_per_prompt = limit_mm_per_prompt
-    # Avoid calling model.forward()
+    processor = factories.build_processor(ctx, cache=None)
-    def _initialize_kv_caches_v0(self) -> None:
-        self.cache_config.num_gpu_blocks = 0
+    with initialize_dummy_model(model_cls, model_config) as model:
-        self.cache_config.num_cpu_blocks = 0
+        for modality, _, mm_kwargs in create_batched_mm_kwargs(
+                model_config, processor):
-    def _initialize_kv_caches_v1(self, vllm_config):
+            for method_name in inputs_parse_methods:
-        kv_cache_specs = self.model_executor.get_kv_cache_specs()
+                print(f"Testing `{method_name}` with modality={modality} "
-        scheduler_kv_cache_config = get_kv_cache_config(
+                      f"and mm_kwargs{list(mm_kwargs.keys())}")
-            vllm_config,
+                getattr(model, method_name)(modality=modality, **mm_kwargs)
-            kv_cache_specs[0],
-            10 * GiB_bytes,
-        )
-        # gpu_blocks (> 0), cpu_blocks, scheduler_kv_cache_config
-        return 1, 0, scheduler_kv_cache_config
-    with (patch.object(V0LLMEngine, "_initialize_kv_caches",
-                       _initialize_kv_caches_v0),
-          patch.object(V1EngineCore, "_initialize_kv_caches",
-                       _initialize_kv_caches_v1), monkeypatch.context() as m):
-        m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
-        if model_info.v0_only:
-            m.setenv("VLLM_USE_V1", "0")
-        # TODO(Isotr0py): Can we avoid initializing engine?
-        with (
-                set_default_torch_num_threads(1),
-                vllm_runner(
-                    model_id,
-                    tokenizer_name=model_info.tokenizer,
-                    tokenizer_mode=model_info.tokenizer_mode,
-                    revision=model_info.revision,
-                    trust_remote_code=model_info.trust_remote_code,
-                    max_model_len=model_info.max_model_len,
-                    load_format="dummy",
-                    hf_overrides=hf_overrides_fn,
-                    limit_mm_per_prompt=limit_mm_per_prompt,
-                    enforce_eager=True,
-                ) as vllm_model,
-        ):
-            model_config = vllm_model.llm.llm_engine.model_config
-            llm_engine = vllm_model.llm.llm_engine
-            if hasattr(llm_engine, "processor"):
-                # v1 processor
-                mm_registry = llm_engine.processor.mm_registry
-            else:
-                # v0 input_preprocessor
-                mm_registry = llm_engine.input_preprocessor.mm_registry
-            processor = mm_registry.create_processor(model_config)
-            def validate_model_input(model, modality: str,
-                                     mm_kwargs: MultiModalKwargs):
-                method_name = f"_parse_and_validate_{modality}_input"
-                if hasattr(model, method_name):
-                    getattr(model, method_name)(**mm_kwargs)
-            for modality, _, mm_kwargs in create_batched_mm_kwargs(
-                    model_config, processor):
-                valid_func = partial(validate_model_input,
-                                     modality=modality,
-                                     mm_kwargs=mm_kwargs)
-                vllm_model.apply_model(valid_func)
--- a/tests/models/quantization/test_fp8.py
+++ b/tests/models/quantization/test_fp8.py
@@ -32,7 +32,7 @@ from ..utils import check_logprobs_close
 # Due to low-precision numerical divergence, we only test logprob of 4 tokens
 @pytest.mark.parametrize("max_tokens", [4])
 @pytest.mark.parametrize("enforce_eager", [True])
-@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"])
+@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS"])
 # NOTE: Increasing this in this suite will fail CI because we currently cannot
 # reset distributed env properly. Use a value > 1 just when you test.
 @pytest.mark.parametrize("tensor_parallel_size", [1])
@@ -57,9 +57,6 @@ def test_models(
    numerical sensitive kernels.
    """
-    if backend == "FLASHINFER" and current_platform.is_rocm():
-        pytest.skip("Flashinfer does not support ROCm/HIP.")
    if kv_cache_dtype == "fp8_e5m2" and current_platform.is_rocm():
        pytest.skip(
            f"{kv_cache_dtype} is currently not supported on ROCm/HIP.")

--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -137,6 +137,9 @@ class _HfExamplesInfo:
 # yapf: disable
 _TEXT_GENERATION_EXAMPLE_MODELS = {
    # [Decoder-only]
+    "ApertusForCausalLM": _HfExamplesInfo("swiss-ai/Apertus-8B",
+                                          min_transformers_version="4.56.0",
+                                          trust_remote_code=True),
    "AquilaModel": _HfExamplesInfo("BAAI/AquilaChat-7B",
                                   trust_remote_code=True),
    "AquilaForCausalLM": _HfExamplesInfo("BAAI/AquilaChat2-7B",
@@ -215,9 +218,6 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    "HunYuanDenseV1ForCausalLM":_HfExamplesInfo("tencent/Hunyuan-7B-Instruct-0124",
                                                trust_remote_code=True,
                                                is_available_online=False),
-    "HCXVisionForCausalLM": _HfExamplesInfo(
-        "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
-        trust_remote_code=True),
    "InternLMForCausalLM": _HfExamplesInfo("internlm/internlm-chat-7b",
                                           trust_remote_code=True),
    "InternLM2ForCausalLM": _HfExamplesInfo("internlm/internlm2-chat-7b",
@@ -233,6 +233,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
                                            "tiny": "ai21labs/Jamba-tiny-dev",
                                            "random": "ai21labs/Jamba-tiny-random",  # noqa: E501
                                        }),
+    "Lfm2ForCausalLM": _HfExamplesInfo("LiquidAI/LFM2-1.2B",
+                                       min_transformers_version="4.54"),
    "LlamaForCausalLM": _HfExamplesInfo("meta-llama/Llama-3.2-1B-Instruct",
                                        extras={"guard": "meta-llama/Llama-Guard-3-1B",  # noqa: E501
                                                "hermes": "NousResearch/Hermes-3-Llama-3.1-8B", # noqa: E501
@@ -293,13 +295,15 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    "Qwen3ForCausalLM": _HfExamplesInfo("Qwen/Qwen3-8B"),
    "Qwen3MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen3-30B-A3B"),
    "RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b"),
+    "SeedOssForCausalLM": _HfExamplesInfo("ByteDance-Seed/Seed-OSS-36B-Instruct", # noqa: E501
+                                          trust_remote_code=True,
+                                          is_available_online=False),
    "SmolLM3ForCausalLM": _HfExamplesInfo("HuggingFaceTB/SmolLM3-3B"),
    "StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b"),  # noqa: E501
    "StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t"),
    "Starcoder2ForCausalLM": _HfExamplesInfo("bigcode/starcoder2-3b"),
    "Step3TextForCausalLM": _HfExamplesInfo("stepfun-ai/step3",
-                                            trust_remote_code=True,
+                                            trust_remote_code=True),
-                                            is_available_online=False),
    "SolarForCausalLM": _HfExamplesInfo("upstage/solar-pro-preview-instruct",
                                        trust_remote_code=True),
    "TeleChat2ForCausalLM": _HfExamplesInfo("Tele-AI/TeleChat2-3B",
@@ -322,8 +326,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
 _EMBEDDING_EXAMPLE_MODELS = {
    # [Text-only]
-    "BertModel": _HfExamplesInfo("BAAI/bge-base-en-v1.5", v0_only=True),
+    "BertModel": _HfExamplesInfo("BAAI/bge-base-en-v1.5"),
-    "Gemma2Model": _HfExamplesInfo("BAAI/bge-multilingual-gemma2", v0_only=True),  # noqa: E501
+    "Gemma2Model": _HfExamplesInfo("BAAI/bge-multilingual-gemma2"),  # noqa: E501
    "GritLM": _HfExamplesInfo("parasail-ai/GritLM-7B-vllm"),
    "GteModel": _HfExamplesInfo("Snowflake/snowflake-arctic-embed-m-v2.0",
                                               trust_remote_code=True),
@@ -336,9 +340,9 @@ _EMBEDDING_EXAMPLE_MODELS = {
    "LlamaModel": _HfExamplesInfo("llama", is_available_online=False),
    "MistralModel": _HfExamplesInfo("intfloat/e5-mistral-7b-instruct"),
    "ModernBertModel": _HfExamplesInfo("Alibaba-NLP/gte-modernbert-base",
-                                trust_remote_code=True, v0_only=True),
+                                trust_remote_code=True),
    "NomicBertModel": _HfExamplesInfo("nomic-ai/nomic-embed-text-v2-moe",
-                                               trust_remote_code=True, v0_only=True),  # noqa: E501
+                                               trust_remote_code=True),  # noqa: E501
    "Qwen2Model": _HfExamplesInfo("ssmits/Qwen2-7B-Instruct-embed-base"),
    "Qwen2ForRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-RM-72B",
                                           max_transformers_version="4.53",
@@ -346,9 +350,9 @@ _EMBEDDING_EXAMPLE_MODELS = {
    "Qwen2ForProcessRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-PRM-7B",
                                                  max_transformers_version="4.53",
                                                  transformers_version_reason="HF model uses remote code that is not compatible with latest Transformers"),  # noqa: E501
-    "RobertaModel": _HfExamplesInfo("sentence-transformers/stsb-roberta-base-v2", v0_only=True),  # noqa: E501
+    "RobertaModel": _HfExamplesInfo("sentence-transformers/stsb-roberta-base-v2"),  # noqa: E501
-    "RobertaForMaskedLM": _HfExamplesInfo("sentence-transformers/all-roberta-large-v1", v0_only=True),  # noqa: E501
+    "RobertaForMaskedLM": _HfExamplesInfo("sentence-transformers/all-roberta-large-v1"),  # noqa: E501
-    "XLMRobertaModel": _HfExamplesInfo("intfloat/multilingual-e5-small", v0_only=True),  # noqa: E501
+    "XLMRobertaModel": _HfExamplesInfo("intfloat/multilingual-e5-small"),  # noqa: E501
    # [Multimodal]
    "LlavaNextForConditionalGeneration": _HfExamplesInfo("royokong/e5-v"),
    "Phi3VForCausalLM": _HfExamplesInfo("TIGER-Lab/VLM2Vec-Full",
@@ -363,16 +367,19 @@ _SEQUENCE_CLASSIFICATION_EXAMPLE_MODELS = {
    "GPT2ForSequenceClassification": _HfExamplesInfo("nie3e/sentiment-polish-gpt2-small"),  # noqa: E501
    # [Cross-encoder]
-    "BertForSequenceClassification": _HfExamplesInfo("cross-encoder/ms-marco-MiniLM-L-6-v2", v0_only=True),  # noqa: E501
+    "BertForSequenceClassification": _HfExamplesInfo("cross-encoder/ms-marco-MiniLM-L-6-v2"),  # noqa: E501
-    "ModernBertForSequenceClassification": _HfExamplesInfo("Alibaba-NLP/gte-reranker-modernbert-base", v0_only=True), # noqa: E501
+    "GteNewForSequenceClassification": _HfExamplesInfo("Alibaba-NLP/gte-multilingual-reranker-base",  # noqa: E501
-    "RobertaForSequenceClassification": _HfExamplesInfo("cross-encoder/quora-roberta-base", v0_only=True),  # noqa: E501
+                                                       trust_remote_code=True,
-    "XLMRobertaForSequenceClassification": _HfExamplesInfo("BAAI/bge-reranker-v2-m3", v0_only=True),  # noqa: E501
+                                                       hf_overrides={
+                                                           "architectures": ["GteNewForSequenceClassification"]}),# noqa: E501
+    "ModernBertForSequenceClassification": _HfExamplesInfo("Alibaba-NLP/gte-reranker-modernbert-base"), # noqa: E501
+    "RobertaForSequenceClassification": _HfExamplesInfo("cross-encoder/quora-roberta-base"),  # noqa: E501
+    "XLMRobertaForSequenceClassification": _HfExamplesInfo("BAAI/bge-reranker-v2-m3"),  # noqa: E501
 }
 _AUTOMATIC_CONVERTED_MODELS = {
    # Use as_seq_cls_model for automatic conversion
    "GemmaForSequenceClassification": _HfExamplesInfo("BAAI/bge-reranker-v2-gemma",  # noqa: E501
-                                                      v0_only=True,
                                                      hf_overrides={"architectures": ["GemmaForSequenceClassification"], # noqa: E501
                                                                    "classifier_from_token": ["Yes"],  # noqa: E501
                                                                    "method": "no_post_processing"}),  # noqa: E501
@@ -395,6 +402,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                                transformers_version_reason="HF model is not compatible.",  # noqa: E501
                                                hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}),  # noqa: E501
    "Emu3ForConditionalGeneration": _HfExamplesInfo("BAAI/Emu3-Chat-hf"),
+    "Ernie4_5_VLMoeForConditionalGeneration": _HfExamplesInfo("baidu/ERNIE-4.5-VL-28B-A3B-PT",  # noqa: E501
+                                                              trust_remote_code=True),
    "FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"),
    "Gemma3ForConditionalGeneration": _HfExamplesInfo("google/gemma-3-4b-it"),
    "Gemma3nForConditionalGeneration": _HfExamplesInfo("google/gemma-3n-E2B-it",    # noqa: E501
@@ -405,22 +414,28 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                        hf_overrides={"architectures": ["GLM4VForCausalLM"]}),  # noqa: E501
    "Glm4vForConditionalGeneration": _HfExamplesInfo("zai-org/GLM-4.1V-9B-Thinking"),  # noqa: E501
    "Glm4vMoeForConditionalGeneration": _HfExamplesInfo("zai-org/GLM-4.5V",
-                                          is_available_online=False),   # noqa: E501
+                                                        min_transformers_version="4.56"),  # noqa: E501
    "H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m",
                                      trust_remote_code=True,
                                      extras={"2b": "h2oai/h2ovl-mississippi-2b"},  # noqa: E501
                                      max_transformers_version="4.48",  # noqa: E501
                                      transformers_version_reason="HF model is not compatible."),  # noqa: E501
+    "HCXVisionForCausalLM": _HfExamplesInfo("naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",  # noqa: E501
+                                            trust_remote_code=True),
    "Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3",  # noqa: E501
                                                        {"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"},    # noqa: E501
-                                                        min_transformers_version="4.55.1",
+                                                        min_transformers_version="4.56",
-                                                        transformers_version_reason="HF model broken in 4.55.0"),  # noqa: E501
+                                                        transformers_version_reason="HF model broken in 4.55"),  # noqa: E501
+    "InternS1ForConditionalGeneration": _HfExamplesInfo("internlm/Intern-S1",
+                                                        trust_remote_code=True),  # noqa: E501
    "InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B",
                                         extras={"2B": "OpenGVLab/InternVL2-2B",
-                                                 "3.0": "OpenGVLab/InternVL3-1B"},  # noqa: E501
+                                                 "3.0": "OpenGVLab/InternVL3-1B",   # noqa: E501
-                                         trust_remote_code=True),
+                                                 "3.5-qwen3": "OpenGVLab/InternVL3_5-1B",   # noqa: E501
-    "InternS1ForConditionalGeneration": _HfExamplesInfo("internlm/Intern-S1",
+                                                 "3.5-qwen3moe": "OpenGVLab/InternVL3_5-30B-A3B",   # noqa: E501
+                                                 "3.5-gptoss": "OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview"},  # noqa: E501
                                         trust_remote_code=True),
+    "InternVLForConditionalGeneration": _HfExamplesInfo("OpenGVLab/InternVL3-1B-hf"),    # noqa: E501
    "KeyeForConditionalGeneration": _HfExamplesInfo("Kwai-Keye/Keye-VL-8B-Preview", # noqa: E501
                                                    trust_remote_code=True),
    "KimiVLForConditionalGeneration": _HfExamplesInfo("moonshotai/Kimi-VL-A3B-Instruct",  # noqa: E501
@@ -443,7 +458,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
    "MiniCPMO": _HfExamplesInfo("openbmb/MiniCPM-o-2_6",
                                trust_remote_code=True),
    "MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-Llama3-V-2_5",
-                                extras={"2.6": "openbmb/MiniCPM-V-2_6", "4.0": "openbmb/MiniCPM-V-4"},  # noqa: E501
+                                extras={"2.6": "openbmb/MiniCPM-V-2_6", "4.0": "openbmb/MiniCPM-V-4", "4.5": "openbmb/MiniCPM-V-4_5"},  # noqa: E501
                                trust_remote_code=True),
    "MiniMaxVL01ForConditionalGeneration": _HfExamplesInfo("MiniMaxAI/MiniMax-VL-01", # noqa: E501
                                              trust_remote_code=True,
@@ -464,6 +479,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                            transformers_version_reason="HF model is not compatible",  # noqa: E501
                            extras={"1.6-llama": "AIDC-AI/Ovis1.6-Llama3.2-3B",
                                    "1.6-gemma": "AIDC-AI/Ovis1.6-Gemma2-9B"}),  # noqa: E501
+    "Ovis2_5": _HfExamplesInfo("AIDC-AI/Ovis2.5-2B",
+                               trust_remote_code=True),
    "PaliGemmaForConditionalGeneration": _HfExamplesInfo("google/paligemma-3b-mix-224",  # noqa: E501
                                                         extras={"v2": "google/paligemma2-3b-ft-docci-448"}),  # noqa: E501
    "Phi3VForCausalLM": _HfExamplesInfo("microsoft/Phi-3-vision-128k-instruct",
@@ -487,14 +504,15 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                                          max_model_len=4096),
    "Qwen2_5OmniModel": _HfExamplesInfo("Qwen/Qwen2.5-Omni-3B"),
    "Qwen2_5OmniForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-Omni-7B-AWQ"),  # noqa: E501
+    "RForConditionalGeneration": _HfExamplesInfo("YannQi/R-4B",
+                                                 trust_remote_code=True),
    "SkyworkR1VChatModel": _HfExamplesInfo("Skywork/Skywork-R1V-38B",
                                           trust_remote_code=True),
    "SmolVLMForConditionalGeneration": _HfExamplesInfo("HuggingFaceTB/SmolVLM2-2.2B-Instruct",  # noqa: E501
-                                                       min_transformers_version="4.55.1",
+                                                       min_transformers_version="4.56",
-                                                       transformers_version_reason="HF model broken in 4.55.0"),  # noqa: E501
+                                                       transformers_version_reason="HF model broken in 4.55"),  # noqa: E501
    "Step3VLForConditionalGeneration": _HfExamplesInfo("stepfun-ai/step3",
-                                                        trust_remote_code=True,
+                                                        trust_remote_code=True),
-                                                        is_available_online=False),
    "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b",  # noqa: E501
                                     trust_remote_code=True),
    "TarsierForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier-7b"),  # noqa: E501
@@ -507,6 +525,9 @@ _MULTIMODAL_EXAMPLE_MODELS = {
        is_available_online=False,
    ),
    # [Encoder-decoder]
+    "DonutForConditionalGeneration": _HfExamplesInfo("naver-clova-ix/donut-base-finetuned-docvqa",  # noqa: E501
+                                                    hf_overrides={"architectures": ["DonutForConditionalGeneration"], "model_type": "donut"},  # noqa: E501
+                                                    extras={"dolphin": "ByteDance/Dolphin"}),  # noqa: E501
    # Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
    # Therefore, we borrow the BartTokenizer from the original Bart model
    "Florence2ForConditionalGeneration": _HfExamplesInfo("microsoft/Florence-2-base",  # noqa: E501
@@ -530,6 +551,9 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
    "DeepSeekMTPModel": _HfExamplesInfo("luccafong/deepseek_mtp_main_random",
                                        speculative_model="luccafong/deepseek_mtp_draft_random",  # noqa: E501
                                        trust_remote_code=True),
+    "EagleDeepSeekMTPModel": _HfExamplesInfo("eagle618/deepseek-v3-random",
+                                        speculative_model="eagle618/eagle-deepseek-v3-random",  # noqa: E501
+                                        trust_remote_code=True),
    "EagleLlamaForCausalLM": _HfExamplesInfo("yuhuili/EAGLE-LLaMA3-Instruct-8B",
                                             trust_remote_code=True,
                                             speculative_model="yuhuili/EAGLE-LLaMA3-Instruct-8B",
@@ -553,6 +577,9 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
                                            is_available_online=False,
                                            speculative_model="openbmb/MiniCPM-2B-sft-bf16",
                                            tokenizer="openbmb/MiniCPM-2B-sft-bf16"),
+    "ErnieMTPModel": _HfExamplesInfo("baidu/ERNIE-4.5-21B-A3B-PT",
+                                    trust_remote_code=True,
+                                    speculative_model="baidu/ERNIE-4.5-21B-A3B-PT"),
    "Glm4MoeMTPModel": _HfExamplesInfo("zai-org/GLM-4.5",
                                        speculative_model="zai-org/GLM-4.5",
                                        min_transformers_version="4.54",
@@ -565,7 +592,7 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
 _TRANSFORMERS_BACKEND_MODELS = {
    "TransformersModel": _HfExamplesInfo("Qwen/Qwen3-Embedding-0.6B"),
    "TransformersForCausalLM": _HfExamplesInfo("hmellor/Ilama-3.2-1B", trust_remote_code=True),  # noqa: E501
-    "TransformersForMultimodalLM": _HfExamplesInfo("OpenGVLab/InternVL3-1B-hf"),
+    "TransformersForMultimodalLM": _HfExamplesInfo("BAAI/Emu3-Chat-hf"),
 }
 _EXAMPLE_MODELS = {

--- a/tests/models/test_initialization.py
+++ b/tests/models/test_initialization.py
@@ -38,11 +38,6 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
                              model_arch=model_arch,
                              exist_overrides=model_info.hf_overrides)
-    if model_arch in ("Llama4ForCausalLM", "EagleLlama4ForCausalLM"):
-        from vllm.model_executor.models.llama4 import Llama4ForCausalLM
-        from vllm.model_executor.models.registry import ModelRegistry
-        ModelRegistry.register_model("Llama4ForCausalLM", Llama4ForCausalLM)
    # Avoid calling model.forward()
    def _initialize_kv_caches_v0(self) -> None:
        self.cache_config.num_gpu_blocks = 0
@@ -95,6 +90,8 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
 @pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs())
 def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
+    if model_arch == "Lfm2ForCausalLM":
+        pytest.skip("Skipping until test supports V1-only models")
    can_initialize(model_arch, monkeypatch, HF_EXAMPLE_MODELS)

--- a/tests/models/test_registry.py
+++ b/tests/models/test_registry.py
@@ -24,6 +24,9 @@ from .registry import HF_EXAMPLE_MODELS
 @pytest.mark.parametrize("model_arch", ModelRegistry.get_supported_archs())
 def test_registry_imports(model_arch):
+    # Skip if transformers version is incompatible
+    model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
+    model_info.check_transformers_version(on_fail="skip")
    # Ensure all model classes can be imported successfully
    model_cls = ModelRegistry._try_load_model_cls(model_arch)
    assert model_cls is not None

--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -3,7 +3,8 @@
 import warnings
 from collections.abc import Sequence
-from typing import Any, NamedTuple, Optional, Union
+from dataclasses import dataclass
+from typing import Any, Optional, Union
 import torch
 import torch.nn.functional as F
@@ -339,36 +340,43 @@ def softmax(data):
        return F.softmax(data, dim=-1)
-class EmbedModelInfo(NamedTuple):
+@dataclass
+class ModelInfo:
    name: str
-    is_matryoshka: bool = False
-    matryoshka_dimensions: Optional[list[int]] = None
    architecture: str = ""
    dtype: str = "auto"
+    hf_overrides: Optional[dict[str, Any]] = None
    default_pooling_type: str = ""
    enable_test: bool = True
+@dataclass
+class EmbedModelInfo(ModelInfo):
+    is_matryoshka: bool = False
+    matryoshka_dimensions: Optional[list[int]] = None
+@dataclass
 class CLSPoolingEmbedModelInfo(EmbedModelInfo):
    default_pooling_type: str = "CLS"
+@dataclass
 class LASTPoolingEmbedModelInfo(EmbedModelInfo):
    default_pooling_type: str = "LAST"
-class RerankModelInfo(NamedTuple):
+@dataclass
-    name: str
+class RerankModelInfo(ModelInfo):
-    architecture: str = ""
+    pass
-    dtype: str = "auto"
-    default_pooling_type: str = ""
-    enable_test: bool = True
+@dataclass
 class CLSPoolingRerankModelInfo(RerankModelInfo):
    default_pooling_type: str = "CLS"
+@dataclass
 class LASTPoolingRerankModelInfo(RerankModelInfo):
    default_pooling_type: str = "LAST"

--- a/tests/multimodal/test_cache.py
+++ b/tests/multimodal/test_cache.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Optional
+import numpy as np
 import pytest
 import torch
-from vllm.multimodal.cache import MultiModalCache, MultiModalCacheItemMetadata
+from vllm.config import ModelConfig, ParallelConfig, VllmConfig
-from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargs,
+from vllm.multimodal.cache import (MultiModalCache,
-                                    MultiModalKwargsItem,
+                                   MultiModalProcessorCacheItem,
+                                   MultiModalProcessorCacheItemMetadata,
+                                   processor_cache_from_config,
+                                   receiver_cache_from_config)
+from vllm.multimodal.hasher import MultiModalHasher
+from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargsItem,
+                                    MultiModalKwargsItems,
                                    MultiModalSharedField)
+from vllm.multimodal.processing import PromptInsertion
+from vllm.multimodal.registry import MultiModalRegistry
+def _dummy_elem(
+    modality: str,
+    key: str,
+    size: int,
+    *,
+    rng: Optional[np.random.RandomState] = None,
+):
+    if rng is None:
+        data = torch.empty((size, ), dtype=torch.int8)
+    else:
+        data = torch.from_numpy(rng.randint(4, size=(size, ), dtype=np.int8))
-def _dummy_elem(modality: str, key: str, size: int):
    return MultiModalFieldElem(
        modality=modality,
        key=key,
-        data=torch.empty((size, ), dtype=torch.int8),
+        data=data,
        field=MultiModalSharedField(1),
    )
-def _dummy_item(modality: str, size_by_key: dict[str, int]):
+def _dummy_item(
+    modality: str,
+    size_by_key: dict[str, int],
+    *,
+    rng: Optional[np.random.RandomState] = None,
+):
    return MultiModalKwargsItem.from_elems([
-        _dummy_elem(modality, key, size) for key, size in size_by_key.items()
+        _dummy_elem(modality, key, size, rng=rng)
+        for key, size in size_by_key.items()
    ])
-def _dummy_kw(size_by_key_modality: dict[str, dict[str, int]]):
+def _dummy_items(
-    return MultiModalKwargs([
+    size_by_key_modality: dict[str, dict[str, int]],
-        _dummy_item(modality, size_by_key)
+    *,
+    rng: Optional[np.random.RandomState] = None,
+):
+    return MultiModalKwargsItems.from_seq([
+        _dummy_item(modality, size_by_key, rng=rng)
        for modality, size_by_key in size_by_key_modality.items()
    ])
@@ -37,7 +69,8 @@ def _dummy_kw(size_by_key_modality: dict[str, dict[str, int]]):
    [
        (_dummy_item("a", {"a1": 100}), 100),
        (_dummy_item("a", {"a1": 100, "a2": 110}), 210),
-        (_dummy_kw({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}), 460),  # noqa: E501
+        (_dummy_items({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}), 460),  # noqa: E501
+        (_dummy_items({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}).get_data(), 460),  # noqa: E501
    ],
 )
 # yapf: enable
@@ -47,5 +80,139 @@ def test_cache_item_size(item, expected_size):
    cache[""] = item
    assert cache.currsize == expected_size
-    cache[""] = MultiModalCacheItemMetadata.wraps(item)
+    prompt_update = PromptInsertion("dummy", "target", "insertion") \
+        .resolve(0)
+    cache[""] = MultiModalProcessorCacheItem(item, [prompt_update])
+    assert cache.currsize == expected_size
+    cache[""] = MultiModalProcessorCacheItemMetadata(item, [prompt_update])
    assert cache.currsize == expected_size
+def _create_vllm_config(
+    *,
+    mm_processor_cache_gb: float,
+    enable_ipc: bool,
+):
+    return VllmConfig(
+        model_config=ModelConfig(mm_processor_cache_gb=mm_processor_cache_gb),
+        parallel_config=ParallelConfig(
+            data_parallel_size=1 if enable_ipc else 2),
+    )
+def _compare_caches(
+    config_0: VllmConfig,
+    config_1: VllmConfig,
+    *,
+    item_capacity: int = 8,
+    hit_rate: float = 0.5,
+    max_items_per_iter: int = 3,
+    is_cached_calls_per_iter: int,
+    n_iter: int = 100,
+    seed: int = 0,
+):
+    mm_registry = MultiModalRegistry()
+    cache_0_p0 = processor_cache_from_config(config_0, mm_registry)
+    cache_0_p1 = receiver_cache_from_config(config_0, mm_registry)
+    cache_1_p0 = processor_cache_from_config(config_1, mm_registry)
+    cache_1_p1 = receiver_cache_from_config(config_1, mm_registry)
+    cache_size_gb = max(
+        config_0.model_config.mm_processor_cache_gb,
+        config_1.model_config.mm_processor_cache_gb,
+    )
+    item_size_gb = int(cache_size_gb / item_capacity)
+    rng = np.random.RandomState(seed)
+    all_items = [
+        _dummy_item("item", {"key": item_size_gb}, rng=rng)
+        for _ in range(int(item_capacity / hit_rate))
+    ]
+    all_hashes = [
+        MultiModalHasher.hash_kwargs(item=item.get_data())
+        for item in all_items
+    ]
+    # Should not be used since there is nothing to convert to text
+    prompt_update = PromptInsertion("dummy", "target", "insertion")
+    for it in range(n_iter):
+        num_items_to_select = rng.randint(0, max_items_per_iter)
+        item_idxs_to_select = rng.choice(len(all_items), num_items_to_select)
+        selected_items = [all_items[idx] for idx in item_idxs_to_select]
+        selected_hashes = [all_hashes[idx] for idx in item_idxs_to_select]
+        if cache_0_p0 is None:
+            cache_0_p0_out = selected_items
+        else:
+            for _ in range(is_cached_calls_per_iter):
+                cache_0_p0.is_cached(selected_hashes)
+            cache_0_p0_out = [
+                item for item, _ in cache_0_p0.get_and_update(
+                    [(item, prompt_update.content) for item in selected_items],
+                    selected_hashes,
+                )
+            ]
+        if cache_1_p0 is None:
+            cache_1_p0_out = selected_items
+        else:
+            for _ in range(is_cached_calls_per_iter):
+                cache_1_p0.is_cached(selected_hashes)
+            cache_1_p0_out = [
+                item for item, _ in cache_1_p0.get_and_update(
+                    [(item, prompt_update.content) for item in selected_items],
+                    selected_hashes,
+                )
+            ]
+        if cache_0_p1 is None:
+            cache_0_p1_out = cache_0_p0_out
+        else:
+            cache_0_p1_out = cache_0_p1.get_and_update(cache_0_p0_out,
+                                                       selected_hashes)
+        if cache_1_p1 is None:
+            cache_1_p1_out = cache_1_p0_out
+        else:
+            cache_1_p1_out = cache_1_p1.get_and_update(cache_1_p0_out,
+                                                       selected_hashes)
+        assert cache_0_p1_out == cache_1_p1_out, f"Failed at {it=}"
+@pytest.mark.parametrize("is_cached_calls_per_iter", [1, 2, 3])
+def test_ipc_enable_disable_consistency(is_cached_calls_per_iter):
+    cache_size_gb = 1 / (1 << 20)
+    vllm_config_ipc_enabled = _create_vllm_config(
+        mm_processor_cache_gb=cache_size_gb,
+        enable_ipc=True,
+    )
+    vllm_config_ipc_disabled = _create_vllm_config(
+        mm_processor_cache_gb=0,
+        enable_ipc=False,
+    )
+    vllm_config_cache_disabled = _create_vllm_config(
+        mm_processor_cache_gb=cache_size_gb,
+        enable_ipc=True,
+    )
+    _compare_caches(
+        vllm_config_ipc_enabled,
+        vllm_config_ipc_disabled,
+        is_cached_calls_per_iter=is_cached_calls_per_iter,
+    )
+    _compare_caches(
+        vllm_config_ipc_disabled,
+        vllm_config_cache_disabled,
+        is_cached_calls_per_iter=is_cached_calls_per_iter,
+    )
+    _compare_caches(
+        vllm_config_cache_disabled,
+        vllm_config_ipc_enabled,
+        is_cached_calls_per_iter=is_cached_calls_per_iter,
+    )
--- a/tests/multimodal/test_hasher.py
+++ b/tests/multimodal/test_hasher.py
@@ -45,10 +45,11 @@ def test_hash_collision_image_transpose():
    assert hasher.hash_kwargs(image=image1) != hasher.hash_kwargs(image=image2)
-def test_hash_collision_tensor_shape():
+@pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16])
+def test_hash_collision_tensor_shape(dtype):
    # The hash should be different though the data is the same when flattened
-    arr1 = torch.zeros((5, 10, 20, 3))
+    arr1 = torch.zeros((5, 10, 20, 3), dtype=dtype)
-    arr2 = torch.zeros((10, 20, 5, 3))
+    arr2 = torch.zeros((10, 20, 5, 3), dtype=dtype)
    hasher = MultiModalHasher
    assert hasher.hash_kwargs(data=arr1) != hasher.hash_kwargs(data=arr2)

--- a/tests/multimodal/test_processing.py
+++ b/tests/multimodal/test_processing.py
@@ -17,13 +17,11 @@ from vllm.multimodal.processing import (PlaceholderFeaturesInfo,
                                        PromptReplacement, apply_text_matches,
                                        apply_token_matches,
                                        find_mm_placeholders,
-                                        find_text_matches, find_token_matches,
                                        iter_token_matches,
                                        replace_token_matches)
 # yapf: enable
 from vllm.multimodal.profiling import MultiModalProfiler
 from vllm.transformers_utils.tokenizer import AnyTokenizer
-from vllm.utils import full_groupby
 from .utils import random_image
@@ -75,12 +73,15 @@ from .utils import random_image
        ),
    ],
 )
+@pytest.mark.parametrize("start_idx", [0, 4, 8])
 # yapf: enable
-def test_iter_token_matches(token_ids, match_ids, expected):
+def test_iter_token_matches(token_ids, match_ids, expected, start_idx):
-    result = list(iter_token_matches(token_ids, match_ids))
+    result = list(iter_token_matches(token_ids, match_ids,
+                                     start_idx=start_idx))
    # Manually constructed results
-    assert [item._asdict() for item in result] == expected
+    assert [item._asdict() for item in result
+            ] == [item for item in expected if item["start_idx"] >= start_idx]
    # Invariants
    match_lens = [end - start for start, end in result]
@@ -241,21 +242,23 @@ def test_find_token_matches(
    # Should not be used since there is nothing to convert to token IDs
    mock_tokenizer = cast(AnyTokenizer, object())
-    prompt_updates = [
+    prompt_updates = {
-        update_type(key, target, []).bind(mock_tokenizer)
+        key: update_type(key, target, []).resolve(0)
        for key, target in target_by_key.items()
-    ]
+    }
-    result = find_token_matches(prompt, prompt_updates)
+    result = {
+        key: list(update.iter_token_matches(prompt, mock_tokenizer))
+        for key, update in prompt_updates.items()
+    }
    # Only displayed on error
    print("result:", result)
    # Manually constructed results
-    result_groups = dict(full_groupby(result, key=lambda x: x.modality))
    assert {
        key: [
            dict(start_idx=item.start_idx, end_idx=item.end_idx)
-            for item in result_groups.get(key, [])
+            for item in result.get(key, [])
        ]
        for key in expected_by_key
    } == expected_by_key
@@ -388,21 +391,23 @@ def test_find_text_matches(
    # Should not be used since there is nothing to convert to text
    mock_tokenizer = cast(AnyTokenizer, object())
-    prompt_updates = [
+    prompt_updates = {
-        update_type(key, target, []).bind(mock_tokenizer)
+        key: update_type(key, target, []).resolve(0)
        for key, target in target_by_key.items()
-    ]
+    }
-    result = find_text_matches(prompt, prompt_updates)
+    result = {
+        key: list(update.iter_text_matches(prompt, mock_tokenizer))
+        for key, update in prompt_updates.items()
+    }
    # Only displayed on error
    print("result:", result)
    # Manually constructed results
-    result_groups = dict(full_groupby(result, key=lambda x: x.modality))
    assert {
        key: [
            dict(start_idx=item.start_idx, end_idx=item.end_idx)
-            for item in result_groups.get(key, [])
+            for item in result.get(key, [])
        ]
        for key in expected_by_key
    } == expected_by_key
@@ -552,39 +557,35 @@ def test_find_update_text(
            update_type,
            expected_by_mm_count,
    ) in expected_by_update_type_mm_count.items():
-        mm_prompt_updates = {
-            key:
-            [update_type(key, target, repl_by_key[key]).bind(mock_tokenizer)]
-            for key, target in target_by_key.items()
-        }
-        mm_matches = {
-            key: find_text_matches(prompt, updates)
-            for key, updates in mm_prompt_updates.items()
-        }
        for mm_count, expected in expected_by_mm_count.items():
-            result = apply_text_matches(
+            mm_prompt_updates = {
+                key: [[update_type(key, target, repl_by_key[key]).resolve(i)]
+                      for i in range(mm_count)]
+                for key, target in target_by_key.items()
+            }
+            new_prompt, result = apply_text_matches(
                prompt,
-                mm_matches,
+                mm_prompt_updates,
-                {key: mm_count
+                mock_tokenizer,
-                 for key in repl_by_key},
            )
            # Only displayed on error
            print("update_type:", update_type)
            print("mm_count:", mm_count)
-            print("mm_matches:", mm_matches)
+            print("mm_prompt_updates:", mm_prompt_updates)
+            print("new_prompt:", new_prompt)
            print("result:", result)
            # Manually constructed results
-            assert result == expected
+            assert new_prompt == expected
 # yapf: disable
 @pytest.mark.parametrize(
    ("prompt", "target_by_key", "repl_by_key", "expected_by_update_type_mm_count"),  # noqa: E501
    [
-        # Tokenized test cases of `test_find_replace_text`
+        # Tokenized test cases of `test_find_update_text`
        # using the vocab of llava-hf/llava-v1.6-mistral-7b-hf
        (
            [1, 9833, 28747, 32000, 9833, 28747, 32000, 32000, 918],
@@ -726,32 +727,28 @@ def test_find_update_tokens(
            update_type,
            expected_by_mm_count,
    ) in expected_by_update_type_mm_count.items():
-        mm_prompt_updates = {
-            key:
-            [update_type(key, target, repl_by_key[key]).bind(mock_tokenizer)]
-            for key, target in target_by_key.items()
-        }
-        mm_matches = {
-            key: find_token_matches(prompt, updates)
-            for key, updates in mm_prompt_updates.items()
-        }
        for mm_count, expected in expected_by_mm_count.items():
-            result = apply_token_matches(
+            mm_prompt_updates = {
+                key: [[update_type(key, target, repl_by_key[key]).resolve(i)]
+                      for i in range(mm_count)]
+                for key, target in target_by_key.items()
+            }
+            new_prompt, result = apply_token_matches(
                prompt,
-                mm_matches,
+                mm_prompt_updates,
-                {key: mm_count
+                mock_tokenizer,
-                 for key in repl_by_key},
            )
            # Only displayed on error
            print("update_type:", update_type)
            print("mm_count:", mm_count)
-            print("mm_matches:", mm_matches)
+            print("mm_prompt_updates:", mm_prompt_updates)
+            print("new_prompt:", new_prompt)
            print("result:", result)
            # Manually constructed results
-            assert result == expected
+            assert new_prompt == expected
 # yapf: disable
@@ -878,17 +875,11 @@ def test_find_mm_placeholders(
    mock_tokenizer = cast(AnyTokenizer, object())
    mm_prompt_updates = {
-        key: [update_type(key, [], repl).bind(mock_tokenizer)]
+        key: [[update_type(key, [], repl).resolve(i)] for i in range(3)]
        for key, repl in repl_by_key.items()
    }
-    result = find_mm_placeholders(
+    result = find_mm_placeholders(prompt, mm_prompt_updates, mock_tokenizer)
-        mm_prompt_updates,
-        prompt,
-        # Effectively match all occurrences in the prompt
-        {key: 3
-         for key in repl_by_key},
-    )
    # Only displayed on error
    print("result:", result)