Merge tag 'v0.9.0' into v0.9.0-ori

7a985548 · zhuwenwen · 45d3785c · dc1440cf · 7a985548 · 7a985548
Commit 7a985548 authored May 22, 2025 by zhuwenwen
20 changed files
--- a/tests/models/decoder_only/vision_language/vlm_utils/builders.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/builders.py
@@ -7,18 +7,21 @@ from typing import Callable, Optional, Union

 import torch

+from vllm.multimodal.audio import AudioResampler
 from vllm.multimodal.image import rescale_image_size
 from vllm.multimodal.video import (rescale_video_size, resize_video,
                                   sample_frames_from_video)

-from .....conftest import _ImageAssets, _VideoAssets
-from .types import (SINGLE_IMAGE_BASE_PROMPTS, TEST_IMG_PLACEHOLDER,
+from .....conftest import AudioTestAssets, ImageTestAssets, VideoTestAssets
+from .types import (SINGLE_AUDIO_BASE_PROMPT, SINGLE_IMAGE_BASE_PROMPTS,
+                    TEST_AUDIO_PLACEHOLDER, TEST_IMG_PLACEHOLDER,
                    TEST_VIDEO_PLACEHOLDER, VIDEO_BASE_PROMPT,
-                    ImageSizeWrapper, SizeType, VLMTestInfo)
+                    ImageSizeWrapper, PromptWithMultiModalInput, SizeType,
+                    VLMTestInfo)


-def replace_test_placeholder(prompt: str, img_idx_to_prompt: Callable[[int],
-                                                                      str],
+def replace_test_placeholder(prompt: str, mm_idx_to_prompt: Callable[[int],
+                                                                     str],
                             test_placeholder: str) -> str:
    """Given a prompt, replaces each test placeholder with the
    model-specific tag.
@@ -26,7 +29,7 @@ def replace_test_placeholder(prompt: str, img_idx_to_prompt: Callable[[int],
    prompt_segments = prompt.split(test_placeholder)
    img_prompt = prompt_segments[0]
    for placeholder_idx, next_seg in enumerate(prompt_segments[1:], start=1):
-        img_prompt += img_idx_to_prompt(placeholder_idx)
+        img_prompt += mm_idx_to_prompt(placeholder_idx)
        img_prompt += next_seg
    return img_prompt

@@ -34,6 +37,7 @@ def replace_test_placeholder(prompt: str, img_idx_to_prompt: Callable[[int],
 def get_model_prompts(base_prompts: Iterable[str],
                      img_idx_to_prompt: Optional[Callable[[int], str]],
                      video_idx_to_prompt: Optional[Callable[[int], str]],
+                      audio_idx_to_prompt: Optional[Callable[[int], str]],
                      prompt_formatter: Callable[[str], str]) -> list[str]:
    """Given a model-agnostic base prompt and test configuration for a model(s)
    to be tested, update the media placeholders and apply the prompt formatting
@@ -60,6 +64,11 @@ def get_model_prompts(base_prompts: Iterable[str],
                                                   video_idx_to_prompt,
                                                   TEST_VIDEO_PLACEHOLDER)

+        if audio_idx_to_prompt:
+            base_prompt = replace_test_placeholder(base_prompt,
+                                                   audio_idx_to_prompt,
+                                                   TEST_AUDIO_PLACEHOLDER)
+
        # Apply the prompt formatter to wrap the base prompt with
        # the correct media placeholders to get the model test prompt
        model_prompt = prompt_formatter(base_prompt)
@@ -68,10 +77,11 @@ def get_model_prompts(base_prompts: Iterable[str],


 def build_single_image_inputs_from_test_info(
-        test_info: VLMTestInfo,
-        image_assets: _ImageAssets,
-        size_wrapper: ImageSizeWrapper,
-        tmp_path: Optional[PosixPath] = None):
+    test_info: VLMTestInfo,
+    image_assets: ImageTestAssets,
+    size_wrapper: ImageSizeWrapper,
+    tmp_path: Optional[PosixPath] = None,
+) -> list[PromptWithMultiModalInput]:
    if test_info.prompt_formatter is None:
        raise ValueError(
            "Prompt formatter must be set to build single image inputs")
@@ -79,6 +89,7 @@ def build_single_image_inputs_from_test_info(
    model_prompts = get_model_prompts(test_info.single_image_prompts,
                                      test_info.img_idx_to_prompt,
                                      test_info.video_idx_to_prompt,
+                                      test_info.audio_idx_to_prompt,
                                      test_info.prompt_formatter)

    # For models that require a local path / URL encoded in the image; export
@@ -97,28 +108,32 @@ def build_single_image_inputs_from_test_info(
    return build_single_image_inputs(images, model_prompts, size_wrapper)


-def build_single_image_inputs(images, model_prompts,
-                              size_wrapper: ImageSizeWrapper):
+def build_single_image_inputs(
+        images, model_prompts,
+        size_wrapper: ImageSizeWrapper) -> list[PromptWithMultiModalInput]:
    # For every image / prompt pair, get a pair containing two lists of
    # length size_factors, where the first contains duplicates of the model
    # prompt [str], and the second contains copies of the image after being
    # scaled by one of the size factors.
    #
    # NOTE: rescaling preserves the image aspect ratio.
-    return [(
-        [prompt for _ in size_wrapper.data],
-        [
-            apply_image_size_scaling(image, size, size_wrapper.type)
-            for size in size_wrapper.data
-        ],
-    ) for image, prompt in zip(images, model_prompts)]
+    return [
+        PromptWithMultiModalInput(
+            prompts=[prompt for _ in size_wrapper.data],
+            image_data=[
+                apply_image_size_scaling(image, size, size_wrapper.type)
+                for size in size_wrapper.data
+            ],
+        ) for image, prompt in zip(images, model_prompts)
+    ]


 def build_multi_image_inputs_from_test_info(
-        test_info: VLMTestInfo,
-        image_assets: _ImageAssets,
-        size_wrapper: ImageSizeWrapper,
-        tmp_path: Optional[PosixPath] = None):
+    test_info: VLMTestInfo,
+    image_assets: ImageTestAssets,
+    size_wrapper: ImageSizeWrapper,
+    tmp_path: Optional[PosixPath] = None,
+) -> list[PromptWithMultiModalInput]:
    if test_info.prompt_formatter is None:
        raise ValueError(
            "Prompt formatter must be set to build multi image inputs")
@@ -126,6 +141,7 @@ def build_multi_image_inputs_from_test_info(
    model_prompts = get_model_prompts([test_info.multi_image_prompt],
                                      test_info.img_idx_to_prompt,
                                      test_info.video_idx_to_prompt,
+                                      test_info.audio_idx_to_prompt,
                                      test_info.prompt_formatter)

    if test_info.prompt_path_encoder is not None:
@@ -146,20 +162,23 @@ def build_multi_image_inputs_from_test_info(
    )


-def build_multi_image_inputs(image_lists, model_prompts,
-                             size_wrapper: ImageSizeWrapper):
-    return [(
-        [prompt for _ in size_wrapper.data],
-        [[
-            apply_image_size_scaling(image, size, size_wrapper.type)
-            for image in images
-        ] for size in size_wrapper.data],
-    ) for images, prompt in zip(image_lists, model_prompts)]
+def build_multi_image_inputs(
+        image_lists, model_prompts,
+        size_wrapper: ImageSizeWrapper) -> list[PromptWithMultiModalInput]:
+    return [
+        PromptWithMultiModalInput(
+            prompts=[prompt for _ in size_wrapper.data],
+            image_data=[[
+                apply_image_size_scaling(image, size, size_wrapper.type)
+                for image in images
+            ] for size in size_wrapper.data],
+        ) for images, prompt in zip(image_lists, model_prompts)
+    ]


 def build_embedding_inputs_from_test_info(
    test_info: VLMTestInfo,
-    image_assets: _ImageAssets,
+    image_assets: ImageTestAssets,
    size_wrapper: ImageSizeWrapper,
 ):
    # These conditions will always be true if invoked through filtering,
@@ -177,6 +196,7 @@ def build_embedding_inputs_from_test_info(
        SINGLE_IMAGE_BASE_PROMPTS,
        test_info.img_idx_to_prompt,
        test_info.video_idx_to_prompt,
+        test_info.audio_idx_to_prompt,
        test_info.prompt_formatter,
    )

@@ -192,16 +212,17 @@ def build_embedding_inputs_from_test_info(

 def build_video_inputs_from_test_info(
    test_info: VLMTestInfo,
-    video_assets: _VideoAssets,
+    video_assets: VideoTestAssets,
    size_wrapper: ImageSizeWrapper,
    num_frames: int,
-):
+) -> list[PromptWithMultiModalInput]:
    if test_info.prompt_formatter is None:
        raise ValueError("Prompt formatter must be set to build video inputs")
    model_prompts = get_model_prompts(
        [VIDEO_BASE_PROMPT],
        test_info.img_idx_to_prompt,
        test_info.video_idx_to_prompt,
+        test_info.audio_idx_to_prompt,
        test_info.prompt_formatter,
    )

@@ -213,10 +234,14 @@ def build_video_inputs_from_test_info(
    video_scaler = (resize_video if size_wrapper.type == SizeType.FIXED_SIZE
                    else rescale_video_size)

-    return [(
-        [prompt for _ in size_wrapper.data],
-        [video_scaler(video, size) for size in size_wrapper.data],
-    ) for video, prompt in zip(sampled_vids, model_prompts)]
+    return [
+        PromptWithMultiModalInput(
+            prompts=[prompt for _ in size_wrapper.data],
+            video_data=[
+                video_scaler(video, size) for size in size_wrapper.data
+            ],
+        ) for video, prompt in zip(sampled_vids, model_prompts)
+    ]


 def apply_image_size_scaling(image, size: Union[float, tuple[int, int]],
@@ -236,3 +261,37 @@ def apply_image_size_scaling(image, size: Union[float, tuple[int, int]],
        # We have a list of fixed sizes
        return image.resize(size)
    raise ValueError("ImageSizeWrapper type must be FIXED_SIZE or SIZE_FACTOR")
+
+
+def build_audio_inputs_from_test_info(
+    test_info: VLMTestInfo,
+    audio_assets: AudioTestAssets,
+) -> list[PromptWithMultiModalInput]:
+    """Builds the audio test inputs for the model described by test_info.
+
+    The audio base prompts are turned into model prompts by
+    get_model_prompts, and every audio asset is resampled with an
+    AudioResampler configured with target_sr=16000.
+
+    Returns a single-element list whose PromptWithMultiModalInput holds all
+    model prompts together with all (resampled audio, sample rate) pairs.
+
+    Raises ValueError if test_info.prompt_formatter is None.
+    """
+    if test_info.prompt_formatter is None:
+        raise ValueError("Prompt formatter must be set to build audio inputs")
+    model_prompts = get_model_prompts(
+        SINGLE_AUDIO_BASE_PROMPT,
+        test_info.img_idx_to_prompt,
+        test_info.video_idx_to_prompt,
+        test_info.audio_idx_to_prompt,
+        test_info.prompt_formatter,
+    )
+    resampler = AudioResampler(
+        target_sr=16000,
+        method="librosa",
+    )
+    audios = [asset.audio_and_sample_rate for asset in audio_assets]
+    # Pair each resampled audio with the resampler's target rate, which
+    # replaces the asset's original sample rate in the output tuples.
+    resampled_audios = [(
+        resampler.resample(
+            audio,
+            orig_sr=sr,
+        ),
+        int(resampler.target_sr),
+    ) for audio, sr in audios]
+
+    # Unlike the image / video builders, which emit one entry per asset,
+    # all prompts and audios are bundled into one entry here.
+    return [
+        PromptWithMultiModalInput(
+            prompts=model_prompts,
+            audio_data=resampled_audios,
+        )
+    ]
--- a/tests/models/decoder_only/vision_language/vlm_utils/case_filtering.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/case_filtering.py
@@ -83,7 +83,7 @@ def get_parametrized_options(test_settings: dict[str, VLMTestInfo],
                test_info.num_video_frames)

        # No sizes passed for custom inputs, since inputs are directly provided
-        if test_type != VLMTestType.CUSTOM_INPUTS:
+        if test_type not in (VLMTestType.CUSTOM_INPUTS, VLMTestType.AUDIO):
            wrapped_sizes = get_wrapped_test_sizes(test_info, test_type)
            if wrapped_sizes is None:
                raise ValueError(
@@ -91,7 +91,7 @@ def get_parametrized_options(test_settings: dict[str, VLMTestInfo],
            iter_kwargs["size_wrapper"] = wrapped_sizes

        #Otherwise expand the custom test options instead
-        else:
+        elif test_type == VLMTestType.CUSTOM_INPUTS:
            if test_info.custom_test_opts is None:
                raise ValueError("Test has type CUSTOM_INPUTS, but none given")
            iter_kwargs["custom_test_opts"] = test_info.custom_test_opts
@@ -136,8 +136,8 @@ def get_wrapped_test_sizes(
            ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
            for factor in EMBEDDING_SIZE_FACTORS
        ])
-    # Custom inputs have preprocessed inputs
-    elif test_type == VLMTestType.CUSTOM_INPUTS:
+    # Audio and Custom inputs have preprocessed inputs
+    elif test_type in (VLMTestType.AUDIO, VLMTestType.CUSTOM_INPUTS):
        return tuple()

    size_factors = test_info.image_size_factors \

--- a/tests/models/decoder_only/vision_language/vlm_utils/core.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/core.py
 # SPDX-License-Identifier: Apache-2.0
 """Core test implementation to be shared across modalities."""
-from typing import Any, Callable, Optional, Union
+from typing import Any, Callable, Optional

 import torch
-from PIL.Image import Image
 from transformers.models.auto.auto_factory import _BaseAutoModelClass

 from vllm.config import TaskOption
@@ -11,14 +10,14 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer

 from .....conftest import HfRunner, VllmRunner
 from ....registry import HF_EXAMPLE_MODELS
-from .types import RunnerOutput
+from .types import PromptWithMultiModalInput, RunnerOutput


 def run_test(
    *,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
-    inputs: list[tuple[list[str], list[Union[list[Image], Image]]]],
+    inputs: list[PromptWithMultiModalInput],
    model: str,
    dtype: str,
    max_tokens: int,
@@ -38,7 +37,6 @@ def run_test(
    hf_model_kwargs: Optional[dict[str, Any]],
    patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]],
    task: TaskOption = "auto",
-    runner_mm_key: str = "images",
    distributed_executor_backend: Optional[str] = None,
    tensor_parallel_size: int = 1,
    vllm_embeddings: Optional[torch.Tensor] = None,
@@ -67,7 +65,7 @@ def run_test(
        "disable_mm_preprocessor_cache": True,
    }
    if model_info.tokenizer:
-        vllm_runner_kwargs_["tokenizer"] = model_info.tokenizer
+        vllm_runner_kwargs_["tokenizer_name"] = model_info.tokenizer
    if model_info.tokenizer_mode:
        vllm_runner_kwargs_["tokenizer_mode"] = model_info.tokenizer_mode
    if model_info.hf_overrides:
@@ -94,10 +92,16 @@ def run_test(
        if stop_str:
            vllm_kwargs["stop"] = stop_str

-        for prompts, media in vllm_inputs:
-            vllm_kwargs[runner_mm_key] = media
+        for prompts, image_data, video_data, audio_data in vllm_inputs:
+            mm_data = dict(images=image_data,
+                           videos=video_data,
+                           audios=audio_data)
+            vllm_kwargs_with_mm_data = vllm_kwargs | mm_data
            vllm_output = vllm_model.generate_greedy_logprobs(
-                prompts, max_tokens, num_logprobs=num_logprobs, **vllm_kwargs)
+                prompts,
+                max_tokens,
+                num_logprobs=num_logprobs,
+                **vllm_kwargs_with_mm_data)
            vllm_outputs_per_mm.append(vllm_output)

    hf_model = hf_runner(model,
@@ -122,14 +126,17 @@ def run_test(
        if stop_str:
            hf_kwargs["stop_strings"] = stop_str

-        for prompts, media in inputs:
-            hf_kwargs[runner_mm_key] = media
+        for prompts, image_data, video_data, audio_data in inputs:
+            mm_data = dict(images=image_data,
+                           videos=video_data,
+                           audios=audio_data)
+            hf_kwargs_with_mm_data = hf_kwargs | mm_data
            hf_output = hf_model.generate_greedy_logprobs_limit(
                prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                tokenizer=tokenizer,
-                **hf_kwargs)
+                **hf_kwargs_with_mm_data)
            hf_outputs_per_mm.append(hf_output)

    # Apply output processing / sanitation to the vLLM and HF runner results

--- a/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py
@@ -12,7 +12,7 @@ from vllm.multimodal.video import (rescale_video_size, resize_video,

 from .....conftest import IMAGE_ASSETS, VIDEO_ASSETS
 from .builders import build_multi_image_inputs, build_single_image_inputs
-from .types import ImageSizeWrapper, SizeType
+from .types import ImageSizeWrapper, PromptWithMultiModalInput, SizeType


 def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
@@ -32,24 +32,28 @@ def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
        "<image>\nWhat is the season?",
    ]
    formatted_prompts = [formatter(prompt) for prompt in img_prompts]
-
-    return [(
-        formatted_prompts,
+    aspect_ratio_images = [
+        [stop_sign, cherry_blossom],
+        # Images with different sizes and aspect-ratios
+        [
+            rescale_image_size(stop_sign, 0.1),
+            stop_sign,
+        ],
        [
-            [stop_sign, cherry_blossom],
-            # Images with different sizes and aspect-ratios
-            [
-                rescale_image_size(stop_sign, 0.1),
-                stop_sign,
-            ],
-            [
-                stop_sign,
-                rescale_image_size(stop_sign, 0.25),
-                cherry_blossom.resize((183, 488)),
-                cherry_blossom.resize((488, 183))
-            ],
-            cherry_blossom,
-        ])]
+            stop_sign,
+            rescale_image_size(stop_sign, 0.25),
+            cherry_blossom.resize((183, 488)),
+            cherry_blossom.resize((488, 183))
+        ],
+        cherry_blossom,
+    ]
+
+    return [
+        PromptWithMultiModalInput(
+            prompts=formatted_prompts,
+            image_data=aspect_ratio_images,
+        )
+    ]


 def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str],
@@ -68,24 +72,28 @@ def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str],
        "<video>\nWhy is this video funny?",
    ]
    formatted_prompts = [formatter(prompt) for prompt in video_prompts]
-
-    return [(
-        formatted_prompts,
+    aspect_ratio_videos = [
+        [video, video],
+        # Videos with different sizes and aspect-ratios
        [
-            [video, video],
-            # Videos with different sizes and aspect-ratios
-            [
-                rescale_video_size(video, 0.1),
-                video,
-            ],
-            [
-                video,
-                rescale_video_size(video, 0.25),
-                resize_video(video, (183, 488)),
-                resize_video(video, (488, 183))
-            ],
+            rescale_video_size(video, 0.1),
            video,
-        ])]
+        ],
+        [
+            video,
+            rescale_video_size(video, 0.25),
+            resize_video(video, (183, 488)),
+            resize_video(video, (488, 183))
+        ],
+        video,
+    ]
+
+    return [
+        PromptWithMultiModalInput(
+            prompts=formatted_prompts,
+            video_data=aspect_ratio_videos,
+        )
+    ]


 def different_patch_input_cases_internvl():

--- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
@@ -16,7 +16,7 @@ from transformers import (AutoConfig, AutoTokenizer, BatchFeature,
 from vllm.sequence import SampleLogprobs
 from vllm.transformers_utils.tokenizer import patch_padding_side

-from .....conftest import HfRunner, ImageAsset, _ImageAssets
+from .....conftest import HfRunner, ImageAsset, ImageTestAssets
 from .types import RunnerOutput


@@ -229,15 +229,35 @@ def minicpmv_trunc_hf_output(hf_output: RunnerOutput,
    return output_ids, output_str, out_logprobs


+def minimax_vl_01_hf_output(hf_output: RunnerOutput,
+                            model: str) -> RunnerOutput:
+    """Drops a trailing "<end_of_sentence>" marker from the HF output text.
+
+    Only the decoded string is touched; the token ids and logprobs are
+    returned as received. The model argument is not used here.
+    """
+    output_ids, output_str, out_logprobs = hf_output
+    if output_str.endswith("<end_of_sentence>"):
+        # NOTE: split(...)[0] keeps the text before the FIRST occurrence of
+        # the marker, so an earlier occurrence would truncate the text there.
+        output_str = output_str.split("<end_of_sentence>")[0]
+    return output_ids, output_str, out_logprobs
+
+
+def ultravox_trunc_hf_output(hf_output: RunnerOutput,
+                             model: str) -> RunnerOutput:
+    """Truncates the HF output text at the model's EOS token string.
+
+    The EOS string is obtained by loading the tokenizer for model and
+    decoding its eos_token_id. The text is changed only when it ends with
+    that string; the token ids and logprobs are returned as received.
+    """
+    output_ids, output_str, out_logprobs = hf_output
+
+    # The tokenizer is loaded on every call to this function.
+    tokenizer = AutoTokenizer.from_pretrained(model)
+    eos_token_id = tokenizer.eos_token_id
+    eos_token = tokenizer.decode(eos_token_id)
+    if output_str.endswith(eos_token):
+        # NOTE: split(...)[0] keeps the text before the FIRST occurrence of
+        # the EOS string, not only the trailing one.
+        output_str = output_str.split(eos_token)[0]
+    return output_ids, output_str, out_logprobs
+
+
 ####### Functions for converting image assets to embeddings
-def get_llava_embeddings(image_assets: _ImageAssets):
+def get_llava_embeddings(image_assets: ImageTestAssets):
    return [asset.image_embeds for asset in image_assets]


 ####### Prompt path encoders for models that need models on disk
 def qwen_prompt_path_encoder(
-        tmp_path: PosixPath, prompt: str, assets: Union[list[ImageAsset],
-                                                        _ImageAssets]) -> str:
+        tmp_path: PosixPath, prompt: str,
+        assets: Union[list[ImageAsset], ImageTestAssets]) -> str:
    """Given a temporary dir path, export one or more image assets into the
    tempdir & replace its contents with the local path to the string so that
    the HF version of Qwen-VL can resolve the path and load the image in its
@@ -627,6 +647,17 @@ def minicpmv_26_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    return hf_model


+def minimax_vl_01_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
+    """Patches and returns an instance of the HfRunner whose generate call
+    discards the image_sizes keyword and always passes decode_text=False.
+    """
+    # Captured before patching so the wrapper can delegate to the original.
+    orig_generate = hf_model.model.generate
+
+    # image_sizes is accepted only so that it is not forwarded in **kwargs;
+    # self is unused because orig_generate was captured above.
+    def _generate(self, *args, image_sizes=None, **kwargs):
+        return orig_generate(*args, decode_text=False, **kwargs)
+
+    hf_model.model.generate = types.MethodType(_generate, hf_model.model)
+
+    return hf_model
+
+
 def molmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    """Patches and returns an instance of the HfRunner to use for Molmo."""
    hf_processor = hf_model.processor
@@ -657,3 +688,46 @@ def molmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    hf_model.model.generate = types.MethodType(_generate, hf_model.model)

    return hf_model
+
+
+def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
+    """Patches and returns an instance of the HfRunner to use for Ovis2.
+
+    Two attributes are replaced: model.get_output_embeddings is redirected
+    to the wrapped model.llm, and hf_model.processor becomes a local function
+    that builds a BatchFeature from the model's own preprocess_inputs.
+    """
+    # Delegate output embeddings to the inner language model.
+    hf_model.model.get_output_embeddings = lambda: \
+        hf_model.model.llm.get_output_embeddings()
+
+    # Positional args and extra kwargs are accepted but ignored.
+    def processor(*args, text="", images=None, **kwargs):
+        text_tokenizer = hf_model.model.get_text_tokenizer()
+        # Wrap a single image in a list; any other value passes through.
+        images = [images] if isinstance(images, Image) else images
+
+        # (start, end) markers surrounding the user turn for each supported
+        # chat format; only the first format found in the text is applied.
+        prompt_start_and_end = {
+            "qwen2": ("<|im_start|>user\n", "<|im_end|>\n"),
+            "llama":
+            ("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
+            "gemma2": ("<start_of_turn>user\n", "<end_of_turn>\n"),
+        }
+        for start, end in prompt_start_and_end.values():
+            if start in text and end in text:
+                # Keep the text between the first start marker and the
+                # first end marker that follows it.
+                text = text.split(start)[1].split(end)[0]
+                break
+
+        # The returned prompt is not used below.
+        prompt, input_ids, pixel_values = hf_model.model.preprocess_inputs(
+            text_or_conversations=text, images=images)
+        # True wherever the token id differs from the pad token id.
+        attention_mask = torch.ne(input_ids, text_tokenizer.pad_token_id)
+
+        # unsqueeze(0) adds a leading dimension to each tensor
+        # (presumably the batch axis -- TODO confirm).
+        inputs = {
+            "inputs": input_ids.unsqueeze(0),
+            "pixel_values": pixel_values.unsqueeze(0),
+            "attention_mask": attention_mask.unsqueeze(0),
+        }
+        return BatchFeature(data=inputs, tensor_type="pt")
+
+    hf_model.processor = processor
+    return hf_model
+
+
+def qwen2_5_omni_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
+    """Patches and returns an instance of the HfRunner for Qwen2.5-Omni.
+
+    The runner's model is replaced by its thinker submodule, which is given
+    a get_output_embeddings callable that returns thinker.lm_head.
+    """
+    thinker = hf_model.model.thinker
+    thinker.get_output_embeddings = lambda: thinker.lm_head
+    # From here on the runner only sees the thinker submodule.
+    hf_model.model = thinker
+    return hf_model
--- a/tests/models/decoder_only/vision_language/vlm_utils/runners.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/runners.py
@@ -4,7 +4,8 @@ types / modalities.
 """
 from pathlib import PosixPath

-from .....conftest import HfRunner, VllmRunner, _ImageAssets, _VideoAssets
+from .....conftest import (AudioTestAssets, HfRunner, ImageTestAssets,
+                           VideoTestAssets, VllmRunner)
 from . import builders, core
 from .types import ExpandableVLMTestArgs, VLMTestInfo

@@ -14,7 +15,7 @@ def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
                          test_case: ExpandableVLMTestArgs,
                          hf_runner: type[HfRunner],
                          vllm_runner: type[VllmRunner],
-                          image_assets: _ImageAssets):
+                          image_assets: ImageTestAssets):
    assert test_case.size_wrapper is not None
    inputs = builders.build_single_image_inputs_from_test_info(
        model_test_info, image_assets, test_case.size_wrapper, tmp_path)
@@ -29,7 +30,6 @@ def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
        num_logprobs=test_case.num_logprobs,
        limit_mm_per_prompt={"image": 1},
        distributed_executor_backend=test_case.distributed_executor_backend,
-        runner_mm_key="images",
        **model_test_info.get_non_parametrized_runner_kwargs())


@@ -37,7 +37,7 @@ def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
                         test_case: ExpandableVLMTestArgs,
                         hf_runner: type[HfRunner],
                         vllm_runner: type[VllmRunner],
-                         image_assets: _ImageAssets):
+                         image_assets: ImageTestAssets):
    assert test_case.size_wrapper is not None
    inputs = builders.build_multi_image_inputs_from_test_info(
        model_test_info, image_assets, test_case.size_wrapper, tmp_path)
@@ -52,7 +52,6 @@ def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
        num_logprobs=test_case.num_logprobs,
        limit_mm_per_prompt={"image": len(image_assets)},
        distributed_executor_backend=test_case.distributed_executor_backend,
-        runner_mm_key="images",
        **model_test_info.get_non_parametrized_runner_kwargs())


@@ -60,7 +59,7 @@ def run_embedding_test(*, model_test_info: VLMTestInfo,
                       test_case: ExpandableVLMTestArgs,
                       hf_runner: type[HfRunner],
                       vllm_runner: type[VllmRunner],
-                       image_assets: _ImageAssets):
+                       image_assets: ImageTestAssets):
    assert test_case.size_wrapper is not None
    inputs, vllm_embeddings = builders.build_embedding_inputs_from_test_info(
        model_test_info, image_assets, test_case.size_wrapper)
@@ -76,7 +75,6 @@ def run_embedding_test(*, model_test_info: VLMTestInfo,
        limit_mm_per_prompt={"image": 1},
        vllm_embeddings=vllm_embeddings,
        distributed_executor_backend=test_case.distributed_executor_backend,
-        runner_mm_key="images",
        **model_test_info.get_non_parametrized_runner_kwargs())


@@ -86,7 +84,7 @@ def run_video_test(
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
-    video_assets: _VideoAssets,
+    video_assets: VideoTestAssets,
 ):
    assert test_case.size_wrapper is not None
    assert test_case.num_video_frames is not None
@@ -104,7 +102,30 @@ def run_video_test(
        num_logprobs=test_case.num_logprobs,
        limit_mm_per_prompt={"video": len(video_assets)},
        distributed_executor_backend=test_case.distributed_executor_backend,
-        runner_mm_key="videos",
+        **model_test_info.get_non_parametrized_runner_kwargs())
+
+
+def run_audio_test(
+    *,
+    model_test_info: VLMTestInfo,
+    test_case: ExpandableVLMTestArgs,
+    hf_runner: type[HfRunner],
+    vllm_runner: type[VllmRunner],
+    audio_assets: AudioTestAssets,
+):
+    """Builds the audio inputs for model_test_info and passes them to
+    core.run_test with an audio limit of one item per prompt.
+
+    No size wrapper is consulted here, in contrast to the image and video
+    runners in this module.
+    """
+    inputs = builders.build_audio_inputs_from_test_info(
+        model_test_info, audio_assets)
+
+    core.run_test(
+        hf_runner=hf_runner,
+        vllm_runner=vllm_runner,
+        inputs=inputs,
+        model=test_case.model,
+        dtype=test_case.dtype,
+        max_tokens=test_case.max_tokens,
+        num_logprobs=test_case.num_logprobs,
+        limit_mm_per_prompt={"audio": 1},
+        distributed_executor_backend=test_case.distributed_executor_backend,
        **model_test_info.get_non_parametrized_runner_kwargs())


@@ -119,11 +140,9 @@ def run_custom_inputs_test(*, model_test_info: VLMTestInfo,

    inputs = test_case.custom_test_opts.inputs
    limit_mm_per_prompt = test_case.custom_test_opts.limit_mm_per_prompt
-    runner_mm_key = test_case.custom_test_opts.runner_mm_key
-    # Inputs, limit_mm_per_prompt, and runner_mm_key should all be set
+    # Inputs and limit_mm_per_prompt should all be set
    assert inputs is not None
    assert limit_mm_per_prompt is not None
-    assert runner_mm_key is not None

    core.run_test(
        hf_runner=hf_runner,
@@ -135,5 +154,4 @@ def run_custom_inputs_test(*, model_test_info: VLMTestInfo,
        num_logprobs=test_case.num_logprobs,
        limit_mm_per_prompt=limit_mm_per_prompt,
        distributed_executor_backend=test_case.distributed_executor_backend,
-        runner_mm_key=runner_mm_key,
        **model_test_info.get_non_parametrized_runner_kwargs())
--- a/tests/models/decoder_only/vision_language/vlm_utils/types.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/types.py
@@ -6,7 +6,6 @@ from pathlib import PosixPath
 from typing import Any, Callable, NamedTuple, Optional, Union

 import torch
-from PIL.Image import Image
 from pytest import MarkDecorator
 from transformers import AutoModelForCausalLM
 from transformers.models.auto.auto_factory import _BaseAutoModelClass
@@ -15,18 +14,25 @@ from vllm.config import TaskOption
 from vllm.sequence import SampleLogprobs
 from vllm.transformers_utils.tokenizer import AnyTokenizer

-from .....conftest import IMAGE_ASSETS, HfRunner, ImageAsset, _ImageAssets
+from .....conftest import (AUDIO_ASSETS, IMAGE_ASSETS, HfRunner, ImageAsset,
+                           ImageTestAssets, PromptAudioInput, PromptImageInput,
+                           PromptVideoInput)
 from ....utils import check_logprobs_close

 # meta image tag; will be replaced by the appropriate tag for the model
 TEST_IMG_PLACEHOLDER = "<vlm_image>"
 TEST_VIDEO_PLACEHOLDER = "<vlm_video>"
+TEST_AUDIO_PLACEHOLDER = "<lmm_audio>"

 # yapf: disable
 SINGLE_IMAGE_BASE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign": f"{TEST_IMG_PLACEHOLDER}What's the content of the image?",
    "cherry_blossom": f"{TEST_IMG_PLACEHOLDER}What is the season?",
 })
+SINGLE_AUDIO_BASE_PROMPT = AUDIO_ASSETS.prompts({
+    "mary_had_lamb": f"{TEST_AUDIO_PLACEHOLDER}Transcribe this audio into English.",    # noqa: E501
+    "winning_call": f"{TEST_AUDIO_PLACEHOLDER}What is happening in this audio clip?",     # noqa: E501
+})

 MULTI_IMAGE_BASE_PROMPT = f"Image-1: {TEST_IMG_PLACEHOLDER}Image-2: {TEST_IMG_PLACEHOLDER}Describe the two images in detail.\n"  # noqa: E501
 VIDEO_BASE_PROMPT = f"{TEST_VIDEO_PLACEHOLDER}Why is this video funny?"
@@ -38,12 +44,21 @@ RunnerOutput = tuple[list[int], str, Optional[SampleLogprobs]]
 # yapf: enable


+class PromptWithMultiModalInput(NamedTuple):
+    """Holds the multimodal input for a single test case."""
+    # NOTE: field order matters, since instances are unpacked positionally
+    # as (prompts, image_data, video_data, audio_data) by the test runner.
+    # Prompts for this test case.
+    prompts: list[str]
+    # Optional per-modality data; each defaults to None when not supplied.
+    image_data: Optional[PromptImageInput] = None
+    video_data: Optional[PromptVideoInput] = None
+    audio_data: Optional[PromptAudioInput] = None
+
+
 class VLMTestType(Enum):
    IMAGE = 1
    MULTI_IMAGE = 2
    EMBEDDING = 3
    VIDEO = 4
-    CUSTOM_INPUTS = 5
+    AUDIO = 5
+    CUSTOM_INPUTS = 6


 class SizeType(Enum):
@@ -52,10 +67,8 @@ class SizeType(Enum):


 class CustomTestOptions(NamedTuple):
-    inputs: list[tuple[list[str], list[Union[list[Image], Image]]]]
+    inputs: list[PromptWithMultiModalInput]
    limit_mm_per_prompt: dict[str, int]
-    # kwarg to pass multimodal data in as to vllm/hf runner instances.
-    runner_mm_key: str = "images"


 class ImageSizeWrapper(NamedTuple):
@@ -75,6 +88,7 @@ class VLMTestInfo(NamedTuple):
    prompt_formatter: Optional[Callable[[str], str]] = None
    img_idx_to_prompt: Callable[[int], str] = lambda idx: "<image>\n"
    video_idx_to_prompt: Callable[[int], str] = lambda idx: "<video>\n"
+    audio_idx_to_prompt: Callable[[int], str] = lambda idx: "<audio>\n"

    # Most models work on the single / multi-image prompts above, but in some
    # cases the log prob check fails, e.g., for paligemma. We allow passing
@@ -85,7 +99,7 @@ class VLMTestInfo(NamedTuple):

    # Function for converting ImageAssets to image embeddings;
    # We need to define this explicitly for embedding tests
-    convert_assets_to_embeddings: Optional[Callable[[_ImageAssets],
+    convert_assets_to_embeddings: Optional[Callable[[ImageTestAssets],
                                                    torch.Tensor]] = None

    # Exposed options for vLLM runner; we change these in a several tests,
@@ -141,7 +155,7 @@ class VLMTestInfo(NamedTuple):
    # for Qwen-VL, which requires encoding the image path / url into the prompt
    # for HF runner
    prompt_path_encoder: Optional[
-        Callable[[PosixPath, str, Union[list[ImageAsset], _ImageAssets]],
+        Callable[[PosixPath, str, Union[list[ImageAsset], ImageTestAssets]],
                 str]] = None  # noqa: E501

    # Allows configuring a test to run with custom inputs

--- a/tests/models/embedding/__init__.py
+++ b/tests/models/embedding/__init__.py
--- a/tests/models/embedding/vision_language/test_dse_qwen2_vl.py
+++ b/tests/models/embedding/vision_language/test_dse_qwen2_vl.py
@@ -10,7 +10,7 @@ from transformers import Qwen2VLForConditionalGeneration

 from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
 from ....utils import large_gpu_test
-from ..utils import check_embeddings_close
+from ...utils import check_embeddings_close

 HF_TEXT_PROMPTS = [
    # T -> X

--- a/tests/models/decoder_only/vision_language/test_intern_vit.py
+++ b/tests/models/decoder_only/vision_language/test_intern_vit.py
 # SPDX-License-Identifier: Apache-2.0
-
-from typing import Optional
-
 import pytest
 import torch
 import torch.nn as nn
 from huggingface_hub import snapshot_download
 from transformers import AutoConfig, AutoModel, CLIPImageProcessor

-from ....conftest import _ImageAssets
+from vllm.distributed import cleanup_dist_env_and_memory
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
+
+from ....conftest import ImageTestAssets

 # we use snapshot_download to prevent conflicts between
 # dynamic_module and trust_remote_code for hf_runner
 DOWNLOAD_PATTERN = ["*.json", "*.py", "*.safetensors", "*.txt", "*.model"]


+@torch.inference_mode()
 def run_intern_vit_test(
-    image_assets: _ImageAssets,
+    image_assets: ImageTestAssets,
    model_id: str,
    *,
    dtype: str,
-    distributed_executor_backend: Optional[str] = None,
 ):
    model = snapshot_download(model_id, allow_patterns=DOWNLOAD_PATTERN)
+    torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]

    img_processor = CLIPImageProcessor.from_pretrained(model)
    images = [asset.pil_image for asset in image_assets]
    pixel_values = [
-        img_processor(images, return_tensors='pt').pixel_values.to(dtype)
+        img_processor(images, return_tensors='pt').pixel_values.to(torch_dtype)
        for images in images
    ]

@@ -36,14 +37,13 @@ def run_intern_vit_test(
        config.norm_type = "rms_norm"

    hf_model = AutoModel.from_pretrained(model,
-                                         torch_dtype=dtype,
+                                         torch_dtype=torch_dtype,
                                         trust_remote_code=True).to("cuda")
    hf_outputs_per_image = [
        hf_model(pixel_value.to("cuda")).last_hidden_state
        for pixel_value in pixel_values
    ]

-    from vllm.distributed import cleanup_dist_env_and_memory
    from vllm.model_executor.models.intern_vit import InternVisionModel
    vllm_model = InternVisionModel(config)
    vllm_model.load_weights(hf_model.state_dict().items())
@@ -51,7 +51,7 @@ def run_intern_vit_test(
    del hf_model
    cleanup_dist_env_and_memory()

-    vllm_model = vllm_model.to("cuda", dtype)
+    vllm_model = vllm_model.to("cuda", torch_dtype)
    vllm_outputs_per_image = [
        vllm_model(pixel_values=pixel_value.to("cuda"))
        for pixel_value in pixel_values
@@ -69,8 +69,7 @@ def run_intern_vit_test(
    "OpenGVLab/InternViT-300M-448px",
    "OpenGVLab/InternViT-6B-448px-V1-5",
 ])
-@pytest.mark.parametrize("dtype", [torch.half])
-@torch.inference_mode()
+@pytest.mark.parametrize("dtype", ["half"])
 def test_models(dist_init, image_assets, model_id, dtype: str) -> None:
    run_intern_vit_test(
        image_assets,

--- a/tests/models/embedding/vision_language/test_llava_next.py
+++ b/tests/models/embedding/vision_language/test_llava_next.py
@@ -8,7 +8,7 @@ from vllm.platforms import current_platform

 from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
 from ....utils import large_gpu_test
-from ..utils import check_embeddings_close
+from ...utils import check_embeddings_close

 # Llava Next embedding implementation is only supported by CUDA.
 # If run on ROCm, hf_model.model.resize_token_embeddings will

--- a/tests/models/embedding/vision_language/test_phi3v.py
+++ b/tests/models/embedding/vision_language/test_phi3v.py
@@ -9,7 +9,7 @@ from vllm.assets.image import VLM_IMAGES_DIR

 from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
 from ....utils import large_gpu_test
-from ..utils import check_embeddings_close
+from ...utils import check_embeddings_close

 HF_TEXT_PROMPTS = [
    # T -> X

--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -146,7 +146,8 @@ def _test_processing_correctness_hf(
    batch_idx: int,
    ignore_mm_keys: Optional[set[str]] = None,
 ):
-    if model_config.hf_config.model_type in ("mllama", "whisper", "ultravox"):
+    if model_config.hf_config.model_type in ("mllama", "ovis", "ultravox",
+                                             "whisper"):
        # For some multimodal models, tokenizer will always add bos_token
        # at the beginning of prompt by default, causing hf_processor outputs
        # incorrect token ids. So we need use `add_special_tokens=False` here
@@ -270,9 +271,13 @@ def _test_processing_correctness_mistral(
    "openbmb/MiniCPM-Llama3-V-2_5",
    "openbmb/MiniCPM-o-2_6",
    "openbmb/MiniCPM-V-2_6",
+    "MiniMaxAI/MiniMax-VL-01",
    "allenai/Molmo-7B-D-0924",
    "allenai/Molmo-7B-O-0924",
    "nvidia/NVLM-D-72B",
+    "AIDC-AI/Ovis1.6-Gemma2-9B",
+    "AIDC-AI/Ovis1.6-Llama3.2-3B",
+    "AIDC-AI/Ovis2-1B",
    "google/paligemma-3b-mix-224",
    "google/paligemma2-3b-ft-docci-448",
    "microsoft/Phi-4-multimodal-instruct",
@@ -282,7 +287,7 @@ def _test_processing_correctness_mistral(
    "Qwen/Qwen2-VL-2B-Instruct",
    "Qwen/Qwen2.5-VL-3B-Instruct",
    "Qwen/Qwen2-Audio-7B-Instruct",
-    "Qwen/Qwen2.5-Omni-7B",
+    "Qwen/Qwen2.5-Omni-3B",
    "Skywork/Skywork-R1V-38B",
    "fixie-ai/ultravox-v0_5-llama-3_2-1b",
    "openai/whisper-large-v3",

--- a/tests/models/multimodal/processing/test_h2ovl.py
+++ b/tests/models/multimodal/processing/test_h2ovl.py
@@ -11,7 +11,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.image import rescale_image_size
 from vllm.multimodal.processing import BaseMultiModalProcessor

-from ....conftest import _ImageAssets
+from ....conftest import ImageTestAssets
 from ...utils import build_model_context


@@ -137,7 +137,7 @@ def _run_check(
 @pytest.mark.parametrize("kwargs_on_init", [True, False])
 def test_processor_override(
    model_id: str,
-    image_assets: _ImageAssets,
+    image_assets: ImageTestAssets,
    size_factors: list[int],
    min_dynamic_patch: int,
    max_dynamic_patch: int,

--- a/tests/models/multimodal/processing/test_idefics3.py
+++ b/tests/models/multimodal/processing/test_idefics3.py
@@ -5,7 +5,7 @@ from transformers import Idefics3Config

 from vllm.multimodal import MULTIMODAL_REGISTRY

-from ....conftest import _ImageAssets
+from ....conftest import ImageTestAssets
 from ...utils import build_model_context


@@ -21,7 +21,7 @@ from ...utils import build_model_context
 @pytest.mark.parametrize("num_imgs", [1, 2])
 @pytest.mark.parametrize("kwargs_on_init", [True, False])
 def test_processor_override(
-    image_assets: _ImageAssets,
+    image_assets: ImageTestAssets,
    model_id: str,
    mm_processor_kwargs: dict[str, object],
    expected_toks_per_img: int,

--- a/tests/models/multimodal/processing/test_internvl.py
+++ b/tests/models/multimodal/processing/test_internvl.py
@@ -11,7 +11,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.image import rescale_image_size
 from vllm.multimodal.processing import BaseMultiModalProcessor

-from ....conftest import _ImageAssets
+from ....conftest import ImageTestAssets
 from ...utils import build_model_context


@@ -94,7 +94,7 @@ def _run_check(
 @pytest.mark.parametrize("kwargs_on_init", [True, False])
 def test_processor_override(
    model_id: str,
-    image_assets: _ImageAssets,
+    image_assets: ImageTestAssets,
    size_factors: list[int],
    min_dynamic_patch: int,
    max_dynamic_patch: int,

--- a/tests/models/multimodal/processing/test_llama4.py
+++ b/tests/models/multimodal/processing/test_llama4.py
@@ -6,7 +6,7 @@ import pytest
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.transformers_utils.tokenizer import encode_tokens

-from ....conftest import _ImageAssets
+from ....conftest import ImageTestAssets
 from ...utils import build_model_context


@@ -17,7 +17,7 @@ from ...utils import build_model_context
 @pytest.mark.parametrize("disable_mm_preprocessor_cache", [True, False])
 @pytest.mark.parametrize("tokenized_prompt", [True, False])
 def test_processor_override(
-    image_assets: _ImageAssets,
+    image_assets: ImageTestAssets,
    model_id: str,
    mm_processor_kwargs: dict,
    num_imgs: int,

--- a/tests/models/multimodal/processing/test_minimax_vl_01.py
+++ b/tests/models/multimodal/processing/test_minimax_vl_01.py
+# SPDX-License-Identifier: Apache-2.0
+
+import pytest
+from PIL import Image
+
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.parse import ImageSize
+from vllm.multimodal.processing import BaseMultiModalProcessor
+
+from ....conftest import ImageTestAssets
+from ...utils import build_model_context
+
+
+@pytest.mark.parametrize("model_id", ["MiniMaxAI/MiniMax-VL-01"])
+@pytest.mark.parametrize("num_imgs", [1, 2])
+def test_processor_override(
+    image_assets: ImageTestAssets,
+    model_id: str,
+    num_imgs: int,
+):
+    """Check that the processor yields one image placeholder per input image.
+
+    Parametrized over 1 and 2 images; the per-prompt image limit is set to
+    the same count. `image_assets` is requested as a fixture but is not
+    referenced in the body -- the test uses a synthetic image instead.
+    """
+    ctx = build_model_context(
+        model_id,
+        mm_processor_kwargs=None,
+        limit_mm_per_prompt={"image": num_imgs},
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    # One "<image>" tag per image, with no surrounding text.
+    prompt = "<image>" * num_imgs
+    # Synthetic 364x364 RGB image; the same object is reused for every slot.
+    image = Image.new("RGB", size=(364, 364))
+    mm_data = {"image": [image] * num_imgs}
+
+    # NOTE(review): the third argument is an empty dict -- presumably the
+    # per-call processor kwargs; confirm against `processor.apply`.
+    processed_inputs = processor.apply(prompt, mm_data, {})
+    image_placeholders = processed_inputs["mm_placeholders"]["image"]
+
+    assert len(image_placeholders) == num_imgs
+
+
+def _validate_image_prompt_replacements_one(
+    processor: BaseMultiModalProcessor,
+    num_imgs: int,
+    failed_size_excs: list[tuple[ImageSize, Exception]],
+    image_size: ImageSize,
+) -> None:
+    """Run the processor for a single image size and record any failure.
+
+    Builds a prompt of `num_imgs` "<image>" tags with `num_imgs` synthetic
+    RGB images of `image_size`, applies `processor`, and checks that the
+    number of image placeholders equals `num_imgs`.
+
+    Nothing is raised to the caller for `Exception` subclasses: any such
+    error -- including the `AssertionError` from the count check -- is
+    appended to `failed_size_excs` as `(image_size, exc)`, so the caller can
+    keep testing other sizes and report all failures together.
+    """
+    prompt = "<image>" * num_imgs
+    image = Image.new("RGB", size=image_size)
+    mm_data = {"image": [image] * num_imgs}
+
+    try:
+        processed_inputs = processor.apply(prompt, mm_data, {})
+
+        image_placeholders = processed_inputs["mm_placeholders"]["image"]
+        assert len(image_placeholders) == num_imgs
+
+    # Deliberately broad: failures are collected per size, not propagated.
+    except Exception as exc:
+        failed_size_excs.append((image_size, exc))
+
+
+def _test_image_prompt_replacements(
+    processor,
+    *,
+    num_imgs: int,
+    image_sizes: list[ImageSize],
+) -> None:
+    """Validate every size in `image_sizes` and fail once with all errors.
+
+    Each size is checked via `_validate_image_prompt_replacements_one`, which
+    records failures instead of raising. If any size failed, a single
+    `AssertionError` is raised whose message lists each failing size together
+    with its exception text.
+    """
+
+    # Calling the parameterized alias constructs an empty, typed list.
+    failed_size_excs = list[tuple[ImageSize, Exception]]()
+
+    for size in image_sizes:
+        _validate_image_prompt_replacements_one(processor, num_imgs,
+                                                failed_size_excs, size)
+
+    if failed_size_excs:
+        # NOTE(review): the header has no trailing newline, so the first
+        # "[size]" entry follows it directly on the same line.
+        msg = "Found failing image sizes:" \
+            + "\n========\n".join(f"[{size}]\n{exc}"
+                                  for size, exc in failed_size_excs)
+        raise AssertionError(msg)
+
+
+@pytest.mark.parametrize("model_id", ["MiniMaxAI/MiniMax-VL-01"])
+@pytest.mark.parametrize("num_imgs", [1, 2])
+def test_processor_prompt_replacements_regression(model_id, num_imgs):
+    """Check placeholder counts over a fixed list of image sizes.
+
+    Builds the processor for `model_id` with the image limit set to
+    `num_imgs`, then runs `_test_image_prompt_replacements` over each listed
+    pair, both as given and with its two values swapped.
+    """
+    ctx = build_model_context(
+        model_id,
+        mm_processor_kwargs=None,
+        limit_mm_per_prompt={"image": num_imgs},
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+
+    # Despite the name, these pairs are passed straight to ImageSize below,
+    # i.e. they are used as concrete sizes rather than as ratios.
+    image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
+                    (488, 183), (2560, 1669)]
+    # Each pair contributes two sizes: (w, h) and the swapped (h, w).
+    # NOTE(review): presumably ImageSize is (width, height) -- confirm.
+    image_sizes = [
+        size for w, h in image_ratios
+        for size in [ImageSize(w, h), ImageSize(h, w)]
+    ]
+
+    _test_image_prompt_replacements(
+        processor,
+        num_imgs=num_imgs,
+        image_sizes=image_sizes,
+    )
--- a/tests/models/multimodal/processing/test_phi3v.py
+++ b/tests/models/multimodal/processing/test_phi3v.py
@@ -4,7 +4,7 @@ import pytest

 from vllm.multimodal import MULTIMODAL_REGISTRY

-from ....conftest import _ImageAssets
+from ....conftest import ImageTestAssets
 from ...utils import build_model_context


@@ -22,7 +22,7 @@ from ...utils import build_model_context
 @pytest.mark.parametrize("num_imgs", [1, 2])
 @pytest.mark.parametrize("kwargs_on_init", [True, False])
 def test_processor_override(
-    image_assets: _ImageAssets,
+    image_assets: ImageTestAssets,
    model_id: str,
    mm_processor_kwargs: dict[str, int],
    expected_toks_per_img: int,

--- a/tests/models/multimodal/processing/test_phi4mm.py
+++ b/tests/models/multimodal/processing/test_phi4mm.py
@@ -4,7 +4,7 @@ import pytest

 from vllm.multimodal import MULTIMODAL_REGISTRY

-from ....conftest import _ImageAssets
+from ....conftest import ImageTestAssets
 from ...utils import build_model_context


@@ -22,7 +22,7 @@ from ...utils import build_model_context
 @pytest.mark.parametrize("num_imgs", [1, 2])
 @pytest.mark.parametrize("kwargs_on_init", [True, False])
 def test_processor_override(
-    image_assets: _ImageAssets,
+    image_assets: ImageTestAssets,
    model_id: str,
    mm_processor_kwargs: dict[str, int],
    expected_toks_per_img: int,