[tests] fix tests

04629132 · zhuwenwen · 07c69390 · 04629132 · 04629132 · 04629132
Commit 04629132 authored Jun 12, 2025 by zhuwenwen
20 changed files
--- a/tests/models/decoder_only/language/test_fp8.py
+++ b/tests/models/decoder_only/language/test_fp8.py
--- a/tests/models/decoder_only/language/test_gptq_marlin.py
+++ b/tests/models/decoder_only/language/test_gptq_marlin.py
--- a/tests/models/decoder_only/language/test_gptq_marlin_24.py
+++ b/tests/models/decoder_only/language/test_gptq_marlin_24.py
--- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_llava_next.py
+++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_llava_next.py
-import os
-import pytest
-from vllm.inputs import InputContext
-from ....utils import build_model_context
-from .....utils import models_path_prefix
-@pytest.fixture()
-def get_max_llava_next_image_tokens():
-    from vllm.model_executor.models.llava_next import (
-        get_max_llava_next_image_tokens)
-    return get_max_llava_next_image_tokens
-@pytest.fixture()
-def dummy_data_for_llava_next():
-    from vllm.model_executor.models.llava_next import dummy_data_for_llava_next
-    return dummy_data_for_llava_next
-@pytest.mark.parametrize("gridpoints,expected_max_tokens", [
-    ([[336, 336]], 1176),
-    ([[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]], 2928),
-])
-def test_get_max_llava_next_image_tokens(gridpoints, expected_max_tokens,
-                                         get_max_llava_next_image_tokens):
-    ctx = build_model_context(model_name=os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf"))
-    # Update the config image_grid_pinpoints
-    # and calculate the resulting max tokens
-    ctx.model_config.hf_config.image_grid_pinpoints = gridpoints
-    actual_max_tokens = get_max_llava_next_image_tokens(
-        InputContext(ctx.model_config))
-    assert expected_max_tokens == actual_max_tokens
-@pytest.mark.parametrize(
-    "gridpoints,expected_size",
-    [
-        # One point; it has to be the largest
-        ([[336, 336]], (336, 336)),
-        # Default for most llava next models; the 2x2 tile is the largest
-        ([[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]],
-         (672, 672)),
-        # If two rectangular gridpoints have the same area, the more vertical
-        # one has the higher feature count due to newline features
-        ([[336, 672], [672, 336]], (672, 336))
-    ])
-def test_dummy_data_for_llava_next_feature_size(dummy_data_for_llava_next,
-                                                gridpoints, expected_size):
-    ctx = build_model_context(model_name=os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf"))
-    # Update the config image_grid_pinpoints
-    ctx.model_config.hf_config.image_grid_pinpoints = gridpoints
-    seq_len = 5000  # bigger than the max feature size for any image
-    dummy_data = dummy_data_for_llava_next(
-        ctx,
-        seq_len=seq_len,
-        mm_counts={"image": 1},
-    )
-    seq_data = dummy_data.seq_data
-    mm_data = dummy_data.multi_modal_data
-    # The dummy data dims should match the gridpoint with the biggest feat size
-    assert mm_data["image"].height == expected_size[0]
-    assert mm_data["image"].width == expected_size[1]
-    assert len(seq_data.get_token_ids()) >= seq_len
--- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py
+++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py
-"""Tests for phi3v's multimodal preprocessing kwargs."""
-from typing import Optional
-import os
-import pytest
-from transformers import AutoTokenizer
-from vllm.inputs import InputContext, InputProcessingContext
-from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID
-from .....conftest import _ImageAssets
-from ....utils import build_model_context
-from .....utils import models_path_prefix
-models = [os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct")]
-# Wrap lazy imports to avoid initializing CUDA during test collection
-@pytest.fixture()
-def processor_for_phi3v():
-    from vllm.model_executor.models.phi3v import Phi3VMultiModalProcessor
-    return Phi3VMultiModalProcessor
-@pytest.fixture()
-def get_max_phi3v_image_tokens():
-    from vllm.model_executor.models.phi3v import get_max_phi3v_image_tokens
-    return get_max_phi3v_image_tokens
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("num_crops,expected_max_tokens", [
-    (4, 781),
-    (16, 2653),
-])
-def test_max_tokens_override(get_max_phi3v_image_tokens, model: str,
-                             num_crops: int, expected_max_tokens: int):
-    """Ensure get_max_phi3v_image_tokens handles num_crops properly."""
-    # NOTE: mm_processor_kwargs on the context in this test is unused, since
-    # this is testing the mapper directly. In practice, the processor kwargs
-    # are wrapped in a closure when calling the max tokens func. We explicitly
-    # do NOT use the mm_processor_kwargs in the model context here to ensure
-    # that the max image tokens implementation is referencing a mix of the
-    # kwargs to the function and the original mm_processor_kwargs in case
-    # values are somehow updated and end up in a bad state.
-    ctx = build_model_context(
-        model_name=model,
-        tokenizer_name=model,
-        trust_remote_code=True,
-        mm_processor_kwargs=None,
-    )
-    actual_max_tokens = get_max_phi3v_image_tokens(
-        InputContext(ctx.model_config),
-        num_crops=num_crops,
-    )
-    assert expected_max_tokens == actual_max_tokens
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize(
-    "num_crops,expected_toks_per_img",
-    [
-        (4, 757),
-        (16, 1921),
-        # the default num_crops of phi-3.5-vision is 4
-        (None, 757),
-    ])
-@pytest.mark.parametrize("num_imgs", [1, 2])
-def test_processor_override(processor_for_phi3v, image_assets: _ImageAssets,
-                            model: str, num_crops: Optional[int],
-                            expected_toks_per_img: int, num_imgs: int):
-    """Ensure input_processor_for_phi3v handles num_crops properly."""
-    # Same as the previous test - don't initialize mm_processor_kwargs
-    # in this test and assume that the kwargs will be correctly expanded by
-    # the partial when calling the custom input processor.
-    ctx = build_model_context(
-        model_name=model,
-        tokenizer_name=model,
-        trust_remote_code=True,
-    )
-    tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
-    ctx = InputProcessingContext(ctx.model_config, tokenizer)
-    # Build the image str / prompt based on the number of images we pass
-    img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)])
-    prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n"
-    images = [image_assets[0].pil_image] * num_imgs
-    mm_data = {"image": images}
-    mm_processor_kwargs = {}
-    if num_crops is not None:
-        mm_processor_kwargs = {"num_crops": num_crops}
-    processor = processor_for_phi3v(ctx)
-    processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
-    # Ensure we have the right number of placeholders per num_crops size
-    img_tok_count = processed_inputs["prompt_token_ids"].count(_IMAGE_TOKEN_ID)
-    assert img_tok_count == expected_toks_per_img * num_imgs
--- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py
+++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py
-"""Tests for Qwen's multimodal preprocessing kwargs."""
-from typing import Dict, List, Union
-import os
-import pytest
-import torch
-from PIL.Image import Image
-from vllm.inputs import InputContext, token_inputs
-from vllm.multimodal import MultiModalKwargs
-from vllm.multimodal.utils import cached_get_tokenizer
-from .....conftest import IMAGE_ASSETS
-from ....utils import build_model_context
-from .....utils import models_path_prefix
-### Multimodal preprocessing tests
-SAMPLE_IMAGE = IMAGE_ASSETS[0].pil_image
-# These values are specific to Qwen-VL/Chat; we can get these from the model
-# config also, but they are hardcoded here to keep the parameterize/fixtures
-# easy to read.
-IMG_START_ID = 151857
-IMG_END_ID = 151858
-IMG_PAD_ID = 151859
-TOKS_PER_IMG = 256
-VIS_ENC_DIM = 4096
-IMG_SIZE = 448
-@pytest.fixture()
-def input_mapper_for_qwen():
-    # Lazy import to avoid initializing CUDA during test collection
-    from vllm.model_executor.models.qwen import input_mapper_for_qwen
-    return input_mapper_for_qwen
-@pytest.fixture()
-def input_processor_for_qwen():
-    # Lazy import to avoid initializing CUDA during test collection
-    from vllm.model_executor.models.qwen import input_processor_for_qwen
-    return input_processor_for_qwen
-@pytest.fixture()
-def qwen_vl_context() -> InputContext:
-    """Get an InputContext for Qwen-VL."""
-    return build_model_context(model_name=os.path.join(models_path_prefix, "Qwen/Qwen-VL"),
-                               trust_remote_code=True)
-# Happy path tests for single/multi-image scenarios for the multimodal
-# input processor and mapper, respectively
-@pytest.mark.parametrize("num_images", [1, 2])
-def test_input_processor_valid_mm_data(input_processor_for_qwen,
-                                       qwen_vl_context: InputContext,
-                                       num_images: int):
-    """Happy cases for image inputs to Qwen's multimodal input processor."""
-    prompt = "".join(
-        [f"Picture {num}: <img></img>\n" for num in range(1, num_images + 1)])
-    inputs = token_inputs(
-        prompt=prompt,
-        # When processing multimodal data for a multimodal model, the qwen
-        # input processor will overwrite the provided prompt_token_ids with
-        # the image prompts
-        prompt_token_ids=[],
-        multi_modal_data={"image": torch.rand(num_images, TOKS_PER_IMG, 4096)},
-    )
-    proc_inputs = input_processor_for_qwen(qwen_vl_context, inputs)
-    assert isinstance(proc_inputs, dict)
-    # Each image should have one start / stop and a fixed context of 256
-    proc_tokens = proc_inputs["prompt_token_ids"]
-    assert proc_tokens.count(IMG_START_ID) == num_images
-    assert proc_tokens.count(IMG_END_ID) == num_images
-    assert proc_tokens.count(IMG_PAD_ID) == num_images * TOKS_PER_IMG
-@pytest.mark.parametrize(
-    "img_data,expected_shape",
-    [
-        # single / multi-image
-        (SAMPLE_IMAGE, (1, 3, IMG_SIZE, IMG_SIZE)),
-        (2 * [SAMPLE_IMAGE], (2, 3, IMG_SIZE, IMG_SIZE)),
-        # single / multi-image embeddings
-        (torch.rand(
-            (TOKS_PER_IMG, VIS_ENC_DIM)), (1, TOKS_PER_IMG, VIS_ENC_DIM)),
-        (torch.rand(
-            (1, TOKS_PER_IMG, VIS_ENC_DIM)), (1, TOKS_PER_IMG, VIS_ENC_DIM)),
-        (torch.rand(
-            (2, TOKS_PER_IMG, VIS_ENC_DIM)), (2, TOKS_PER_IMG, VIS_ENC_DIM)),
-    ])
-def test_input_mapper_valid_mm_data(input_mapper_for_qwen,
-                                    qwen_vl_context: InputContext,
-                                    img_data: Union[torch.Tensor, List[Image],
-                                                    Image],
-                                    expected_shape: List[int]):
-    """Happy cases for image inputs to Qwen's multimodal input mapper."""
-    mapped_img_data = input_mapper_for_qwen(qwen_vl_context, img_data)
-    # Ensure that we get the appropriately shaped pixel_values
-    # for images and image embeddings, respectively.
-    assert isinstance(mapped_img_data, MultiModalKwargs)
-    assert "pixel_values" in mapped_img_data
-    assert mapped_img_data["pixel_values"].shape == expected_shape
-# Sad path tests for the multimodal input processor and mapper, respectively
-@pytest.mark.parametrize("mm_data", [
-    {
-        "image": torch.rand(5)
-    },
-    {
-        "image": torch.rand((5, 5, 5, 5, 5))
-    },
-])
-def test_input_processor_invalid_mm_data(input_processor_for_qwen,
-                                         qwen_vl_context: InputContext,
-                                         mm_data: Dict[str, torch.Tensor]):
-    """Test sad cases validated in Qwen's multimodal input processor."""
-    tokenizer = cached_get_tokenizer(qwen_vl_context.model_config.tokenizer,
-                                     trust_remote_code=True)
-    prompt = "Picture 1: <img></img>\n"
-    prompt_token_ids = tokenizer.encode(prompt)
-    inputs = token_inputs(prompt=prompt,
-                          prompt_token_ids=prompt_token_ids,
-                          multi_modal_data=mm_data)
-    # Should fail since we have too many or too few dimensions for embeddings
-    with pytest.raises(ValueError):
-        input_processor_for_qwen(qwen_vl_context, inputs)
-@pytest.mark.parametrize(
-    "img_data",
-    [
-        # Wrong context length
-        torch.rand((1, TOKS_PER_IMG + 10, VIS_ENC_DIM)),
-        # Wrong visual encoder output size
-        torch.rand((1, TOKS_PER_IMG, VIS_ENC_DIM + 10)),
-    ])
-def test_input_mapper_invalid_mm_data(
-    input_mapper_for_qwen,
-    qwen_vl_context: InputContext,
-    img_data: Union[torch.Tensor, List[Image], Image],
-):
-    """Sad cases validated in Qwen VL's multimodal input mapper."""
-    with pytest.raises(ValueError):
-        input_mapper_for_qwen(qwen_vl_context, img_data)
--- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py
+++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py
-from typing import Any, Dict, Tuple
-import os
-import pytest
-from transformers import AutoTokenizer
-from vllm.inputs import InputContext, InputProcessingContext
-from .....conftest import _ImageAssets
-from ....utils import build_model_context
-from .....utils import models_path_prefix
-MODEL = os.path.join(models_path_prefix, "Qwen/Qwen2-VL-2B-Instruct")
-MIN_PIXELS = "min_pixels"
-MAX_PIXELS = "max_pixels"
-# Fixtures lazy import to avoid initializing CUDA during test collection
-# NOTE: Qwen2VL supports multiple input modalities, so it registers multiple
-# input mappers.
-@pytest.fixture()
-def processor_for_qwen2_vl():
-    from vllm.model_executor.models.qwen2_vl import Qwen2VLMultiModalProcessor
-    return Qwen2VLMultiModalProcessor
-@pytest.fixture()
-def get_max_qwen2_vl_image_tokens():
-    from vllm.model_executor.models.qwen2_vl import (
-        get_max_qwen2_vl_image_tokens)
-    return get_max_qwen2_vl_image_tokens
-@pytest.mark.parametrize("mm_processor_kwargs,expected_max_tokens", [
-    ({}, 1225),
-    ({
-        MIN_PIXELS: 64**2,
-        MAX_PIXELS: 512**2
-    }, 324),
-])
-@pytest.mark.parametrize("model", [MODEL])
-def test_qwen2_vl_max_image_tokens(
-    get_max_qwen2_vl_image_tokens,
-    model: str,
-    mm_processor_kwargs: Dict[str, Any],
-    expected_max_tokens: int,
-):
-    """Ensure that the max token calc handles min/max pixels properly."""
-    ctx = build_model_context(
-        model_name=model,
-        tokenizer_name=model,
-        mm_processor_kwargs=None,
-    )
-    actual_max_tokens = get_max_qwen2_vl_image_tokens(
-        InputContext(ctx.model_config), **mm_processor_kwargs)
-    assert actual_max_tokens == expected_max_tokens
-@pytest.mark.parametrize(
-    "mm_processor_kwargs, expected_toks_per_img, expected_pixels_shape", [
-        ({}, 1426, (5704, 1176)),
-        ({
-            MIN_PIXELS: 64**2,
-            MAX_PIXELS: 512**2
-        }, 330, (1320, 1176)),
-    ])
-@pytest.mark.parametrize("model", [MODEL])
-@pytest.mark.parametrize("num_imgs", [1, 2])
-def test_processor_override(
-    processor_for_qwen2_vl,
-    image_assets: _ImageAssets,
-    model: str,
-    mm_processor_kwargs: Dict[str, Any],
-    expected_toks_per_img: int,
-    expected_pixels_shape: Tuple[int, int],
-    num_imgs: int,
-):
-    """Ensure Qwen2VLMultiModalProcessor handles min/max pixels properly."""
-    # Same as the previous test - don't initialize mm_processor_kwargs
-    # in this test and assume that the kwargs will be correctly expanded by
-    # the partial when calling the custom input processor.
-    ctx = build_model_context(
-        model_name=model,
-        tokenizer_name=model,
-        mm_processor_kwargs=None,
-    )
-    tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
-    ctx = InputProcessingContext(ctx.model_config, tokenizer)
-    # Build the image str / prompt based on the number of images we pass
-    prompt = "<|vision_start|><|image_pad|><|vision_end|>" * num_imgs
-    images = [image_assets[0].pil_image] * num_imgs
-    mm_data = {"image": images}
-    processor = processor_for_qwen2_vl(ctx)
-    processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
-    # Ensure we have the right number of placeholders for the given pixel limits
-    hf_processor = processor._get_hf_processor(**mm_processor_kwargs)
-    image_token_id = tokenizer.convert_tokens_to_ids(hf_processor.image_token)
-    img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
-    pixel_shape = processed_inputs["mm_kwargs"]["pixel_values"].shape
-    assert img_tok_count == expected_toks_per_img * num_imgs
-    assert pixel_shape[0] == expected_pixels_shape[0] * num_imgs
-    assert pixel_shape[1] == expected_pixels_shape[1]
--- a/tests/models/decoder_only/vision_language/test_glm4.py
+++ b/tests/models/decoder_only/vision_language/test_glm4.py
-from typing import List, Optional, Tuple, Type
-import pytest
-import os
-from vllm.multimodal.utils import rescale_image_size
-from vllm.transformers_utils.tokenizer import patch_padding_side
-from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
-from ....utils import large_gpu_test
-from ...utils import check_logprobs_close
-from ....utils import models_path_prefix
-HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
-    "stop_sign":
-    "What's the content of the image?",
-    "cherry_blossom":
-    "What is the season?",
-})
-models = [os.path.join(models_path_prefix, "THUDM/glm-4v-9b")]
-target_dtype = "bfloat16"
-def run_test(
-    hf_runner: Type[HfRunner],
-    vllm_runner: Type[VllmRunner],
-    inputs: List[Tuple[List[str], PromptImageInput]],
-    model: str,
-    *,
-    dtype: str,
-    max_tokens: int,
-    num_logprobs: int,
-    mm_limit: int,
-    tensor_parallel_size: int,
-    distributed_executor_backend: Optional[str] = None,
-):
-    # max_model_len should be greater than image_feature_size
-    with vllm_runner(model,
-                     max_model_len=2048,
-                     max_num_seqs=2,
-                     dtype=dtype,
-                     limit_mm_per_prompt={"image": mm_limit},
-                     tensor_parallel_size=tensor_parallel_size,
-                     distributed_executor_backend=distributed_executor_backend,
-                     enforce_eager=True) as vllm_model:
-        stop_token_ids = [151329, 151336, 151338]
-        vllm_outputs_per_image = [
-            vllm_model.generate_greedy_logprobs(prompts,
-                                                max_tokens,
-                                                num_logprobs=num_logprobs,
-                                                images=images,
-                                                stop_token_ids=stop_token_ids)
-            for prompts, images in inputs
-        ]
-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_processor = hf_model.processor
-        patch_padding_side(hf_processor)
-        def processor(*args, text="", images=None, **kwargs):
-            if images is None:
-                return hf_processor(*args, **kwargs)
-            return hf_processor.apply_chat_template(
-                [{
-                    "role": "user",
-                    "image": images,
-                    "content": text
-                }],
-                add_generation_prompt=True,
-                tokenize=True,
-                return_dict=True,
-                **kwargs,
-            )
-        hf_model.processor = processor
-        hf_model.model.get_output_embeddings = lambda: \
-            hf_model.model.transformer.output_layer
-        hf_outputs_per_image = [
-            hf_model.generate_greedy_logprobs_limit(
-                prompts,
-                max_tokens,
-                num_logprobs=num_logprobs,
-                images=images,
-            ) for prompts, images in inputs
-        ]
-    for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
-                                        vllm_outputs_per_image):
-        check_logprobs_close(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=vllm_outputs,
-            name_0="hf",
-            name_1="vllm",
-        )
-@large_gpu_test(min_gb=48)
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize(
-    "size_factors",
-    [
-        # No image
-        [],
-        # Single-scale
-        [1.0],
-        # Single-scale, batched
-        [1.0, 1.0, 1.0],
-        # Multi-scale
-        [0.25, 0.5, 1.0],
-    ],
-)
-@pytest.mark.parametrize("dtype", [target_dtype])
-@pytest.mark.parametrize("max_tokens", [128])
-@pytest.mark.parametrize("num_logprobs", [5])
-def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
-                dtype: str, max_tokens: int, num_logprobs: int) -> None:
-    images = [asset.pil_image for asset in image_assets]
-    inputs_per_image = [(
-        [prompt for _ in size_factors],
-        [rescale_image_size(image, factor) for factor in size_factors],
-    ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
-    run_test(
-        hf_runner,
-        vllm_runner,
-        inputs_per_image,
-        model,
-        dtype=dtype,
-        max_tokens=max_tokens,
-        num_logprobs=num_logprobs,
-        mm_limit=1,
-        tensor_parallel_size=1,
-    )
\ No newline at end of file
--- a/tests/models/decoder_only/vision_language/test_h2ovl.py
+++ b/tests/models/decoder_only/vision_language/test_h2ovl.py
-from typing import Optional, Tuple
-import os
-import pytest
-import torch
-from PIL.Image import Image
-from transformers import AutoConfig
-# Import the functions to test
-from vllm.model_executor.models.h2ovl import (calculate_num_blocks,
-                                              image_to_pixel_values_wrapper)
-from vllm.multimodal.image import rescale_image_size
-from ....utils import models_path_prefix
-models = [
-    os.path.join(models_path_prefix, "h2oai/h2ovl-mississippi-800m"),  # Replace with your actual model names
-    os.path.join(models_path_prefix, "h2oai/h2ovl-mississippi-2b"),
-]
-def run_preprocessing_test(
-    image: Image,
-    config,
-    max_dynamic_patch: Optional[int] = None,
-) -> Tuple[torch.Tensor, int]:
-    """Test the image preprocessing and calculate expected blocks."""
-    if max_dynamic_patch is None:
-        max_dynamic_patch = config.max_dynamic_patch
-    width, height = image.size
-    use_MSAC = config.use_msac
-    # Create the mapper function with the provided configuration
-    mapper = image_to_pixel_values_wrapper(config, max_dynamic_patch, use_MSAC)
-    pixel_values = mapper(image)
-    # Calculate the expected number of blocks
-    if use_MSAC:
-        # First pass
-        blocks1, _, _, aspect_ratio = calculate_num_blocks(
-            width,
-            height,
-            config.min_dynamic_patch,
-            max_dynamic_patch,
-            config.vision_config.image_size,
-            use_thumbnail=False,  # Thumbnail is handled separately
-            prior_aspect_ratio=None,
-        )
-        # Second pass
-        blocks2, _, _, _ = calculate_num_blocks(
-            width,
-            height,
-            config.min_dynamic_patch,
-            max_dynamic_patch,
-            config.vision_config.image_size,
-            use_thumbnail=False,
-            prior_aspect_ratio=aspect_ratio,
-        )
-        # Add thumbnail if use_thumbnail is True and total_blocks > 1
-        if config.use_thumbnail:
-            blocks1 += 1 if blocks1 > 1 else 0
-            blocks2 += 1 if blocks2 > 1 else 0
-        # Total blocks is the sum of both passes minus the one overlapping block
-        total_blocks = blocks1 + blocks2 - 1
-        expected_blocks = total_blocks
-    else:
-        blocks, _, _, _ = calculate_num_blocks(
-            width,
-            height,
-            config.min_dynamic_patch,
-            max_dynamic_patch,
-            config.vision_config.image_size,
-            use_thumbnail=False,
-            prior_aspect_ratio=None,
-        )
-        expected_blocks = blocks
-        if config.use_thumbnail and expected_blocks > 1:
-            expected_blocks += 1
-    return pixel_values, expected_blocks
-@pytest.mark.parametrize("model_name", models)
-@pytest.mark.parametrize(
-    "size_factors",
-    [
-        # Single-scale
-        [1.0],
-        # Single-scale, batched
-        [1.0, 1.0, 1.0],
-        # Multi-scale
-        [0.25, 0.5, 1.0],
-    ],
-)
-@pytest.mark.parametrize("max_dynamic_patch", [None, 2, 4, 8])
-def test_image_preprocessing(image_assets, model_name, size_factors,
-                             max_dynamic_patch):
-    """Test image preprocessing pipeline with different configurations."""
-    # Load the configuration from the model
-    config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
-    for asset in image_assets:
-        image = asset.pil_image
-        for factor in size_factors:
-            scaled_image = rescale_image_size(image, factor)
-            # Test preprocessing and get expected number of blocks
-            pixel_values, expected_blocks = run_preprocessing_test(
-                scaled_image, config, max_dynamic_patch)
-            # Verify output shapes and properties
-            actual_blocks = pixel_values.shape[0]
-            assert actual_blocks == expected_blocks, (
-                f"Expected {expected_blocks} blocks, got {actual_blocks}")
-            # Check image dimensions
-            expected_size = (
-                3,  # Number of channels (C, H, W)
-                config.vision_config.image_size,
-                config.vision_config.image_size,
-            )
-            for img in pixel_values:
-                assert img.shape == expected_size, (
-                    f"Expected image size {expected_size}, got {img.shape}")
--- a/tests/models/decoder_only/vision_language/test_phi3v.py
+++ b/tests/models/decoder_only/vision_language/test_phi3v.py
-# SPDX-License-Identifier: Apache-2.0
-import os
-import re
-from typing import Optional
-import pytest
-from transformers import AutoTokenizer
-from vllm.multimodal.image import rescale_image_size
-from vllm.platforms import current_platform
-from vllm.sequence import SampleLogprobs
-from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
-from ...utils import check_logprobs_close
-from ....utils import models_path_prefix
-HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
-    "stop_sign":
-    "<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n",  # noqa: E501
-    "cherry_blossom":
-    "<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n",
-})
-HF_MULTIIMAGE_IMAGE_PROMPT = "<|user|>\n<|image_1|>\n<|image_2|>\nDescribe these images.<|end|>\n<|assistant|>\n"  # noqa: E501
-models = [os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct")]
-def vllm_to_hf_output(vllm_output: tuple[list[int], str,
-                                         Optional[SampleLogprobs]],
-                      model: str):
-    """Sanitize vllm output to be comparable with hf output."""
-    _, output_str, out_logprobs = vllm_output
-    output_str_without_image = re.sub(r"(<\|image_\d+\|>)+", "", output_str)
-    assert output_str_without_image[0] == " "
-    output_str_without_image = output_str_without_image[1:]
-    hf_output_str = output_str_without_image + "<|end|><|endoftext|>"
-    tokenizer = AutoTokenizer.from_pretrained(model)
-    hf_output_ids = tokenizer.encode(output_str_without_image)
-    assert hf_output_ids[0] == 1
-    hf_output_ids = hf_output_ids[1:]
-    return hf_output_ids, hf_output_str, out_logprobs
-target_dtype = "half"
-# ROCm Triton FA can run into shared memory issues with these models,
-# use other backends in the meantime
-# FIXME (mattwong, gshtrasb, hongxiayan)
-if current_platform.is_rocm():
-    os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
-def run_test(
-    hf_runner: type[HfRunner],
-    vllm_runner: type[VllmRunner],
-    inputs: list[tuple[list[str], PromptImageInput]],
-    model: str,
-    *,
-    dtype: str,
-    max_tokens: int,
-    num_logprobs: int,
-    mm_limit: int,
-    tensor_parallel_size: int,
-    distributed_executor_backend: Optional[str] = None,
-):
-    """Inference result should be the same between hf and vllm.
-    All the image fixtures for the test are from IMAGE_ASSETS.
-    For huggingface runner, we provide the PIL images as input.
-    For vllm runner, we provide MultiModalDataDict objects
-    and corresponding MultiModalConfig as input.
-    Note, the text input is also adjusted to abide by vllm contract.
-    The text output is sanitized to be able to compare with hf.
-    """
-    # HACK - this is an attempted workaround for the following bug
-    # https://github.com/huggingface/transformers/issues/34307
-    from transformers import AutoImageProcessor  # noqa: F401
-    from transformers import AutoProcessor  # noqa: F401
-    # NOTE: take care of the order. run vLLM first, and then run HF.
-    # vLLM needs a fresh new process without cuda initialization.
-    # if we run HF first, the cuda initialization will be done and it
-    # will hurt multiprocessing backend with fork method (the default method).
-    # max_model_len should be greater than image_feature_size
-    with vllm_runner(model,
-                     task="generate",
-                     max_model_len=4096,
-                     max_num_seqs=2,
-                     dtype=dtype,
-                     limit_mm_per_prompt={"image": mm_limit},
-                     tensor_parallel_size=tensor_parallel_size,
-                     distributed_executor_backend=distributed_executor_backend,
-                     enforce_eager=True) as vllm_model:
-        vllm_outputs_per_case = [
-            vllm_model.generate_greedy_logprobs(prompts,
-                                                max_tokens,
-                                                num_logprobs=num_logprobs,
-                                                images=images)
-            for prompts, images in inputs
-        ]
-    # use eager mode for hf runner, since phi3_v didn't work with flash_attn
-    hf_model_kwargs = {"_attn_implementation": "eager"}
-    with hf_runner(model, dtype=dtype,
-                   model_kwargs=hf_model_kwargs) as hf_model:
-        eos_token_id = hf_model.processor.tokenizer.eos_token_id
-        hf_outputs_per_case = [
-            hf_model.generate_greedy_logprobs_limit(prompts,
-                                                    max_tokens,
-                                                    num_logprobs=num_logprobs,
-                                                    images=images,
-                                                    eos_token_id=eos_token_id)
-            for prompts, images in inputs
-        ]
-    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
-                                        vllm_outputs_per_case):
-        check_logprobs_close(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=[
-                vllm_to_hf_output(vllm_output, model)
-                for vllm_output in vllm_outputs
-            ],
-            name_0="hf",
-            name_1="vllm",
-        )
-# Since we use _attn_implementation="eager" for hf_runner, there is more
-# significant numerical difference. The basic `logprobs=5` fails to pass.
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize(
-    "size_factors",
-    [
-        # No image
-        [],
-        # Single-scale
-        [1.0],
-        # Single-scale, batched
-        [1.0, 1.0, 1.0],
-        # Multi-scale
-        [0.25, 0.5, 1.0],
-    ],
-)
-@pytest.mark.parametrize("dtype", [target_dtype])
-@pytest.mark.parametrize("max_tokens", [128])
-@pytest.mark.parametrize("num_logprobs", [10])
-def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
-                dtype: str, max_tokens: int, num_logprobs: int) -> None:
-    images = [asset.pil_image for asset in image_assets]
-    inputs_per_image = [(
-        [prompt for _ in size_factors],
-        [rescale_image_size(image, factor) for factor in size_factors],
-    ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
-    run_test(
-        hf_runner,
-        vllm_runner,
-        inputs_per_image,
-        model,
-        dtype=dtype,
-        max_tokens=max_tokens,
-        num_logprobs=num_logprobs,
-        mm_limit=1,
-        tensor_parallel_size=1,
-    )
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("dtype", [target_dtype])
-def test_regression_7840(hf_runner, vllm_runner, image_assets, model,
-                         dtype) -> None:
-    images = [asset.pil_image for asset in image_assets]
-    inputs_regresion_7840 = [
-        ([prompt], [image]) for image, prompt in zip(images, HF_IMAGE_PROMPTS)
-    ]
-    # Regression test for #7840.
-    run_test(
-        hf_runner,
-        vllm_runner,
-        inputs_regresion_7840,
-        model,
-        dtype=dtype,
-        max_tokens=128,
-        num_logprobs=10,
-        mm_limit=1,
-        tensor_parallel_size=1,
-    )
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize(
-    "size_factors",
-    [
-        # No image
-        [],
-        # Single-scale
-        [1.0],
-        # Single-scale, batched
-        [1.0, 1.0, 1.0],
-        # Multi-scale
-        [0.25, 0.5, 1.0],
-    ],
-)
-@pytest.mark.parametrize("dtype", [target_dtype])
-@pytest.mark.parametrize("max_tokens", [128])
-@pytest.mark.parametrize("num_logprobs", [10])
-def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
-                             size_factors, dtype: str, max_tokens: int,
-                             num_logprobs: int) -> None:
-    images = [asset.pil_image for asset in image_assets]
-    inputs_per_case = [
-        ([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
-         [[rescale_image_size(image, factor) for image in images]
-          for factor in size_factors])
-    ]
-    run_test(
-        hf_runner,
-        vllm_runner,
-        inputs_per_case,
-        model,
-        dtype=dtype,
-        max_tokens=max_tokens,
-        num_logprobs=num_logprobs,
-        mm_limit=2,
-        tensor_parallel_size=1,
-    )
--- a/tests/models/embedding/language/test_embedding.py
+++ b/tests/models/embedding/language/test_embedding.py
@@ -48,7 +48,7 @@ def test_models(
    monkeypatch,
 ) -> None:
-    if model == "BAAI/bge-multilingual-gemma2" and current_platform.is_rocm():
+    if model == os.path.join(models_path_prefix, "BAAI/bge-multilingual-gemma2") and current_platform.is_rocm():
        # ROCm Triton FA does not currently support sliding window attention
        # switch to use ROCm CK FA backend
        monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False")

--- a/tests/models/embedding/language/test_scoring.py
+++ b/tests/models/embedding/language/test_scoring.py
@@ -12,12 +12,12 @@ import torch.nn.functional as F
 from ....utils import models_path_prefix
 MODELS = [
-    os.path.join(models_path_prefix, "cross-encoder/ms-marco-MiniLM-L-6-v2"),  # Bert
+    # os.path.join(models_path_prefix, "cross-encoder/ms-marco-MiniLM-L-6-v2"),  # Bert
    os.path.join(models_path_prefix, "BAAI/bge-reranker-v2-m3"),  # Roberta
 ]
 EMBEDDING_MODELS = [
-    "sentence-transformers/all-MiniLM-L12-v2",
+    # os.path.join(models_path_prefix, "sentence-transformers/all-MiniLM-L12-v2"),
 ]
 TEXTS_1 = [

--- a/tests/models/embedding/language/test_snowflake_arctic_embed.py
+++ b/tests/models/embedding/language/test_snowflake_arctic_embed.py
@@ -15,10 +15,10 @@ EMBEDDING_PROMPTS = [
 ]
 MODELS = [
-    EmbedModelInfo("Snowflake/snowflake-arctic-embed-xs",
+    # EmbedModelInfo("Snowflake/snowflake-arctic-embed-xs",
-                   is_matryoshka=False,
+    #                is_matryoshka=False,
-                   architecture="BertModel",
+    #                architecture="BertModel",
-                   enable_test=True),
+    #                enable_test=True),
    EmbedModelInfo("Snowflake/snowflake-arctic-embed-s",
                   is_matryoshka=False,
                   architecture="BertModel",
@@ -43,10 +43,10 @@ MODELS = [
                   is_matryoshka=True,
                   architecture="XLMRobertaModel",
                   enable_test=True),
-    EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v2.0",
+    # EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v2.0",
-                   is_matryoshka=True,
+    #                is_matryoshka=True,
-                   architecture="GteModel",
+    #                architecture="GteModel",
-                   enable_test=True),
+    #                enable_test=True),
 ]

--- a/tests/models/encoder_decoder/audio_language/test_whisper.py
+++ b/tests/models/encoder_decoder/audio_language/test_whisper.py
@@ -5,12 +5,13 @@ Run `pytest tests/models/encoder_decoder/audio/test_whisper.py`.
 """
 from typing import Optional
+import os
 import pytest
 from vllm import LLM, SamplingParams
 from vllm.assets.audio import AudioAsset
-from ....utils import create_new_process_for_each_test, multi_gpu_test
+from ....utils import create_new_process_for_each_test, multi_gpu_test, models_path_prefix
 PROMPTS = [
    {
@@ -33,7 +34,7 @@ PROMPTS = [
 ]
 EXPECTED = {
-    "openai/whisper-tiny": [
+    os.path.join(models_path_prefix, "openai/whisper-tiny"): [
        " He has birth words I spoke in the original corner of that. And a"
        " little piece of black coat poetry. Mary had a little sandwich,"
        " sweet, with white and snow. And everyone had it very went the last"
@@ -45,7 +46,7 @@ EXPECTED = {
        " American League Championship. I don't believe it. It just continues"
        " by all five."
    ],
-    "openai/whisper-small": [
+    os.path.join(models_path_prefix, "openai/whisper-small"): [
        " The first words I spoke in the original pornograph. A little piece"
        " of practical poetry. Mary had a little lamb, its fleece was quite a"
        " slow, and everywhere that Mary went the lamb was sure to go.",
@@ -55,7 +56,7 @@ EXPECTED = {
        " play for the American League Championship. I don't believe it. It"
        " just continues. My, oh my."
    ],
-    "openai/whisper-medium": [
+    os.path.join(models_path_prefix, "openai/whisper-medium"): [
        " The first words I spoke in the original phonograph, a little piece"
        " of practical poetry. Mary had a little lamb, its fleece was quite as"
        " slow, and everywhere that Mary went the lamb was sure to go.",
@@ -66,7 +67,7 @@ EXPECTED = {
        " League Championship. I don't believe it. It just continues. My, oh"
        " my."
    ],
-    "openai/whisper-large-v3": [
+    os.path.join(models_path_prefix, "openai/whisper-large-v3"): [
        " The first words I spoke in the original phonograph, a little piece"
        " of practical poetry. Mary had a little lamb, its feet were quite as"
        " slow, and everywhere that Mary went, the lamb was sure to go.",
@@ -77,7 +78,7 @@ EXPECTED = {
        " League Championship. I don't believe it. It just continues. My, oh,"
        " my."
    ],
-    "openai/whisper-large-v3-turbo": [
+    os.path.join(models_path_prefix, "openai/whisper-large-v3-turbo"): [
        " The first words I spoke in the original phonograph, a little piece"
        " of practical poetry. Mary had a little lamb, its streets were quite"
        " as slow, and everywhere that Mary went the lamb was sure to go.",
@@ -122,14 +123,14 @@ def run_test(
 @create_new_process_for_each_test()
 @pytest.mark.core_model
 @pytest.mark.parametrize(
-    "model", ["openai/whisper-small", "openai/whisper-large-v3-turbo"])
+    "model", [os.path.join(models_path_prefix, "openai/whisper-small"), os.path.join(models_path_prefix, "openai/whisper-large-v3-turbo")])
 def test_models(model) -> None:
    run_test(model, tensor_parallel_size=1)
 @multi_gpu_test(num_gpus=2)
 @pytest.mark.core_model
-@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
+@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "openai/whisper-large-v3-turbo")])
 @pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
 def test_models_distributed(model, distributed_executor_backend) -> None:
    run_test(model,

--- a/tests/models/encoder_decoder/language/test_bart.py
+++ b/tests/models/encoder_decoder/language/test_bart.py
@@ -179,7 +179,8 @@ def run_test(
        pytest.param(os.path.join(models_path_prefix, "facebook/bart-large-cnn")),
    ],
 )
-@pytest.mark.parametrize("dtype", ["float", "bfloat16"])
+# @pytest.mark.parametrize("dtype", ["float", "bfloat16"])
+@pytest.mark.parametrize("dtype", ["bfloat16"])
 @pytest.mark.parametrize("max_tokens", [64])
 @pytest.mark.parametrize("num_logprobs", [5])
 @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
@@ -201,7 +202,7 @@ def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts, model,
 @multi_gpu_test(num_gpus=2)
 @pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
-@pytest.mark.parametrize("model", ["facebook/bart-large-cnn"])
+@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/bart-large-cnn")])
 @pytest.mark.parametrize("dtype", ["float"])
 @pytest.mark.parametrize("max_tokens", [64])
 @pytest.mark.parametrize("num_logprobs", [5])

--- a/tests/models/encoder_decoder/vision_language/test_broadcast.py
+++ b/tests/models/encoder_decoder/vision_language/test_broadcast.py
@@ -20,7 +20,7 @@ def test_models(hf_runner, vllm_runner, image_assets,
    num_logprobs = 5
    tensor_parallel_size = 2
-    if "meta-llama/Llama-3.2-11B-Vision-Instruct" in model:
+    if os.path.join(models_path_prefix, "meta-llama/Llama-3.2-11B-Vision-Instruct") in model:
    # if model.startswith("meta-llama/Llama-3.2-11B-Vision-Instruct"):
        from .test_mllama import models, run_test
    else:

--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -4,6 +4,7 @@ from functools import partial
 from typing import Optional, Union
 import numpy as np
+import os
 import pytest
 from mistral_common.protocol.instruct.messages import (ImageChunk, TextChunk,
                                                       UserMessage)
@@ -21,6 +22,7 @@ from vllm.transformers_utils.tokenizer import (MistralTokenizer,
 from ....multimodal.utils import random_audio, random_image, random_video
 from ...registry import HF_EXAMPLE_MODELS
+from ....utils import models_path_prefix
 def _test_processing_correctness(
@@ -245,48 +247,48 @@ def _test_processing_correctness_mistral(
 # yapf: disable
 @pytest.mark.parametrize("model_id", [
-    "rhymes-ai/Aria",
+    os.path.join(models_path_prefix, "rhymes-ai/Aria"),
-    "CohereForAI/aya-vision-8b",
+    os.path.join(models_path_prefix, "CohereForAI/aya-vision-8b"),
-    "Salesforce/blip2-opt-2.7b",
+    os.path.join(models_path_prefix, "Salesforce/blip2-opt-2.7b"),
-    "facebook/chameleon-7b",
+    os.path.join(models_path_prefix, "facebook/chameleon-7b"),
-    "deepseek-ai/deepseek-vl2-tiny",
+    os.path.join(models_path_prefix, "deepseek-ai/deepseek-vl2-tiny"),
-    "microsoft/Florence-2-base",
+    os.path.join(models_path_prefix, "microsoft/Florence-2-base"),
-    "adept/fuyu-8b",
+    os.path.join(models_path_prefix, "adept/fuyu-8b"),
-    "google/gemma-3-4b-it",
+    os.path.join(models_path_prefix, "google/gemma-3-4b-it"),
-    "THUDM/glm-4v-9b",
+    os.path.join(models_path_prefix, "THUDM/glm-4v-9b"),
-    "ibm-granite/granite-speech-3.3-8b",
+    os.path.join(models_path_prefix, "ibm-granite/granite-speech-3.3-8b"),
-    "h2oai/h2ovl-mississippi-800m",
+    os.path.join(models_path_prefix, "h2oai/h2ovl-mississippi-800m"),
-    "OpenGVLab/InternVL2-1B",
+    os.path.join(models_path_prefix, "OpenGVLab/InternVL2-1B"),
-    "HuggingFaceM4/Idefics3-8B-Llama3",
+    os.path.join(models_path_prefix, "HuggingFaceM4/Idefics3-8B-Llama3"),
-    "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
+    os.path.join(models_path_prefix, "HuggingFaceTB/SmolVLM2-2.2B-Instruct"),
-    "moonshotai/Kimi-VL-A3B-Instruct",
+    os.path.join(models_path_prefix, "moonshotai/Kimi-VL-A3B-Instruct"),
-    "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+    os.path.join(models_path_prefix, "meta-llama/Llama-4-Scout-17B-16E-Instruct"),
-    "llava-hf/llava-1.5-7b-hf",
+    os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf"),
-    "llava-hf/llava-v1.6-mistral-7b-hf",
+    os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf"),
-    "llava-hf/LLaVA-NeXT-Video-7B-hf",
+    os.path.join(models_path_prefix, "llava-hf/LLaVA-NeXT-Video-7B-hf"),
-    "llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
+    os.path.join(models_path_prefix, "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"),
-    "meta-llama/Llama-3.2-11B-Vision-Instruct",
+    os.path.join(models_path_prefix, "meta-llama/Llama-3.2-11B-Vision-Instruct"),
-    "TIGER-Lab/Mantis-8B-siglip-llama3",
+    os.path.join(models_path_prefix, "TIGER-Lab/Mantis-8B-siglip-llama3"),
-    "openbmb/MiniCPM-Llama3-V-2_5",
+    os.path.join(models_path_prefix, "openbmb/MiniCPM-Llama3-V-2_5"),
-    "openbmb/MiniCPM-o-2_6",
+    os.path.join(models_path_prefix, "openbmb/MiniCPM-o-2_6"),
-    "openbmb/MiniCPM-V-2_6",
+    os.path.join(models_path_prefix, "openbmb/MiniCPM-V-2_6"),
-    "allenai/Molmo-7B-D-0924",
+    os.path.join(models_path_prefix, "allenai/Molmo-7B-D-0924"),
-    "allenai/Molmo-7B-O-0924",
+    os.path.join(models_path_prefix, "allenai/Molmo-7B-O-0924"),
-    "nvidia/NVLM-D-72B",
+    os.path.join(models_path_prefix, "nvidia/NVLM-D-72B"),
-    "google/paligemma-3b-mix-224",
+    os.path.join(models_path_prefix, "google/paligemma-3b-mix-224"),
-    "google/paligemma2-3b-ft-docci-448",
+    os.path.join(models_path_prefix, "google/paligemma2-3b-ft-docci-448"),
-    "microsoft/Phi-4-multimodal-instruct",
+    os.path.join(models_path_prefix, "microsoft/Phi-4-multimodal-instruct"),
-    "mistralai/Pixtral-12B-2409",
+    os.path.join(models_path_prefix, "mistralai/Pixtral-12B-2409"),
-    "mistral-community/pixtral-12b",
+    os.path.join(models_path_prefix, "mistral-community/pixtral-12b"),
-    "Qwen/Qwen-VL-Chat",
+    os.path.join(models_path_prefix, "Qwen/Qwen-VL-Chat"),
-    "Qwen/Qwen2-VL-2B-Instruct",
+    os.path.join(models_path_prefix, "Qwen/Qwen2-VL-2B-Instruct"),
-    "Qwen/Qwen2.5-VL-3B-Instruct",
+    os.path.join(models_path_prefix, "Qwen/Qwen2.5-VL-3B-Instruct"),
-    "Qwen/Qwen2-Audio-7B-Instruct",
+    os.path.join(models_path_prefix, "Qwen/Qwen2-Audio-7B-Instruct"),
-    "Qwen/Qwen2.5-Omni-7B",
+    os.path.join(models_path_prefix, "Qwen/Qwen2.5-Omni-7B"),
-    "Skywork/Skywork-R1V-38B",
+    os.path.join(models_path_prefix, "Skywork/Skywork-R1V-38B"),
-    "fixie-ai/ultravox-v0_5-llama-3_2-1b",
+    os.path.join(models_path_prefix, "fixie-ai/ultravox-v0_5-llama-3_2-1b"),
-    "openai/whisper-large-v3",
+    os.path.join(models_path_prefix, "openai/whisper-large-v3"),
-    "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+    os.path.join(models_path_prefix, "meta-llama/Llama-4-Scout-17B-16E-Instruct"),
 ])
 @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
 @pytest.mark.parametrize("num_batches", [32])
@@ -315,7 +317,7 @@ def test_processing_correctness(
 # yapf: disable
-@pytest.mark.parametrize("model_id", ["microsoft/Phi-3.5-vision-instruct"])
+@pytest.mark.parametrize("model_id", [os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct")])
 @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
 @pytest.mark.parametrize("num_batches", [32])
 @pytest.mark.parametrize("simplify_rate", [1.0])

--- a/tests/models/multimodal/processing/test_internvl.py
+++ b/tests/models/multimodal/processing/test_internvl.py
@@ -75,7 +75,7 @@ def _run_check(
    assert pixel_shape[0] == total_expected_num_patches
-@pytest.mark.parametrize("model_id", [os.path.join(models_path_prefix, "OpenGVLab/InternVL2-2B")])
+@pytest.mark.parametrize("model_id", [os.path.join(models_path_prefix, "OpenGVLab/InternVL2-2B")])
 @pytest.mark.parametrize(
    "size_factors",
    [

--- a/tests/models/multimodal/processing/test_llama4.py
+++ b/tests/models/multimodal/processing/test_llama4.py
 # SPDX-License-Identifier: Apache-2.0
 """Tests for Llama4's multimodal preprocessing kwargs."""
+import os
 import pytest
 from vllm.multimodal import MULTIMODAL_REGISTRY
@@ -8,10 +9,11 @@ from vllm.transformers_utils.tokenizer import encode_tokens
 from ....conftest import _ImageAssets
 from ...utils import build_model_context
+from ....utils import models_path_prefix
 @pytest.mark.parametrize("model_id",
-                         ["meta-llama/Llama-4-Scout-17B-16E-Instruct"])
+                         [os.path.join(models_path_prefix, "meta-llama/Llama-4-Scout-17B-16E-Instruct")])
 @pytest.mark.parametrize("mm_processor_kwargs", [{}])
 @pytest.mark.parametrize("num_imgs", [1, 5])
 @pytest.mark.parametrize("disable_mm_preprocessor_cache", [True, False])

--- a/tests/models/multimodal/processing/test_llava_next.py
+++ b/tests/models/multimodal/processing/test_llava_next.py
@@ -3,6 +3,7 @@
 import itertools
 from functools import partial
+import os
 import pytest
 from PIL import Image
 from pqdm.threads import pqdm
@@ -12,6 +13,7 @@ from vllm.multimodal.parse import ImageSize
 from vllm.multimodal.processing import BaseMultiModalProcessor
 from ...utils import build_model_context
+from ....utils import models_path_prefix
 def _validate_image_max_tokens_one(
@@ -32,7 +34,7 @@ def _validate_image_max_tokens_one(
 @pytest.mark.skip("This test takes around 5 minutes to run. "
                  "Comment this out to run it manually.")
-@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
+@pytest.mark.parametrize("model_id", [os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf")])
 def test_processor_max_tokens(model_id):
    ctx = build_model_context(
        model_id,
@@ -127,7 +129,7 @@ def _test_image_prompt_replacements(
        raise AssertionError(msg)
-@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
+@pytest.mark.parametrize("model_id", [os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf")])
 @pytest.mark.parametrize("num_imgs", [1, 2])
 def test_processor_prompt_replacements_regression(model_id, num_imgs):
    ctx = build_model_context(
@@ -153,7 +155,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
 @pytest.mark.skip("This test takes around 2 hours to run. "
                  "Comment this out to run it manually.")
-@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
+@pytest.mark.parametrize("model_id", [os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf")])
 @pytest.mark.parametrize("num_imgs", [1])
 def test_processor_prompt_replacements_all(model_id, num_imgs):
    ctx = build_model_context(