Commit 04629132 authored by zhuwenwen's avatar zhuwenwen
Browse files

[tests] fix tests

parent 07c69390
import os
import pytest
from vllm.inputs import InputContext
from ....utils import build_model_context
from .....utils import models_path_prefix
@pytest.fixture()
def get_max_llava_next_image_tokens():
    """Provide vLLM's ``get_max_llava_next_image_tokens`` function.

    The import is performed when the fixture is requested rather than when
    this module is imported.
    """
    from vllm.model_executor.models.llava_next import (
        get_max_llava_next_image_tokens as max_tokens_fn)
    return max_tokens_fn
@pytest.fixture()
def dummy_data_for_llava_next():
    """Provide vLLM's ``dummy_data_for_llava_next`` function.

    The import is performed when the fixture is requested rather than when
    this module is imported.
    """
    from vllm.model_executor.models.llava_next import (
        dummy_data_for_llava_next as dummy_data_fn)
    return dummy_data_fn
@pytest.mark.parametrize("gridpoints,expected_max_tokens", [
    ([[336, 336]], 1176),
    ([[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]], 2928),
])
def test_get_max_llava_next_image_tokens(gridpoints, expected_max_tokens,
                                         get_max_llava_next_image_tokens):
    """Check the max image token count after overriding the grid pinpoints."""
    model_path = os.path.join(models_path_prefix,
                              "llava-hf/llava-v1.6-mistral-7b-hf")
    ctx = build_model_context(model_name=model_path)

    # Swap in the parametrized image_grid_pinpoints before computing the
    # resulting max tokens.
    ctx.model_config.hf_config.image_grid_pinpoints = gridpoints

    input_ctx = InputContext(ctx.model_config)
    actual_max_tokens = get_max_llava_next_image_tokens(input_ctx)

    assert expected_max_tokens == actual_max_tokens
@pytest.mark.parametrize(
    "gridpoints,expected_size",
    [
        # A lone pinpoint is necessarily the largest one
        ([[336, 336]], (336, 336)),
        # Default for most llava next models; the 2x2 tile is the largest
        ([[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]],
         (672, 672)),
        # If two rectangular gridpoints are the same, the more vertical
        # one has the higher feature count due to newline features
        ([[336, 672], [672, 336]], (672, 336))
    ])
def test_dummy_data_for_llava_next_feature_size(dummy_data_for_llava_next,
                                                gridpoints, expected_size):
    """Check that the dummy image uses the expected (height, width)."""
    model_path = os.path.join(models_path_prefix,
                              "llava-hf/llava-v1.6-mistral-7b-hf")
    ctx = build_model_context(model_name=model_path)

    # Swap in the parametrized image_grid_pinpoints
    ctx.model_config.hf_config.image_grid_pinpoints = gridpoints

    # Bigger than the max feature size for any image
    seq_len = 5000
    dummy_data = dummy_data_for_llava_next(
        ctx,
        seq_len=seq_len,
        mm_counts={"image": 1},
    )
    seq_data = dummy_data.seq_data
    mm_data = dummy_data.multi_modal_data

    # The dummy image dims should match the gridpoint with the biggest
    # feature size; expected_size is ordered (height, width).
    assert mm_data["image"].height == expected_size[0]
    assert mm_data["image"].width == expected_size[1]
    # The dummy sequence must hold at least seq_len tokens.
    assert len(seq_data.get_token_ids()) >= seq_len
"""Tests for phi3v's multimodal preprocessing kwargs."""
from typing import Optional
import os
import pytest
from transformers import AutoTokenizer
from vllm.inputs import InputContext, InputProcessingContext
from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID
from .....conftest import _ImageAssets
from ....utils import build_model_context
from .....utils import models_path_prefix
# Model paths used to parametrize the phi3v tests below; each is resolved
# under models_path_prefix.
models = [os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct")]
# Imports are deferred into fixtures to avoid initializing CUDA during test
# collection
@pytest.fixture()
def processor_for_phi3v():
    """Provide the ``Phi3VMultiModalProcessor`` class via a lazy import."""
    from vllm.model_executor.models.phi3v import (
        Phi3VMultiModalProcessor as processor_cls)
    return processor_cls
@pytest.fixture()
def get_max_phi3v_image_tokens():
    """Provide vLLM's ``get_max_phi3v_image_tokens`` via a lazy import."""
    from vllm.model_executor.models.phi3v import (
        get_max_phi3v_image_tokens as max_tokens_fn)
    return max_tokens_fn
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("num_crops,expected_max_tokens", [
    (4, 781),
    (16, 2653),
])
def test_max_tokens_override(get_max_phi3v_image_tokens, model: str,
                             num_crops: int, expected_max_tokens: int):
    """Ensure get_max_phi3v_image_tokens handles num_crops properly."""
    # NOTE: the context is deliberately built with mm_processor_kwargs=None.
    # This test calls the max-tokens function directly and passes num_crops
    # as an explicit keyword, so the context-level kwargs play no part. In
    # practice the processor kwargs are wrapped in a closure when the max
    # tokens func is called; keeping them off the context here guards
    # against the implementation mixing the function kwargs with the
    # original mm_processor_kwargs and ending up in a bad state.
    context_kwargs = dict(
        model_name=model,
        tokenizer_name=model,
        trust_remote_code=True,
        mm_processor_kwargs=None,
    )
    ctx = build_model_context(**context_kwargs)

    input_ctx = InputContext(ctx.model_config)
    actual_max_tokens = get_max_phi3v_image_tokens(input_ctx,
                                                   num_crops=num_crops)

    assert expected_max_tokens == actual_max_tokens
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "num_crops,expected_toks_per_img",
    [
        (4, 757),
        (16, 1921),
        # the default num_crops of phi-3.5-vision is 4
        (None, 757),
    ])
@pytest.mark.parametrize("num_imgs", [1, 2])
def test_processor_override(processor_for_phi3v, image_assets: _ImageAssets,
                            model: str, num_crops: Optional[int],
                            expected_toks_per_img: int, num_imgs: int):
    """Ensure the phi3v multimodal processor handles num_crops properly."""
    # As in the max-tokens test, mm_processor_kwargs is not set on the model
    # context; the kwargs are handed to the processor call directly.
    model_ctx = build_model_context(
        model_name=model,
        tokenizer_name=model,
        trust_remote_code=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
    ctx = InputProcessingContext(model_ctx.model_config, tokenizer)

    # One numbered image tag per image, starting from 1
    image_tags = [f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)]
    img_str = "".join(image_tags)
    prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n"

    # The same asset image is reused for every slot
    mm_data = {"image": [image_assets[0].pil_image] * num_imgs}

    # Only override num_crops when the case specifies one
    if num_crops is None:
        mm_processor_kwargs = {}
    else:
        mm_processor_kwargs = {"num_crops": num_crops}

    processor = processor_for_phi3v(ctx)
    processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)

    # The placeholder count must match the expectation for this num_crops
    prompt_token_ids = processed_inputs["prompt_token_ids"]
    img_tok_count = prompt_token_ids.count(_IMAGE_TOKEN_ID)
    assert img_tok_count == expected_toks_per_img * num_imgs
"""Tests for Qwen's multimodal preprocessing kwargs."""
from typing import Dict, List, Union
import os
import pytest
import torch
from PIL.Image import Image
from vllm.inputs import InputContext, token_inputs
from vllm.multimodal import MultiModalKwargs
from vllm.multimodal.utils import cached_get_tokenizer
from .....conftest import IMAGE_ASSETS
from ....utils import build_model_context
from .....utils import models_path_prefix
### Multimodal preprocessing tests
# Image used by the image-input cases of the mapper tests below.
SAMPLE_IMAGE = IMAGE_ASSETS[0].pil_image
# These values are specific to Qwen-VL/Chat; we can get these from the model
# config also, but they are hardcoded here to keep the parameterize/fixtures
# easy to read.
# Token ids counted in the processed prompt (image start / end / padding).
IMG_START_ID = 151857
IMG_END_ID = 151858
IMG_PAD_ID = 151859
# Number of padding tokens expected per image.
TOKS_PER_IMG = 256
# Size of the last dimension of the image embedding tensors used below.
VIS_ENC_DIM = 4096
# Expected height and width of the mapped pixel_values.
IMG_SIZE = 448
@pytest.fixture()
def input_mapper_for_qwen():
    """Provide vLLM's ``input_mapper_for_qwen`` function."""
    # Imported lazily to avoid initializing CUDA during test collection
    from vllm.model_executor.models.qwen import (
        input_mapper_for_qwen as mapper_fn)
    return mapper_fn
@pytest.fixture()
def input_processor_for_qwen():
    """Provide vLLM's ``input_processor_for_qwen`` function."""
    # Imported lazily to avoid initializing CUDA during test collection
    from vllm.model_executor.models.qwen import (
        input_processor_for_qwen as processor_fn)
    return processor_fn
@pytest.fixture()
def qwen_vl_context() -> InputContext:
    """Build the Qwen-VL model context shared by the tests in this module."""
    model_path = os.path.join(models_path_prefix, "Qwen/Qwen-VL")
    return build_model_context(model_name=model_path, trust_remote_code=True)
# Happy path tests for single/multi-image scenarios for the multimodal
# input processor and mapper, respectively
@pytest.mark.parametrize("num_images", [1, 2])
def test_input_processor_valid_mm_data(input_processor_for_qwen,
                                       qwen_vl_context: InputContext,
                                       num_images: int):
    """Happy cases for image inputs to Qwen's multimodal input processor.

    Args:
        input_processor_for_qwen: Fixture returning the Qwen input processor.
        qwen_vl_context: Fixture returning the Qwen-VL model context.
        num_images: Number of image embeddings (and prompt image tags).
    """
    prompt = "".join(
        [f"Picture {num}: <img></img>\n" for num in range(1, num_images + 1)])
    inputs = token_inputs(
        prompt=prompt,
        # When processing multimodal data for a multimodal model, the qwen
        # input processor will overwrite the provided prompt_token_ids with
        # the image prompts
        prompt_token_ids=[],
        # Use the shared VIS_ENC_DIM constant (rather than a bare 4096) so
        # the embedding width stays in sync with the other tests here.
        multi_modal_data={
            "image": torch.rand(num_images, TOKS_PER_IMG, VIS_ENC_DIM)
        },
    )
    proc_inputs = input_processor_for_qwen(qwen_vl_context, inputs)
    assert isinstance(proc_inputs, dict)

    # Each image should have one start / stop and a fixed context of 256
    proc_tokens = proc_inputs["prompt_token_ids"]
    assert proc_tokens.count(IMG_START_ID) == num_images
    assert proc_tokens.count(IMG_END_ID) == num_images
    assert proc_tokens.count(IMG_PAD_ID) == num_images * TOKS_PER_IMG
@pytest.mark.parametrize(
    "img_data,expected_shape",
    [
        # single / multi-image
        (SAMPLE_IMAGE, (1, 3, IMG_SIZE, IMG_SIZE)),
        (2 * [SAMPLE_IMAGE], (2, 3, IMG_SIZE, IMG_SIZE)),
        # single / multi-image embeddings
        (torch.rand((TOKS_PER_IMG, VIS_ENC_DIM)),
         (1, TOKS_PER_IMG, VIS_ENC_DIM)),
        (torch.rand((1, TOKS_PER_IMG, VIS_ENC_DIM)),
         (1, TOKS_PER_IMG, VIS_ENC_DIM)),
        (torch.rand((2, TOKS_PER_IMG, VIS_ENC_DIM)),
         (2, TOKS_PER_IMG, VIS_ENC_DIM)),
    ])
def test_input_mapper_valid_mm_data(input_mapper_for_qwen,
                                    qwen_vl_context: InputContext,
                                    img_data: Union[torch.Tensor, List[Image],
                                                    Image],
                                    expected_shape: List[int]):
    """Happy cases for image inputs to Qwen's multimodal input mapper."""
    mapped = input_mapper_for_qwen(qwen_vl_context, img_data)

    # The mapper must return MultiModalKwargs holding pixel_values of the
    # expected shape, for images and image embeddings alike.
    assert isinstance(mapped, MultiModalKwargs)
    assert "pixel_values" in mapped
    pixel_values = mapped["pixel_values"]
    assert pixel_values.shape == expected_shape
# Sad path tests for the multimodal input processor and mapper, respectively
@pytest.mark.parametrize("mm_data", [
    {
        "image": torch.rand(5)
    },
    {
        "image": torch.rand((5, 5, 5, 5, 5))
    },
])
def test_input_processor_invalid_mm_data(input_processor_for_qwen,
                                         qwen_vl_context: InputContext,
                                         mm_data: Dict[str, torch.Tensor]):
    """Test sad cases validated in Qwen's multimodal input processor."""
    tokenizer_name = qwen_vl_context.model_config.tokenizer
    tokenizer = cached_get_tokenizer(tokenizer_name, trust_remote_code=True)

    prompt = "Picture 1: <img></img>\n"
    inputs = token_inputs(
        prompt=prompt,
        prompt_token_ids=tokenizer.encode(prompt),
        multi_modal_data=mm_data,
    )

    # The embeddings have too few or too many dimensions, so the processor
    # is expected to reject them.
    with pytest.raises(ValueError):
        input_processor_for_qwen(qwen_vl_context, inputs)
@pytest.mark.parametrize(
    "img_data",
    [
        # Wrong context length
        torch.rand((1, TOKS_PER_IMG + 10, VIS_ENC_DIM)),
        # Wrong visual encoder output size
        torch.rand((1, TOKS_PER_IMG, VIS_ENC_DIM + 10)),
    ])
def test_input_mapper_invalid_mm_data(
    input_mapper_for_qwen,
    qwen_vl_context: InputContext,
    img_data: Union[torch.Tensor, List[Image], Image],
):
    """Sad cases validated in Qwen VL's multimodal input mapper."""
    # Mis-shaped embeddings are expected to be rejected by the mapper.
    with pytest.raises(ValueError):
        input_mapper_for_qwen(qwen_vl_context, img_data)
from typing import Any, Dict, Tuple
import os
import pytest
from transformers import AutoTokenizer
from vllm.inputs import InputContext, InputProcessingContext
from .....conftest import _ImageAssets
from ....utils import build_model_context
from .....utils import models_path_prefix
# Model path used to parametrize the Qwen2-VL tests below; resolved under
# models_path_prefix.
MODEL = os.path.join(models_path_prefix, "Qwen/Qwen2-VL-2B-Instruct")
# Keys used in the mm_processor_kwargs dicts of the parametrized cases.
MIN_PIXELS = "min_pixels"
MAX_PIXELS = "max_pixels"
# Fixtures lazy import to avoid initializing CUDA during test collection
# NOTE: Qwen2VL supports multiple input modalities, so it registers multiple
# input mappers.
@pytest.fixture()
def processor_for_qwen2_vl():
    """Provide the ``Qwen2VLMultiModalProcessor`` class via a lazy import."""
    from vllm.model_executor.models.qwen2_vl import (
        Qwen2VLMultiModalProcessor as processor_cls)
    return processor_cls
@pytest.fixture()
def get_max_qwen2_vl_image_tokens():
    """Provide vLLM's ``get_max_qwen2_vl_image_tokens`` via a lazy import."""
    from vllm.model_executor.models.qwen2_vl import (
        get_max_qwen2_vl_image_tokens as max_tokens_fn)
    return max_tokens_fn
@pytest.mark.parametrize("mm_processor_kwargs,expected_max_tokens", [
    ({}, 1225),
    ({
        MIN_PIXELS: 64**2,
        MAX_PIXELS: 512**2
    }, 324),
])
@pytest.mark.parametrize("model", [MODEL])
def test_qwen2_vl_max_image_tokens(
    get_max_qwen2_vl_image_tokens,
    model: str,
    mm_processor_kwargs: Dict[str, Any],
    expected_max_tokens: int,
):
    """Ensure that the max token calc handles min/max pixels properly."""
    # The context carries no mm_processor_kwargs; the overrides are passed
    # to the max-tokens function as keyword arguments instead.
    context_kwargs = dict(
        model_name=model,
        tokenizer_name=model,
        mm_processor_kwargs=None,
    )
    ctx = build_model_context(**context_kwargs)

    input_ctx = InputContext(ctx.model_config)
    actual_max_tokens = get_max_qwen2_vl_image_tokens(input_ctx,
                                                      **mm_processor_kwargs)

    assert actual_max_tokens == expected_max_tokens
@pytest.mark.parametrize(
    "mm_processor_kwargs, expected_toks_per_img, expected_pixels_shape", [
        ({}, 1426, (5704, 1176)),
        ({
            MIN_PIXELS: 64**2,
            MAX_PIXELS: 512**2
        }, 330, (1320, 1176)),
    ])
@pytest.mark.parametrize("model", [MODEL])
@pytest.mark.parametrize("num_imgs", [1, 2])
def test_processor_override(
    processor_for_qwen2_vl,
    image_assets: _ImageAssets,
    model: str,
    mm_processor_kwargs: Dict[str, Any],
    expected_toks_per_img: int,
    expected_pixels_shape: Tuple[int, int],
    num_imgs: int,
):
    """Ensure Qwen2VLMultiModalProcessor handles min/max pixels properly."""
    # As in the max-tokens test, the context carries no mm_processor_kwargs;
    # the overrides go straight into the processor call.
    model_ctx = build_model_context(
        model_name=model,
        tokenizer_name=model,
        mm_processor_kwargs=None,
    )
    tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
    ctx = InputProcessingContext(model_ctx.model_config, tokenizer)

    # One vision placeholder group per image; the same asset image is reused
    prompt = "<|vision_start|><|image_pad|><|vision_end|>" * num_imgs
    mm_data = {"image": [image_assets[0].pil_image] * num_imgs}

    processor = processor_for_qwen2_vl(ctx)
    processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)

    # Look up the image token id through the HF processor, then count the
    # placeholders produced for the given min/max pixel settings.
    hf_processor = processor._get_hf_processor(**mm_processor_kwargs)
    image_token_id = tokenizer.convert_tokens_to_ids(hf_processor.image_token)
    prompt_token_ids = processed_inputs["prompt_token_ids"]
    img_tok_count = prompt_token_ids.count(image_token_id)
    pixel_shape = processed_inputs["mm_kwargs"]["pixel_values"].shape

    expected_rows_per_img, expected_cols = expected_pixels_shape
    assert img_tok_count == expected_toks_per_img * num_imgs
    # Only the first dimension scales with the number of images
    assert pixel_shape[0] == expected_rows_per_img * num_imgs
    assert pixel_shape[1] == expected_cols
from typing import List, Optional, Tuple, Type
import pytest
import os
from vllm.multimodal.utils import rescale_image_size
from vllm.transformers_utils.tokenizer import patch_padding_side
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from ....utils import large_gpu_test
from ...utils import check_logprobs_close
from ....utils import models_path_prefix
# Prompt text for each image asset, keyed by asset name.
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign":
    "What's the content of the image?",
    "cherry_blossom":
    "What is the season?",
})
# Model paths used to parametrize the tests below; resolved under
# models_path_prefix.
models = [os.path.join(models_path_prefix, "THUDM/glm-4v-9b")]
# dtype passed to both runners by the tests below.
target_dtype = "bfloat16"
def run_test(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
    inputs: List[Tuple[List[str], PromptImageInput]],
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    mm_limit: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    """Generate greedy logprobs with vLLM and HF and compare them.

    For every ``(prompts, images)`` pair in ``inputs`` the vLLM outputs are
    produced first, then the HF outputs; each pair of results is checked
    with ``check_logprobs_close``.
    """
    # max_model_len should be greater than image_feature_size
    vllm_kwargs = dict(
        max_model_len=2048,
        max_num_seqs=2,
        dtype=dtype,
        limit_mm_per_prompt={"image": mm_limit},
        tensor_parallel_size=tensor_parallel_size,
        distributed_executor_backend=distributed_executor_backend,
        enforce_eager=True,
    )
    with vllm_runner(model, **vllm_kwargs) as vllm_model:
        stop_token_ids = [151329, 151336, 151338]
        vllm_outputs_per_image = []
        for prompts, images in inputs:
            vllm_outputs_per_image.append(
                vllm_model.generate_greedy_logprobs(
                    prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=images,
                    stop_token_ids=stop_token_ids,
                ))

    with hf_runner(model, dtype=dtype) as hf_model:
        hf_processor = hf_model.processor
        patch_padding_side(hf_processor)

        def chat_template_processor(*args, text="", images=None, **kwargs):
            # Without images, defer to the original processor unchanged;
            # with images, build a single user message and run it through
            # the chat template.
            if images is None:
                return hf_processor(*args, **kwargs)

            message = {
                "role": "user",
                "image": images,
                "content": text
            }
            return hf_processor.apply_chat_template(
                [message],
                add_generation_prompt=True,
                tokenize=True,
                return_dict=True,
                **kwargs,
            )

        def output_embeddings():
            return hf_model.model.transformer.output_layer

        # Replace the runner's processor and output-embedding accessor
        # before generating.
        hf_model.processor = chat_template_processor
        hf_model.model.get_output_embeddings = output_embeddings

        hf_outputs_per_image = []
        for prompts, images in inputs:
            hf_outputs_per_image.append(
                hf_model.generate_greedy_logprobs_limit(
                    prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=images,
                ))

    paired_outputs = zip(hf_outputs_per_image, vllm_outputs_per_image)
    for hf_outputs, vllm_outputs in paired_outputs:
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_outputs,
            name_0="hf",
            name_1="vllm",
        )
@large_gpu_test(min_gb=48)
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
                dtype: str, max_tokens: int, num_logprobs: int) -> None:
    """Compare HF and vLLM outputs for each asset at every size factor."""
    images = [asset.pil_image for asset in image_assets]

    # For each (image, prompt) pair: one copy of the prompt and one rescaled
    # image per size factor.
    inputs_per_image = []
    for image, prompt in zip(images, HF_IMAGE_PROMPTS):
        repeated_prompts = [prompt] * len(size_factors)
        rescaled_images = [
            rescale_image_size(image, factor) for factor in size_factors
        ]
        inputs_per_image.append((repeated_prompts, rescaled_images))

    run_test(
        hf_runner,
        vllm_runner,
        inputs_per_image,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=1,
        tensor_parallel_size=1,
    )
\ No newline at end of file
from typing import Optional, Tuple
import os
import pytest
import torch
from PIL.Image import Image
from transformers import AutoConfig
# Import the functions to test
from vllm.model_executor.models.h2ovl import (calculate_num_blocks,
image_to_pixel_values_wrapper)
from vllm.multimodal.image import rescale_image_size
from ....utils import models_path_prefix
# Model paths used to parametrize the preprocessing test below; each is
# resolved under models_path_prefix.
models = [
    os.path.join(models_path_prefix, "h2oai/h2ovl-mississippi-800m"),
    os.path.join(models_path_prefix, "h2oai/h2ovl-mississippi-2b"),
]
def run_preprocessing_test(
    image: Image,
    config,
    max_dynamic_patch: Optional[int] = None,
) -> Tuple[torch.Tensor, int]:
    """Preprocess ``image`` and compute the expected number of blocks.

    Args:
        image: Image to preprocess.
        config: Model config; the attributes read here are
            ``max_dynamic_patch``, ``min_dynamic_patch``, ``use_msac``,
            ``use_thumbnail`` and ``vision_config.image_size``.
        max_dynamic_patch: Override for ``config.max_dynamic_patch``; the
            config value is used when this is ``None``.

    Returns:
        The pixel values produced by the mapper and the expected number of
        blocks computed with ``calculate_num_blocks``.
    """
    if max_dynamic_patch is None:
        max_dynamic_patch = config.max_dynamic_patch

    width, height = image.size
    use_MSAC = config.use_msac

    # Build the mapper for this configuration and apply it to the image
    mapper = image_to_pixel_values_wrapper(config, max_dynamic_patch, use_MSAC)
    pixel_values = mapper(image)

    def count_blocks(prior_aspect_ratio):
        # Thumbnails are accounted for separately by the caller, so they are
        # always disabled in this call.
        return calculate_num_blocks(
            width,
            height,
            config.min_dynamic_patch,
            max_dynamic_patch,
            config.vision_config.image_size,
            use_thumbnail=False,
            prior_aspect_ratio=prior_aspect_ratio,
        )

    if not use_MSAC:
        expected_blocks, _, _, _ = count_blocks(None)
        if config.use_thumbnail and expected_blocks > 1:
            expected_blocks += 1
        return pixel_values, expected_blocks

    # MSAC: two passes, the second one seeded with the first pass's
    # aspect ratio.
    first_pass, _, _, aspect_ratio = count_blocks(None)
    second_pass, _, _, _ = count_blocks(aspect_ratio)

    # A thumbnail is added per pass, but only when that pass has more than
    # one block.
    if config.use_thumbnail:
        if first_pass > 1:
            first_pass += 1
        if second_pass > 1:
            second_pass += 1

    # Total is the sum of both passes minus the overlapping block
    expected_blocks = first_pass + second_pass - 1
    return pixel_values, expected_blocks
@pytest.mark.parametrize("model_name", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("max_dynamic_patch", [None, 2, 4, 8])
def test_image_preprocessing(image_assets, model_name, size_factors,
                             max_dynamic_patch):
    """Test image preprocessing pipeline with different configurations."""
    # Load the model's configuration
    config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)

    for asset in image_assets:
        original_image = asset.pil_image
        for factor in size_factors:
            scaled_image = rescale_image_size(original_image, factor)

            # Preprocess and obtain the expected number of blocks
            pixel_values, expected_blocks = run_preprocessing_test(
                scaled_image, config, max_dynamic_patch)

            # The leading dimension holds one entry per block
            actual_blocks = pixel_values.shape[0]
            assert actual_blocks == expected_blocks, (
                f"Expected {expected_blocks} blocks, got {actual_blocks}")

            # Every block must be (C, H, W) with 3 channels and the vision
            # config's square image size
            side = config.vision_config.image_size
            expected_size = (3, side, side)
            for img in pixel_values:
                assert img.shape == expected_size, (
                    f"Expected image size {expected_size}, got {img.shape}")
# SPDX-License-Identifier: Apache-2.0
import os
import re
from typing import Optional
import pytest
from transformers import AutoTokenizer
from vllm.multimodal.image import rescale_image_size
from vllm.platforms import current_platform
from vllm.sequence import SampleLogprobs
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from ...utils import check_logprobs_close
from ....utils import models_path_prefix
# Single-image prompt for each image asset, keyed by asset name.
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign":
    "<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n",  # noqa: E501
    "cherry_blossom":
    "<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n",
})
# Prompt referencing two images, used by the multi-image test.
HF_MULTIIMAGE_IMAGE_PROMPT = "<|user|>\n<|image_1|>\n<|image_2|>\nDescribe these images.<|end|>\n<|assistant|>\n"  # noqa: E501
# Model paths used to parametrize the tests below; resolved under
# models_path_prefix.
models = [os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct")]
def vllm_to_hf_output(vllm_output: tuple[list[int], str,
                                         Optional[SampleLogprobs]],
                      model: str):
    """Sanitize vllm output to be comparable with hf output.

    Image placeholder tokens are removed from the text, the leading space
    is dropped, and the token ids are recomputed from the cleaned text with
    the model's tokenizer (minus the first id). The logprobs are passed
    through unchanged.
    """
    _, output_str, out_logprobs = vllm_output

    # Remove every run of <|image_N|> placeholders from the text
    cleaned = re.sub(r"(<\|image_\d+\|>)+", "", output_str)

    # The remaining text is expected to start with a single space
    assert cleaned[0] == " "
    cleaned = cleaned[1:]

    tokenizer = AutoTokenizer.from_pretrained(model)
    encoded = tokenizer.encode(cleaned)

    # The encoding is expected to start with token id 1, which is dropped
    assert encoded[0] == 1
    hf_output_ids = encoded[1:]

    hf_output_str = cleaned + "<|end|><|endoftext|>"
    return hf_output_ids, hf_output_str, out_logprobs
# dtype passed to both runners by the tests below.
target_dtype = "half"

# ROCm Triton FA can run into shared memory issues with these models,
# so it is switched off on ROCm in the meantime.
# FIXME (mattwong, gshtrasb, hongxiayan)
if current_platform.is_rocm():
    os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    inputs: list[tuple[list[str], PromptImageInput]],
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    mm_limit: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    """Inference result should be the same between hf and vllm.

    All the image fixtures for the test are from IMAGE_ASSETS.
    For huggingface runner, we provide the PIL images as input.
    For vllm runner, we provide MultiModalDataDict objects
    and corresponding MultiModalConfig as input.
    Note, the text input is also adjusted to abide by vllm contract.
    The text output is sanitized to be able to compare with hf.
    """
    # HACK - this is an attempted workaround for the following bug
    # https://github.com/huggingface/transformers/issues/34307
    from transformers import AutoImageProcessor  # noqa: F401
    from transformers import AutoProcessor  # noqa: F401

    # NOTE: the order matters. vLLM runs first and HF second: vLLM needs a
    # fresh process without cuda initialization, and running HF first would
    # initialize cuda, which hurts the multiprocessing backend with the fork
    # method (the default method).

    # max_model_len should be greater than image_feature_size
    vllm_kwargs = dict(
        task="generate",
        max_model_len=4096,
        max_num_seqs=2,
        dtype=dtype,
        limit_mm_per_prompt={"image": mm_limit},
        tensor_parallel_size=tensor_parallel_size,
        distributed_executor_backend=distributed_executor_backend,
        enforce_eager=True,
    )
    with vllm_runner(model, **vllm_kwargs) as vllm_model:
        vllm_outputs_per_case = []
        for prompts, images in inputs:
            vllm_outputs_per_case.append(
                vllm_model.generate_greedy_logprobs(
                    prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=images,
                ))

    # use eager mode for hf runner, since phi3_v didn't work with flash_attn
    hf_model_kwargs = {"_attn_implementation": "eager"}
    with hf_runner(model, dtype=dtype,
                   model_kwargs=hf_model_kwargs) as hf_model:
        eos_token_id = hf_model.processor.tokenizer.eos_token_id
        hf_outputs_per_case = []
        for prompts, images in inputs:
            hf_outputs_per_case.append(
                hf_model.generate_greedy_logprobs_limit(
                    prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=images,
                    eos_token_id=eos_token_id,
                ))

    paired_outputs = zip(hf_outputs_per_case, vllm_outputs_per_case)
    for hf_outputs, vllm_outputs in paired_outputs:
        # Convert the vLLM outputs into the HF-comparable form first
        sanitized_vllm_outputs = [
            vllm_to_hf_output(vllm_output, model)
            for vllm_output in vllm_outputs
        ]
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=sanitized_vllm_outputs,
            name_0="hf",
            name_1="vllm",
        )
# Since we use _attn_implementation="eager" for hf_runner, there is more
# significant numerical difference. The basic `logprobs=5` fails to pass.
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
                dtype: str, max_tokens: int, num_logprobs: int) -> None:
    """Compare HF and vLLM outputs for each asset at every size factor."""
    images = [asset.pil_image for asset in image_assets]

    # For each (image, prompt) pair: one copy of the prompt and one rescaled
    # image per size factor.
    inputs_per_image = []
    for image, prompt in zip(images, HF_IMAGE_PROMPTS):
        repeated_prompts = [prompt] * len(size_factors)
        rescaled_images = [
            rescale_image_size(image, factor) for factor in size_factors
        ]
        inputs_per_image.append((repeated_prompts, rescaled_images))

    run_test(
        hf_runner,
        vllm_runner,
        inputs_per_image,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=1,
        tensor_parallel_size=1,
    )
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", [target_dtype])
def test_regression_7840(hf_runner, vllm_runner, image_assets, model,
                         dtype) -> None:
    """Regression test for #7840: one prompt and one image per input."""
    images = [asset.pil_image for asset in image_assets]

    # Each input carries exactly one prompt and one unscaled image
    regression_inputs = []
    for image, prompt in zip(images, HF_IMAGE_PROMPTS):
        regression_inputs.append(([prompt], [image]))

    run_test(
        hf_runner,
        vllm_runner,
        regression_inputs,
        model,
        dtype=dtype,
        max_tokens=128,
        num_logprobs=10,
        mm_limit=1,
        tensor_parallel_size=1,
    )
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
                             size_factors, dtype: str, max_tokens: int,
                             num_logprobs: int) -> None:
    """Compare HF and vLLM outputs when each prompt carries all the images."""
    images = [asset.pil_image for asset in image_assets]

    # A single case: one multi-image prompt per size factor, each paired
    # with every asset image rescaled by that factor.
    prompts = [HF_MULTIIMAGE_IMAGE_PROMPT] * len(size_factors)
    images_per_prompt = []
    for factor in size_factors:
        images_per_prompt.append(
            [rescale_image_size(image, factor) for image in images])
    inputs_per_case = [(prompts, images_per_prompt)]

    run_test(
        hf_runner,
        vllm_runner,
        inputs_per_case,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=2,
        tensor_parallel_size=1,
    )
...@@ -48,7 +48,7 @@ def test_models( ...@@ -48,7 +48,7 @@ def test_models(
monkeypatch, monkeypatch,
) -> None: ) -> None:
if model == "BAAI/bge-multilingual-gemma2" and current_platform.is_rocm(): if model == os.path.join(models_path_prefix, "BAAI/bge-multilingual-gemma2") and current_platform.is_rocm():
# ROCm Triton FA does not currently support sliding window attention # ROCm Triton FA does not currently support sliding window attention
# switch to use ROCm CK FA backend # switch to use ROCm CK FA backend
monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False") monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False")
...@@ -86,4 +86,4 @@ def test_models( ...@@ -86,4 +86,4 @@ def test_models(
name_0="hf", name_0="hf",
name_1="vllm", name_1="vllm",
tol=1e-2, tol=1e-2,
) )
\ No newline at end of file
...@@ -12,12 +12,12 @@ import torch.nn.functional as F ...@@ -12,12 +12,12 @@ import torch.nn.functional as F
from ....utils import models_path_prefix from ....utils import models_path_prefix
MODELS = [ MODELS = [
os.path.join(models_path_prefix, "cross-encoder/ms-marco-MiniLM-L-6-v2"), # Bert # os.path.join(models_path_prefix, "cross-encoder/ms-marco-MiniLM-L-6-v2"), # Bert
os.path.join(models_path_prefix, "BAAI/bge-reranker-v2-m3"), # Roberta os.path.join(models_path_prefix, "BAAI/bge-reranker-v2-m3"), # Roberta
] ]
EMBEDDING_MODELS = [ EMBEDDING_MODELS = [
"sentence-transformers/all-MiniLM-L12-v2", # os.path.join(models_path_prefix, "sentence-transformers/all-MiniLM-L12-v2"),
] ]
TEXTS_1 = [ TEXTS_1 = [
...@@ -189,4 +189,4 @@ def test_llm_N_to_N_embedding(vllm_runner, hf_runner, emb_model_name, ...@@ -189,4 +189,4 @@ def test_llm_N_to_N_embedding(vllm_runner, hf_runner, emb_model_name,
assert len(hf_outputs) == 2 assert len(hf_outputs) == 2
assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01) assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01)
\ No newline at end of file
...@@ -15,10 +15,10 @@ EMBEDDING_PROMPTS = [ ...@@ -15,10 +15,10 @@ EMBEDDING_PROMPTS = [
] ]
MODELS = [ MODELS = [
EmbedModelInfo("Snowflake/snowflake-arctic-embed-xs", # EmbedModelInfo("Snowflake/snowflake-arctic-embed-xs",
is_matryoshka=False, # is_matryoshka=False,
architecture="BertModel", # architecture="BertModel",
enable_test=True), # enable_test=True),
EmbedModelInfo("Snowflake/snowflake-arctic-embed-s", EmbedModelInfo("Snowflake/snowflake-arctic-embed-s",
is_matryoshka=False, is_matryoshka=False,
architecture="BertModel", architecture="BertModel",
...@@ -43,10 +43,10 @@ MODELS = [ ...@@ -43,10 +43,10 @@ MODELS = [
is_matryoshka=True, is_matryoshka=True,
architecture="XLMRobertaModel", architecture="XLMRobertaModel",
enable_test=True), enable_test=True),
EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v2.0", # EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v2.0",
is_matryoshka=True, # is_matryoshka=True,
architecture="GteModel", # architecture="GteModel",
enable_test=True), # enable_test=True),
] ]
...@@ -98,4 +98,4 @@ def test_models( ...@@ -98,4 +98,4 @@ def test_models(
name_0="hf", name_0="hf",
name_1="vllm", name_1="vllm",
tol=1e-2, tol=1e-2,
) )
\ No newline at end of file
...@@ -5,12 +5,13 @@ Run `pytest tests/models/encoder_decoder/audio/test_whisper.py`. ...@@ -5,12 +5,13 @@ Run `pytest tests/models/encoder_decoder/audio/test_whisper.py`.
""" """
from typing import Optional from typing import Optional
import os
import pytest import pytest
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset from vllm.assets.audio import AudioAsset
from ....utils import create_new_process_for_each_test, multi_gpu_test from ....utils import create_new_process_for_each_test, multi_gpu_test, models_path_prefix
PROMPTS = [ PROMPTS = [
{ {
...@@ -33,7 +34,7 @@ PROMPTS = [ ...@@ -33,7 +34,7 @@ PROMPTS = [
] ]
EXPECTED = { EXPECTED = {
"openai/whisper-tiny": [ os.path.join(models_path_prefix, "openai/whisper-tiny"): [
" He has birth words I spoke in the original corner of that. And a" " He has birth words I spoke in the original corner of that. And a"
" little piece of black coat poetry. Mary had a little sandwich," " little piece of black coat poetry. Mary had a little sandwich,"
" sweet, with white and snow. And everyone had it very went the last" " sweet, with white and snow. And everyone had it very went the last"
...@@ -45,7 +46,7 @@ EXPECTED = { ...@@ -45,7 +46,7 @@ EXPECTED = {
" American League Championship. I don't believe it. It just continues" " American League Championship. I don't believe it. It just continues"
" by all five." " by all five."
], ],
"openai/whisper-small": [ os.path.join(models_path_prefix, "openai/whisper-small"): [
" The first words I spoke in the original pornograph. A little piece" " The first words I spoke in the original pornograph. A little piece"
" of practical poetry. Mary had a little lamb, its fleece was quite a" " of practical poetry. Mary had a little lamb, its fleece was quite a"
" slow, and everywhere that Mary went the lamb was sure to go.", " slow, and everywhere that Mary went the lamb was sure to go.",
...@@ -55,7 +56,7 @@ EXPECTED = { ...@@ -55,7 +56,7 @@ EXPECTED = {
" play for the American League Championship. I don't believe it. It" " play for the American League Championship. I don't believe it. It"
" just continues. My, oh my." " just continues. My, oh my."
], ],
"openai/whisper-medium": [ os.path.join(models_path_prefix, "openai/whisper-medium"): [
" The first words I spoke in the original phonograph, a little piece" " The first words I spoke in the original phonograph, a little piece"
" of practical poetry. Mary had a little lamb, its fleece was quite as" " of practical poetry. Mary had a little lamb, its fleece was quite as"
" slow, and everywhere that Mary went the lamb was sure to go.", " slow, and everywhere that Mary went the lamb was sure to go.",
...@@ -66,7 +67,7 @@ EXPECTED = { ...@@ -66,7 +67,7 @@ EXPECTED = {
" League Championship. I don't believe it. It just continues. My, oh" " League Championship. I don't believe it. It just continues. My, oh"
" my." " my."
], ],
"openai/whisper-large-v3": [ os.path.join(models_path_prefix, "openai/whisper-large-v3"): [
" The first words I spoke in the original phonograph, a little piece" " The first words I spoke in the original phonograph, a little piece"
" of practical poetry. Mary had a little lamb, its feet were quite as" " of practical poetry. Mary had a little lamb, its feet were quite as"
" slow, and everywhere that Mary went, the lamb was sure to go.", " slow, and everywhere that Mary went, the lamb was sure to go.",
...@@ -77,7 +78,7 @@ EXPECTED = { ...@@ -77,7 +78,7 @@ EXPECTED = {
" League Championship. I don't believe it. It just continues. My, oh," " League Championship. I don't believe it. It just continues. My, oh,"
" my." " my."
], ],
"openai/whisper-large-v3-turbo": [ os.path.join(models_path_prefix, "openai/whisper-large-v3-turbo"): [
" The first words I spoke in the original phonograph, a little piece" " The first words I spoke in the original phonograph, a little piece"
" of practical poetry. Mary had a little lamb, its streets were quite" " of practical poetry. Mary had a little lamb, its streets were quite"
" as slow, and everywhere that Mary went the lamb was sure to go.", " as slow, and everywhere that Mary went the lamb was sure to go.",
...@@ -122,16 +123,16 @@ def run_test( ...@@ -122,16 +123,16 @@ def run_test(
@create_new_process_for_each_test() @create_new_process_for_each_test()
@pytest.mark.core_model @pytest.mark.core_model
@pytest.mark.parametrize( @pytest.mark.parametrize(
"model", ["openai/whisper-small", "openai/whisper-large-v3-turbo"]) "model", [os.path.join(models_path_prefix, "openai/whisper-small"), os.path.join(models_path_prefix, "openai/whisper-large-v3-turbo")])
def test_models(model) -> None: def test_models(model) -> None:
run_test(model, tensor_parallel_size=1) run_test(model, tensor_parallel_size=1)
@multi_gpu_test(num_gpus=2) @multi_gpu_test(num_gpus=2)
@pytest.mark.core_model @pytest.mark.core_model
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"]) @pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "openai/whisper-large-v3-turbo")])
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"]) @pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
def test_models_distributed(model, distributed_executor_backend) -> None: def test_models_distributed(model, distributed_executor_backend) -> None:
run_test(model, run_test(model,
tensor_parallel_size=2, tensor_parallel_size=2,
distributed_executor_backend=distributed_executor_backend) distributed_executor_backend=distributed_executor_backend)
\ No newline at end of file
...@@ -179,7 +179,8 @@ def run_test( ...@@ -179,7 +179,8 @@ def run_test(
pytest.param(os.path.join(models_path_prefix, "facebook/bart-large-cnn")), pytest.param(os.path.join(models_path_prefix, "facebook/bart-large-cnn")),
], ],
) )
@pytest.mark.parametrize("dtype", ["float", "bfloat16"]) # @pytest.mark.parametrize("dtype", ["float", "bfloat16"])
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType)) @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
...@@ -201,7 +202,7 @@ def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts, model, ...@@ -201,7 +202,7 @@ def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts, model,
@multi_gpu_test(num_gpus=2) @multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"]) @pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@pytest.mark.parametrize("model", ["facebook/bart-large-cnn"]) @pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/bart-large-cnn")])
@pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize("num_logprobs", [5])
...@@ -222,4 +223,4 @@ def test_models_distributed(hf_runner, vllm_runner, ...@@ -222,4 +223,4 @@ def test_models_distributed(hf_runner, vllm_runner,
num_logprobs=num_logprobs, num_logprobs=num_logprobs,
tensor_parallel_size=2, tensor_parallel_size=2,
distributed_executor_backend=distributed_executor_backend, distributed_executor_backend=distributed_executor_backend,
) )
\ No newline at end of file
...@@ -20,7 +20,7 @@ def test_models(hf_runner, vllm_runner, image_assets, ...@@ -20,7 +20,7 @@ def test_models(hf_runner, vllm_runner, image_assets,
num_logprobs = 5 num_logprobs = 5
tensor_parallel_size = 2 tensor_parallel_size = 2
if "meta-llama/Llama-3.2-11B-Vision-Instruct" in model: if os.path.join(models_path_prefix, "meta-llama/Llama-3.2-11B-Vision-Instruct") in model:
# if model.startswith("meta-llama/Llama-3.2-11B-Vision-Instruct"): # if model.startswith("meta-llama/Llama-3.2-11B-Vision-Instruct"):
from .test_mllama import models, run_test from .test_mllama import models, run_test
else: else:
...@@ -37,4 +37,4 @@ def test_models(hf_runner, vllm_runner, image_assets, ...@@ -37,4 +37,4 @@ def test_models(hf_runner, vllm_runner, image_assets,
num_logprobs=num_logprobs, num_logprobs=num_logprobs,
tensor_parallel_size=tensor_parallel_size, tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend, distributed_executor_backend=distributed_executor_backend,
) )
\ No newline at end of file
...@@ -4,6 +4,7 @@ from functools import partial ...@@ -4,6 +4,7 @@ from functools import partial
from typing import Optional, Union from typing import Optional, Union
import numpy as np import numpy as np
import os
import pytest import pytest
from mistral_common.protocol.instruct.messages import (ImageChunk, TextChunk, from mistral_common.protocol.instruct.messages import (ImageChunk, TextChunk,
UserMessage) UserMessage)
...@@ -21,6 +22,7 @@ from vllm.transformers_utils.tokenizer import (MistralTokenizer, ...@@ -21,6 +22,7 @@ from vllm.transformers_utils.tokenizer import (MistralTokenizer,
from ....multimodal.utils import random_audio, random_image, random_video from ....multimodal.utils import random_audio, random_image, random_video
from ...registry import HF_EXAMPLE_MODELS from ...registry import HF_EXAMPLE_MODELS
from ....utils import models_path_prefix
def _test_processing_correctness( def _test_processing_correctness(
...@@ -245,48 +247,48 @@ def _test_processing_correctness_mistral( ...@@ -245,48 +247,48 @@ def _test_processing_correctness_mistral(
# yapf: disable # yapf: disable
@pytest.mark.parametrize("model_id", [ @pytest.mark.parametrize("model_id", [
"rhymes-ai/Aria", os.path.join(models_path_prefix, "rhymes-ai/Aria"),
"CohereForAI/aya-vision-8b", os.path.join(models_path_prefix, "CohereForAI/aya-vision-8b"),
"Salesforce/blip2-opt-2.7b", os.path.join(models_path_prefix, "Salesforce/blip2-opt-2.7b"),
"facebook/chameleon-7b", os.path.join(models_path_prefix, "facebook/chameleon-7b"),
"deepseek-ai/deepseek-vl2-tiny", os.path.join(models_path_prefix, "deepseek-ai/deepseek-vl2-tiny"),
"microsoft/Florence-2-base", os.path.join(models_path_prefix, "microsoft/Florence-2-base"),
"adept/fuyu-8b", os.path.join(models_path_prefix, "adept/fuyu-8b"),
"google/gemma-3-4b-it", os.path.join(models_path_prefix, "google/gemma-3-4b-it"),
"THUDM/glm-4v-9b", os.path.join(models_path_prefix, "THUDM/glm-4v-9b"),
"ibm-granite/granite-speech-3.3-8b", os.path.join(models_path_prefix, "ibm-granite/granite-speech-3.3-8b"),
"h2oai/h2ovl-mississippi-800m", os.path.join(models_path_prefix, "h2oai/h2ovl-mississippi-800m"),
"OpenGVLab/InternVL2-1B", os.path.join(models_path_prefix, "OpenGVLab/InternVL2-1B"),
"HuggingFaceM4/Idefics3-8B-Llama3", os.path.join(models_path_prefix, "HuggingFaceM4/Idefics3-8B-Llama3"),
"HuggingFaceTB/SmolVLM2-2.2B-Instruct", os.path.join(models_path_prefix, "HuggingFaceTB/SmolVLM2-2.2B-Instruct"),
"moonshotai/Kimi-VL-A3B-Instruct", os.path.join(models_path_prefix, "moonshotai/Kimi-VL-A3B-Instruct"),
"meta-llama/Llama-4-Scout-17B-16E-Instruct", os.path.join(models_path_prefix, "meta-llama/Llama-4-Scout-17B-16E-Instruct"),
"llava-hf/llava-1.5-7b-hf", os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf"),
"llava-hf/llava-v1.6-mistral-7b-hf", os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf"),
"llava-hf/LLaVA-NeXT-Video-7B-hf", os.path.join(models_path_prefix, "llava-hf/LLaVA-NeXT-Video-7B-hf"),
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf", os.path.join(models_path_prefix, "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"),
"meta-llama/Llama-3.2-11B-Vision-Instruct", os.path.join(models_path_prefix, "meta-llama/Llama-3.2-11B-Vision-Instruct"),
"TIGER-Lab/Mantis-8B-siglip-llama3", os.path.join(models_path_prefix, "TIGER-Lab/Mantis-8B-siglip-llama3"),
"openbmb/MiniCPM-Llama3-V-2_5", os.path.join(models_path_prefix, "openbmb/MiniCPM-Llama3-V-2_5"),
"openbmb/MiniCPM-o-2_6", os.path.join(models_path_prefix, "openbmb/MiniCPM-o-2_6"),
"openbmb/MiniCPM-V-2_6", os.path.join(models_path_prefix, "openbmb/MiniCPM-V-2_6"),
"allenai/Molmo-7B-D-0924", os.path.join(models_path_prefix, "allenai/Molmo-7B-D-0924"),
"allenai/Molmo-7B-O-0924", os.path.join(models_path_prefix, "allenai/Molmo-7B-O-0924"),
"nvidia/NVLM-D-72B", os.path.join(models_path_prefix, "nvidia/NVLM-D-72B"),
"google/paligemma-3b-mix-224", os.path.join(models_path_prefix, "google/paligemma-3b-mix-224"),
"google/paligemma2-3b-ft-docci-448", os.path.join(models_path_prefix, "google/paligemma2-3b-ft-docci-448"),
"microsoft/Phi-4-multimodal-instruct", os.path.join(models_path_prefix, "microsoft/Phi-4-multimodal-instruct"),
"mistralai/Pixtral-12B-2409", os.path.join(models_path_prefix, "mistralai/Pixtral-12B-2409"),
"mistral-community/pixtral-12b", os.path.join(models_path_prefix, "mistral-community/pixtral-12b"),
"Qwen/Qwen-VL-Chat", os.path.join(models_path_prefix, "Qwen/Qwen-VL-Chat"),
"Qwen/Qwen2-VL-2B-Instruct", os.path.join(models_path_prefix, "Qwen/Qwen2-VL-2B-Instruct"),
"Qwen/Qwen2.5-VL-3B-Instruct", os.path.join(models_path_prefix, "Qwen/Qwen2.5-VL-3B-Instruct"),
"Qwen/Qwen2-Audio-7B-Instruct", os.path.join(models_path_prefix, "Qwen/Qwen2-Audio-7B-Instruct"),
"Qwen/Qwen2.5-Omni-7B", os.path.join(models_path_prefix, "Qwen/Qwen2.5-Omni-7B"),
"Skywork/Skywork-R1V-38B", os.path.join(models_path_prefix, "Skywork/Skywork-R1V-38B"),
"fixie-ai/ultravox-v0_5-llama-3_2-1b", os.path.join(models_path_prefix, "fixie-ai/ultravox-v0_5-llama-3_2-1b"),
"openai/whisper-large-v3", os.path.join(models_path_prefix, "openai/whisper-large-v3"),
"meta-llama/Llama-4-Scout-17B-16E-Instruct", os.path.join(models_path_prefix, "meta-llama/Llama-4-Scout-17B-16E-Instruct"),
]) ])
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
@pytest.mark.parametrize("num_batches", [32]) @pytest.mark.parametrize("num_batches", [32])
...@@ -315,7 +317,7 @@ def test_processing_correctness( ...@@ -315,7 +317,7 @@ def test_processing_correctness(
# yapf: disable # yapf: disable
@pytest.mark.parametrize("model_id", ["microsoft/Phi-3.5-vision-instruct"]) @pytest.mark.parametrize("model_id", [os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct")])
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
@pytest.mark.parametrize("num_batches", [32]) @pytest.mark.parametrize("num_batches", [32])
@pytest.mark.parametrize("simplify_rate", [1.0]) @pytest.mark.parametrize("simplify_rate", [1.0])
...@@ -363,4 +365,4 @@ def _assert_inputs_equal( ...@@ -363,4 +365,4 @@ def _assert_inputs_equal(
if msg is None: if msg is None:
assert a == b assert a == b
else: else:
assert a == b, msg assert a == b, msg
\ No newline at end of file
...@@ -75,7 +75,7 @@ def _run_check( ...@@ -75,7 +75,7 @@ def _run_check(
assert pixel_shape[0] == total_expected_num_patches assert pixel_shape[0] == total_expected_num_patches
@pytest.mark.parametrize("model_id", [os.path.join(models_path_prefix, "OpenGVLab/InternVL2-2B")]) @pytest.mark.parametrize("model_id", os.path.join(models_path_prefix, "OpenGVLab/InternVL2-2B"))
@pytest.mark.parametrize( @pytest.mark.parametrize(
"size_factors", "size_factors",
[ [
...@@ -129,4 +129,4 @@ def test_processor_override( ...@@ -129,4 +129,4 @@ def test_processor_override(
min_num, min_num,
max_num, max_num,
hf_processor_mm_kwargs, hf_processor_mm_kwargs,
) )
\ No newline at end of file
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
"""Tests for Llama4's multimodal preprocessing kwargs.""" """Tests for Llama4's multimodal preprocessing kwargs."""
import os
import pytest import pytest
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
...@@ -8,10 +9,11 @@ from vllm.transformers_utils.tokenizer import encode_tokens ...@@ -8,10 +9,11 @@ from vllm.transformers_utils.tokenizer import encode_tokens
from ....conftest import _ImageAssets from ....conftest import _ImageAssets
from ...utils import build_model_context from ...utils import build_model_context
from ....utils import models_path_prefix
@pytest.mark.parametrize("model_id", @pytest.mark.parametrize("model_id",
["meta-llama/Llama-4-Scout-17B-16E-Instruct"]) [os.path.join(models_path_prefix, "meta-llama/Llama-4-Scout-17B-16E-Instruct")])
@pytest.mark.parametrize("mm_processor_kwargs", [{}]) @pytest.mark.parametrize("mm_processor_kwargs", [{}])
@pytest.mark.parametrize("num_imgs", [1, 5]) @pytest.mark.parametrize("num_imgs", [1, 5])
@pytest.mark.parametrize("disable_mm_preprocessor_cache", [True, False]) @pytest.mark.parametrize("disable_mm_preprocessor_cache", [True, False])
...@@ -81,4 +83,4 @@ def test_processor_override( ...@@ -81,4 +83,4 @@ def test_processor_override(
assert prompt_token_ids.count(config.image_token_index) \ assert prompt_token_ids.count(config.image_token_index) \
== mm_kwargs["patches_per_image"].sum() * num_patches_per_chunk == mm_kwargs["patches_per_image"].sum() * num_patches_per_chunk
assert mm_kwargs["pixel_values"].shape[0] \ assert mm_kwargs["pixel_values"].shape[0] \
== mm_kwargs["patches_per_image"].sum() == mm_kwargs["patches_per_image"].sum()
\ No newline at end of file
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
import itertools import itertools
from functools import partial from functools import partial
import os
import pytest import pytest
from PIL import Image from PIL import Image
from pqdm.threads import pqdm from pqdm.threads import pqdm
...@@ -12,6 +13,7 @@ from vllm.multimodal.parse import ImageSize ...@@ -12,6 +13,7 @@ from vllm.multimodal.parse import ImageSize
from vllm.multimodal.processing import BaseMultiModalProcessor from vllm.multimodal.processing import BaseMultiModalProcessor
from ...utils import build_model_context from ...utils import build_model_context
from ....utils import models_path_prefix
def _validate_image_max_tokens_one( def _validate_image_max_tokens_one(
...@@ -32,7 +34,7 @@ def _validate_image_max_tokens_one( ...@@ -32,7 +34,7 @@ def _validate_image_max_tokens_one(
@pytest.mark.skip("This test takes around 5 minutes to run. " @pytest.mark.skip("This test takes around 5 minutes to run. "
"Comment this out to run it manually.") "Comment this out to run it manually.")
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) @pytest.mark.parametrize("model_id", [os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf")])
def test_processor_max_tokens(model_id): def test_processor_max_tokens(model_id):
ctx = build_model_context( ctx = build_model_context(
model_id, model_id,
...@@ -127,7 +129,7 @@ def _test_image_prompt_replacements( ...@@ -127,7 +129,7 @@ def _test_image_prompt_replacements(
raise AssertionError(msg) raise AssertionError(msg)
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) @pytest.mark.parametrize("model_id", [os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf")])
@pytest.mark.parametrize("num_imgs", [1, 2]) @pytest.mark.parametrize("num_imgs", [1, 2])
def test_processor_prompt_replacements_regression(model_id, num_imgs): def test_processor_prompt_replacements_regression(model_id, num_imgs):
ctx = build_model_context( ctx = build_model_context(
...@@ -153,7 +155,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs): ...@@ -153,7 +155,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
@pytest.mark.skip("This test takes around 2 hours to run. " @pytest.mark.skip("This test takes around 2 hours to run. "
"Comment this out to run it manually.") "Comment this out to run it manually.")
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) @pytest.mark.parametrize("model_id", [os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf")])
@pytest.mark.parametrize("num_imgs", [1]) @pytest.mark.parametrize("num_imgs", [1])
def test_processor_prompt_replacements_all(model_id, num_imgs): def test_processor_prompt_replacements_all(model_id, num_imgs):
ctx = build_model_context( ctx = build_model_context(
...@@ -179,4 +181,4 @@ def test_processor_prompt_replacements_all(model_id, num_imgs): ...@@ -179,4 +181,4 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
processor, processor,
num_imgs=num_imgs, num_imgs=num_imgs,
image_sizes=image_sizes, image_sizes=image_sizes,
) )
\ No newline at end of file
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment