Commit 7a985548 authored by zhuwenwen
Browse files

Merge tag 'v0.9.0' into v0.9.0-ori

parents 45d3785c dc1440cf
...@@ -7,18 +7,21 @@ from typing import Callable, Optional, Union ...@@ -7,18 +7,21 @@ from typing import Callable, Optional, Union
import torch import torch
from vllm.multimodal.audio import AudioResampler
from vllm.multimodal.image import rescale_image_size from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.video import (rescale_video_size, resize_video, from vllm.multimodal.video import (rescale_video_size, resize_video,
sample_frames_from_video) sample_frames_from_video)
from .....conftest import _ImageAssets, _VideoAssets from .....conftest import AudioTestAssets, ImageTestAssets, VideoTestAssets
from .types import (SINGLE_IMAGE_BASE_PROMPTS, TEST_IMG_PLACEHOLDER, from .types import (SINGLE_AUDIO_BASE_PROMPT, SINGLE_IMAGE_BASE_PROMPTS,
TEST_AUDIO_PLACEHOLDER, TEST_IMG_PLACEHOLDER,
TEST_VIDEO_PLACEHOLDER, VIDEO_BASE_PROMPT, TEST_VIDEO_PLACEHOLDER, VIDEO_BASE_PROMPT,
ImageSizeWrapper, SizeType, VLMTestInfo) ImageSizeWrapper, PromptWithMultiModalInput, SizeType,
VLMTestInfo)
def replace_test_placeholder(prompt: str, img_idx_to_prompt: Callable[[int], def replace_test_placeholder(prompt: str, mm_idx_to_prompt: Callable[[int],
str], str],
test_placeholder: str) -> str: test_placeholder: str) -> str:
"""Given a prompt, replaces each test placeholder with the """Given a prompt, replaces each test placeholder with the
model-specific tag. model-specific tag.
...@@ -26,7 +29,7 @@ def replace_test_placeholder(prompt: str, img_idx_to_prompt: Callable[[int], ...@@ -26,7 +29,7 @@ def replace_test_placeholder(prompt: str, img_idx_to_prompt: Callable[[int],
prompt_segments = prompt.split(test_placeholder) prompt_segments = prompt.split(test_placeholder)
img_prompt = prompt_segments[0] img_prompt = prompt_segments[0]
for placeholder_idx, next_seg in enumerate(prompt_segments[1:], start=1): for placeholder_idx, next_seg in enumerate(prompt_segments[1:], start=1):
img_prompt += img_idx_to_prompt(placeholder_idx) img_prompt += mm_idx_to_prompt(placeholder_idx)
img_prompt += next_seg img_prompt += next_seg
return img_prompt return img_prompt
...@@ -34,6 +37,7 @@ def replace_test_placeholder(prompt: str, img_idx_to_prompt: Callable[[int], ...@@ -34,6 +37,7 @@ def replace_test_placeholder(prompt: str, img_idx_to_prompt: Callable[[int],
def get_model_prompts(base_prompts: Iterable[str], def get_model_prompts(base_prompts: Iterable[str],
img_idx_to_prompt: Optional[Callable[[int], str]], img_idx_to_prompt: Optional[Callable[[int], str]],
video_idx_to_prompt: Optional[Callable[[int], str]], video_idx_to_prompt: Optional[Callable[[int], str]],
audio_idx_to_prompt: Optional[Callable[[int], str]],
prompt_formatter: Callable[[str], str]) -> list[str]: prompt_formatter: Callable[[str], str]) -> list[str]:
"""Given a model-agnostic base prompt and test configuration for a model(s) """Given a model-agnostic base prompt and test configuration for a model(s)
to be tested, update the media placeholders and apply the prompt formatting to be tested, update the media placeholders and apply the prompt formatting
...@@ -60,6 +64,11 @@ def get_model_prompts(base_prompts: Iterable[str], ...@@ -60,6 +64,11 @@ def get_model_prompts(base_prompts: Iterable[str],
video_idx_to_prompt, video_idx_to_prompt,
TEST_VIDEO_PLACEHOLDER) TEST_VIDEO_PLACEHOLDER)
if audio_idx_to_prompt:
base_prompt = replace_test_placeholder(base_prompt,
audio_idx_to_prompt,
TEST_AUDIO_PLACEHOLDER)
# Apply the prompt formatter to wrap the base prompt with # Apply the prompt formatter to wrap the base prompt with
# the correct media placeholders to get the model test prompt # the correct media placeholders to get the model test prompt
model_prompt = prompt_formatter(base_prompt) model_prompt = prompt_formatter(base_prompt)
...@@ -68,10 +77,11 @@ def get_model_prompts(base_prompts: Iterable[str], ...@@ -68,10 +77,11 @@ def get_model_prompts(base_prompts: Iterable[str],
def build_single_image_inputs_from_test_info( def build_single_image_inputs_from_test_info(
test_info: VLMTestInfo, test_info: VLMTestInfo,
image_assets: _ImageAssets, image_assets: ImageTestAssets,
size_wrapper: ImageSizeWrapper, size_wrapper: ImageSizeWrapper,
tmp_path: Optional[PosixPath] = None): tmp_path: Optional[PosixPath] = None,
) -> list[PromptWithMultiModalInput]:
if test_info.prompt_formatter is None: if test_info.prompt_formatter is None:
raise ValueError( raise ValueError(
"Prompt formatter must be set to build single image inputs") "Prompt formatter must be set to build single image inputs")
...@@ -79,6 +89,7 @@ def build_single_image_inputs_from_test_info( ...@@ -79,6 +89,7 @@ def build_single_image_inputs_from_test_info(
model_prompts = get_model_prompts(test_info.single_image_prompts, model_prompts = get_model_prompts(test_info.single_image_prompts,
test_info.img_idx_to_prompt, test_info.img_idx_to_prompt,
test_info.video_idx_to_prompt, test_info.video_idx_to_prompt,
test_info.audio_idx_to_prompt,
test_info.prompt_formatter) test_info.prompt_formatter)
# For models that require a local path / URL encoded in the image; export # For models that require a local path / URL encoded in the image; export
...@@ -97,28 +108,32 @@ def build_single_image_inputs_from_test_info( ...@@ -97,28 +108,32 @@ def build_single_image_inputs_from_test_info(
return build_single_image_inputs(images, model_prompts, size_wrapper) return build_single_image_inputs(images, model_prompts, size_wrapper)
def build_single_image_inputs(
        images, model_prompts,
        size_wrapper: ImageSizeWrapper) -> list[PromptWithMultiModalInput]:
    """Expand every image / prompt pair over the requested sizes.

    One ``PromptWithMultiModalInput`` is produced per (image, prompt) pair.
    Its ``prompts`` repeats the model prompt once for each entry of
    ``size_wrapper.data`` and its ``image_data`` holds the image after
    being scaled by each of those entries.

    NOTE: rescaling preserves the image aspect ratio.
    """
    test_inputs = []
    for image, prompt in zip(images, model_prompts):
        # The prompt is duplicated so that it lines up 1:1 with the
        # scaled copies of the image built right below.
        repeated_prompts = [prompt for _ in size_wrapper.data]
        scaled_images = [
            apply_image_size_scaling(image, size, size_wrapper.type)
            for size in size_wrapper.data
        ]
        test_inputs.append(
            PromptWithMultiModalInput(
                prompts=repeated_prompts,
                image_data=scaled_images,
            ))
    return test_inputs
def build_multi_image_inputs_from_test_info( def build_multi_image_inputs_from_test_info(
test_info: VLMTestInfo, test_info: VLMTestInfo,
image_assets: _ImageAssets, image_assets: ImageTestAssets,
size_wrapper: ImageSizeWrapper, size_wrapper: ImageSizeWrapper,
tmp_path: Optional[PosixPath] = None): tmp_path: Optional[PosixPath] = None,
) -> list[PromptWithMultiModalInput]:
if test_info.prompt_formatter is None: if test_info.prompt_formatter is None:
raise ValueError( raise ValueError(
"Prompt formatter must be set to build multi image inputs") "Prompt formatter must be set to build multi image inputs")
...@@ -126,6 +141,7 @@ def build_multi_image_inputs_from_test_info( ...@@ -126,6 +141,7 @@ def build_multi_image_inputs_from_test_info(
model_prompts = get_model_prompts([test_info.multi_image_prompt], model_prompts = get_model_prompts([test_info.multi_image_prompt],
test_info.img_idx_to_prompt, test_info.img_idx_to_prompt,
test_info.video_idx_to_prompt, test_info.video_idx_to_prompt,
test_info.audio_idx_to_prompt,
test_info.prompt_formatter) test_info.prompt_formatter)
if test_info.prompt_path_encoder is not None: if test_info.prompt_path_encoder is not None:
...@@ -146,20 +162,23 @@ def build_multi_image_inputs_from_test_info( ...@@ -146,20 +162,23 @@ def build_multi_image_inputs_from_test_info(
) )
def build_multi_image_inputs(
        image_lists, model_prompts,
        size_wrapper: ImageSizeWrapper) -> list[PromptWithMultiModalInput]:
    """Expand every image-list / prompt pair over the requested sizes.

    One ``PromptWithMultiModalInput`` is produced per (images, prompt)
    pair. Its ``prompts`` repeats the model prompt once for each entry of
    ``size_wrapper.data``; its ``image_data`` holds, for each of those
    entries, the full list of images scaled by that entry.
    """
    test_inputs = []
    for images, prompt in zip(image_lists, model_prompts):
        repeated_prompts = [prompt for _ in size_wrapper.data]

        # Outer list: one item per size; inner list: every image of the
        # prompt scaled with that size.
        scaled_per_size = []
        for size in size_wrapper.data:
            scaled_per_size.append([
                apply_image_size_scaling(image, size, size_wrapper.type)
                for image in images
            ])

        test_inputs.append(
            PromptWithMultiModalInput(
                prompts=repeated_prompts,
                image_data=scaled_per_size,
            ))
    return test_inputs
def build_embedding_inputs_from_test_info( def build_embedding_inputs_from_test_info(
test_info: VLMTestInfo, test_info: VLMTestInfo,
image_assets: _ImageAssets, image_assets: ImageTestAssets,
size_wrapper: ImageSizeWrapper, size_wrapper: ImageSizeWrapper,
): ):
# These conditions will always be true if invoked through filtering, # These conditions will always be true if invoked through filtering,
...@@ -177,6 +196,7 @@ def build_embedding_inputs_from_test_info( ...@@ -177,6 +196,7 @@ def build_embedding_inputs_from_test_info(
SINGLE_IMAGE_BASE_PROMPTS, SINGLE_IMAGE_BASE_PROMPTS,
test_info.img_idx_to_prompt, test_info.img_idx_to_prompt,
test_info.video_idx_to_prompt, test_info.video_idx_to_prompt,
test_info.audio_idx_to_prompt,
test_info.prompt_formatter, test_info.prompt_formatter,
) )
...@@ -192,16 +212,17 @@ def build_embedding_inputs_from_test_info( ...@@ -192,16 +212,17 @@ def build_embedding_inputs_from_test_info(
def build_video_inputs_from_test_info( def build_video_inputs_from_test_info(
test_info: VLMTestInfo, test_info: VLMTestInfo,
video_assets: _VideoAssets, video_assets: VideoTestAssets,
size_wrapper: ImageSizeWrapper, size_wrapper: ImageSizeWrapper,
num_frames: int, num_frames: int,
): ) -> list[PromptWithMultiModalInput]:
if test_info.prompt_formatter is None: if test_info.prompt_formatter is None:
raise ValueError("Prompt formatter must be set to build video inputs") raise ValueError("Prompt formatter must be set to build video inputs")
model_prompts = get_model_prompts( model_prompts = get_model_prompts(
[VIDEO_BASE_PROMPT], [VIDEO_BASE_PROMPT],
test_info.img_idx_to_prompt, test_info.img_idx_to_prompt,
test_info.video_idx_to_prompt, test_info.video_idx_to_prompt,
test_info.audio_idx_to_prompt,
test_info.prompt_formatter, test_info.prompt_formatter,
) )
...@@ -213,10 +234,14 @@ def build_video_inputs_from_test_info( ...@@ -213,10 +234,14 @@ def build_video_inputs_from_test_info(
video_scaler = (resize_video if size_wrapper.type == SizeType.FIXED_SIZE video_scaler = (resize_video if size_wrapper.type == SizeType.FIXED_SIZE
else rescale_video_size) else rescale_video_size)
return [( return [
[prompt for _ in size_wrapper.data], PromptWithMultiModalInput(
[video_scaler(video, size) for size in size_wrapper.data], prompts=[prompt for _ in size_wrapper.data],
) for video, prompt in zip(sampled_vids, model_prompts)] video_data=[
video_scaler(video, size) for size in size_wrapper.data
],
) for video, prompt in zip(sampled_vids, model_prompts)
]
def apply_image_size_scaling(image, size: Union[float, tuple[int, int]], def apply_image_size_scaling(image, size: Union[float, tuple[int, int]],
...@@ -236,3 +261,37 @@ def apply_image_size_scaling(image, size: Union[float, tuple[int, int]], ...@@ -236,3 +261,37 @@ def apply_image_size_scaling(image, size: Union[float, tuple[int, int]],
# We have a list of fixed sizes # We have a list of fixed sizes
return image.resize(size) return image.resize(size)
raise ValueError("ImageSizeWrapper type must be FIXED_SIZE or SIZE_FACTOR") raise ValueError("ImageSizeWrapper type must be FIXED_SIZE or SIZE_FACTOR")
def build_audio_inputs_from_test_info(
    test_info: VLMTestInfo,
    audio_assets: AudioTestAssets,
) -> list[PromptWithMultiModalInput]:
    """Build the audio test inputs for the model described by test_info.

    The audio base prompt(s) are converted to model prompts through
    ``get_model_prompts`` and every audio asset is resampled to 16 kHz
    with the "librosa" resampling method.

    Returns:
        A single-element list whose only entry carries all model prompts
        together with all ``(resampled_audio, sample_rate)`` pairs.

    Raises:
        ValueError: If ``test_info.prompt_formatter`` is None.
    """
    formatter = test_info.prompt_formatter
    if formatter is None:
        raise ValueError("Prompt formatter must be set to build audio inputs")

    model_prompts = get_model_prompts(
        SINGLE_AUDIO_BASE_PROMPT,
        test_info.img_idx_to_prompt,
        test_info.video_idx_to_prompt,
        test_info.audio_idx_to_prompt,
        formatter,
    )

    resampler = AudioResampler(target_sr=16000, method="librosa")
    # Every resampled clip is reported at the resampler's target rate.
    target_sr = int(resampler.target_sr)

    resampled_audios = []
    for asset in audio_assets:
        waveform, source_sr = asset.audio_and_sample_rate
        resampled_audios.append(
            (resampler.resample(waveform, orig_sr=source_sr), target_sr))

    return [
        PromptWithMultiModalInput(
            prompts=model_prompts,
            audio_data=resampled_audios,
        )
    ]
...@@ -83,7 +83,7 @@ def get_parametrized_options(test_settings: dict[str, VLMTestInfo], ...@@ -83,7 +83,7 @@ def get_parametrized_options(test_settings: dict[str, VLMTestInfo],
test_info.num_video_frames) test_info.num_video_frames)
# No sizes passed for custom inputs, since inputs are directly provided # No sizes passed for custom inputs, since inputs are directly provided
if test_type != VLMTestType.CUSTOM_INPUTS: if test_type not in (VLMTestType.CUSTOM_INPUTS, VLMTestType.AUDIO):
wrapped_sizes = get_wrapped_test_sizes(test_info, test_type) wrapped_sizes = get_wrapped_test_sizes(test_info, test_type)
if wrapped_sizes is None: if wrapped_sizes is None:
raise ValueError( raise ValueError(
...@@ -91,7 +91,7 @@ def get_parametrized_options(test_settings: dict[str, VLMTestInfo], ...@@ -91,7 +91,7 @@ def get_parametrized_options(test_settings: dict[str, VLMTestInfo],
iter_kwargs["size_wrapper"] = wrapped_sizes iter_kwargs["size_wrapper"] = wrapped_sizes
#Otherwise expand the custom test options instead #Otherwise expand the custom test options instead
else: elif test_type == VLMTestType.CUSTOM_INPUTS:
if test_info.custom_test_opts is None: if test_info.custom_test_opts is None:
raise ValueError("Test has type CUSTOM_INPUTS, but none given") raise ValueError("Test has type CUSTOM_INPUTS, but none given")
iter_kwargs["custom_test_opts"] = test_info.custom_test_opts iter_kwargs["custom_test_opts"] = test_info.custom_test_opts
...@@ -136,8 +136,8 @@ def get_wrapped_test_sizes( ...@@ -136,8 +136,8 @@ def get_wrapped_test_sizes(
ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor) ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
for factor in EMBEDDING_SIZE_FACTORS for factor in EMBEDDING_SIZE_FACTORS
]) ])
# Custom inputs have preprocessed inputs # Audio and Custom inputs have preprocessed inputs
elif test_type == VLMTestType.CUSTOM_INPUTS: elif test_type in (VLMTestType.AUDIO, VLMTestType.CUSTOM_INPUTS):
return tuple() return tuple()
size_factors = test_info.image_size_factors \ size_factors = test_info.image_size_factors \
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
"""Core test implementation to be shared across modalities.""" """Core test implementation to be shared across modalities."""
from typing import Any, Callable, Optional, Union from typing import Any, Callable, Optional
import torch import torch
from PIL.Image import Image
from transformers.models.auto.auto_factory import _BaseAutoModelClass from transformers.models.auto.auto_factory import _BaseAutoModelClass
from vllm.config import TaskOption from vllm.config import TaskOption
...@@ -11,14 +10,14 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer ...@@ -11,14 +10,14 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer
from .....conftest import HfRunner, VllmRunner from .....conftest import HfRunner, VllmRunner
from ....registry import HF_EXAMPLE_MODELS from ....registry import HF_EXAMPLE_MODELS
from .types import RunnerOutput from .types import PromptWithMultiModalInput, RunnerOutput
def run_test( def run_test(
*, *,
hf_runner: type[HfRunner], hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
inputs: list[tuple[list[str], list[Union[list[Image], Image]]]], inputs: list[PromptWithMultiModalInput],
model: str, model: str,
dtype: str, dtype: str,
max_tokens: int, max_tokens: int,
...@@ -38,7 +37,6 @@ def run_test( ...@@ -38,7 +37,6 @@ def run_test(
hf_model_kwargs: Optional[dict[str, Any]], hf_model_kwargs: Optional[dict[str, Any]],
patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]], patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]],
task: TaskOption = "auto", task: TaskOption = "auto",
runner_mm_key: str = "images",
distributed_executor_backend: Optional[str] = None, distributed_executor_backend: Optional[str] = None,
tensor_parallel_size: int = 1, tensor_parallel_size: int = 1,
vllm_embeddings: Optional[torch.Tensor] = None, vllm_embeddings: Optional[torch.Tensor] = None,
...@@ -67,7 +65,7 @@ def run_test( ...@@ -67,7 +65,7 @@ def run_test(
"disable_mm_preprocessor_cache": True, "disable_mm_preprocessor_cache": True,
} }
if model_info.tokenizer: if model_info.tokenizer:
vllm_runner_kwargs_["tokenizer"] = model_info.tokenizer vllm_runner_kwargs_["tokenizer_name"] = model_info.tokenizer
if model_info.tokenizer_mode: if model_info.tokenizer_mode:
vllm_runner_kwargs_["tokenizer_mode"] = model_info.tokenizer_mode vllm_runner_kwargs_["tokenizer_mode"] = model_info.tokenizer_mode
if model_info.hf_overrides: if model_info.hf_overrides:
...@@ -94,10 +92,16 @@ def run_test( ...@@ -94,10 +92,16 @@ def run_test(
if stop_str: if stop_str:
vllm_kwargs["stop"] = stop_str vllm_kwargs["stop"] = stop_str
for prompts, media in vllm_inputs: for prompts, image_data, video_data, audio_data in vllm_inputs:
vllm_kwargs[runner_mm_key] = media mm_data = dict(images=image_data,
videos=video_data,
audios=audio_data)
vllm_kwargs_with_mm_data = vllm_kwargs | mm_data
vllm_output = vllm_model.generate_greedy_logprobs( vllm_output = vllm_model.generate_greedy_logprobs(
prompts, max_tokens, num_logprobs=num_logprobs, **vllm_kwargs) prompts,
max_tokens,
num_logprobs=num_logprobs,
**vllm_kwargs_with_mm_data)
vllm_outputs_per_mm.append(vllm_output) vllm_outputs_per_mm.append(vllm_output)
hf_model = hf_runner(model, hf_model = hf_runner(model,
...@@ -122,14 +126,17 @@ def run_test( ...@@ -122,14 +126,17 @@ def run_test(
if stop_str: if stop_str:
hf_kwargs["stop_strings"] = stop_str hf_kwargs["stop_strings"] = stop_str
for prompts, media in inputs: for prompts, image_data, video_data, audio_data in inputs:
hf_kwargs[runner_mm_key] = media mm_data = dict(images=image_data,
videos=video_data,
audios=audio_data)
hf_kwargs_with_mm_data = hf_kwargs | mm_data
hf_output = hf_model.generate_greedy_logprobs_limit( hf_output = hf_model.generate_greedy_logprobs_limit(
prompts, prompts,
max_tokens, max_tokens,
num_logprobs=num_logprobs, num_logprobs=num_logprobs,
tokenizer=tokenizer, tokenizer=tokenizer,
**hf_kwargs) **hf_kwargs_with_mm_data)
hf_outputs_per_mm.append(hf_output) hf_outputs_per_mm.append(hf_output)
# Apply output processing / sanitation to the vLLM and HF runner results # Apply output processing / sanitation to the vLLM and HF runner results
......
...@@ -12,7 +12,7 @@ from vllm.multimodal.video import (rescale_video_size, resize_video, ...@@ -12,7 +12,7 @@ from vllm.multimodal.video import (rescale_video_size, resize_video,
from .....conftest import IMAGE_ASSETS, VIDEO_ASSETS from .....conftest import IMAGE_ASSETS, VIDEO_ASSETS
from .builders import build_multi_image_inputs, build_single_image_inputs from .builders import build_multi_image_inputs, build_single_image_inputs
from .types import ImageSizeWrapper, SizeType from .types import ImageSizeWrapper, PromptWithMultiModalInput, SizeType
def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]): def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
...@@ -32,24 +32,28 @@ def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]): ...@@ -32,24 +32,28 @@ def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
"<image>\nWhat is the season?", "<image>\nWhat is the season?",
] ]
formatted_prompts = [formatter(prompt) for prompt in img_prompts] formatted_prompts = [formatter(prompt) for prompt in img_prompts]
aspect_ratio_images = [
return [( [stop_sign, cherry_blossom],
formatted_prompts, # Images with different sizes and aspect-ratios
[
rescale_image_size(stop_sign, 0.1),
stop_sign,
],
[ [
[stop_sign, cherry_blossom], stop_sign,
# Images with different sizes and aspect-ratios rescale_image_size(stop_sign, 0.25),
[ cherry_blossom.resize((183, 488)),
rescale_image_size(stop_sign, 0.1), cherry_blossom.resize((488, 183))
stop_sign, ],
], cherry_blossom,
[ ]
stop_sign,
rescale_image_size(stop_sign, 0.25), return [
cherry_blossom.resize((183, 488)), PromptWithMultiModalInput(
cherry_blossom.resize((488, 183)) prompts=formatted_prompts,
], image_data=aspect_ratio_images,
cherry_blossom, )
])] ]
def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str], def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str],
...@@ -68,24 +72,28 @@ def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str], ...@@ -68,24 +72,28 @@ def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str],
"<video>\nWhy is this video funny?", "<video>\nWhy is this video funny?",
] ]
formatted_prompts = [formatter(prompt) for prompt in video_prompts] formatted_prompts = [formatter(prompt) for prompt in video_prompts]
aspect_ratio_videos = [
return [( [video, video],
formatted_prompts, # Videos with different sizes and aspect-ratios
[ [
[video, video], rescale_video_size(video, 0.1),
# Videos with different sizes and aspect-ratios
[
rescale_video_size(video, 0.1),
video,
],
[
video,
rescale_video_size(video, 0.25),
resize_video(video, (183, 488)),
resize_video(video, (488, 183))
],
video, video,
])] ],
[
video,
rescale_video_size(video, 0.25),
resize_video(video, (183, 488)),
resize_video(video, (488, 183))
],
video,
]
return [
PromptWithMultiModalInput(
prompts=formatted_prompts,
video_data=aspect_ratio_videos,
)
]
def different_patch_input_cases_internvl(): def different_patch_input_cases_internvl():
......
...@@ -16,7 +16,7 @@ from transformers import (AutoConfig, AutoTokenizer, BatchFeature, ...@@ -16,7 +16,7 @@ from transformers import (AutoConfig, AutoTokenizer, BatchFeature,
from vllm.sequence import SampleLogprobs from vllm.sequence import SampleLogprobs
from vllm.transformers_utils.tokenizer import patch_padding_side from vllm.transformers_utils.tokenizer import patch_padding_side
from .....conftest import HfRunner, ImageAsset, _ImageAssets from .....conftest import HfRunner, ImageAsset, ImageTestAssets
from .types import RunnerOutput from .types import RunnerOutput
...@@ -229,15 +229,35 @@ def minicpmv_trunc_hf_output(hf_output: RunnerOutput, ...@@ -229,15 +229,35 @@ def minicpmv_trunc_hf_output(hf_output: RunnerOutput,
return output_ids, output_str, out_logprobs return output_ids, output_str, out_logprobs
def minimax_vl_01_hf_output(hf_output: RunnerOutput,
                            model: str) -> RunnerOutput:
    """Trim the ``<end_of_sentence>`` marker from an HF runner output.

    Only the output string is touched; the token ids and logprobs are
    returned as received. ``model`` is not used by this function.

    When the string ends with the marker, everything from the *first*
    occurrence of the marker onwards is dropped.
    """
    eos_marker = "<end_of_sentence>"
    token_ids, text, logprobs = hf_output

    if not text.endswith(eos_marker):
        return token_ids, text, logprobs

    # partition() cuts at the first marker, like split(marker)[0] would.
    return token_ids, text.partition(eos_marker)[0], logprobs
def ultravox_trunc_hf_output(hf_output: RunnerOutput,
                             model: str) -> RunnerOutput:
    """Trim the tokenizer's EOS text from an HF runner output string.

    The tokenizer for ``model`` is loaded with ``AutoTokenizer`` and its
    EOS token id is decoded to text. When the output string ends with
    that text, everything from its first occurrence onwards is dropped.
    The token ids and logprobs are returned as received.
    """
    token_ids, text, logprobs = hf_output

    hf_tokenizer = AutoTokenizer.from_pretrained(model)
    eos_text = hf_tokenizer.decode(hf_tokenizer.eos_token_id)

    if text.endswith(eos_text):
        # Cut at the first occurrence, like split(eos_text)[0] would.
        text = text.partition(eos_text)[0]

    return token_ids, text, logprobs
####### Functions for converting image assets to embeddings ####### Functions for converting image assets to embeddings
def get_llava_embeddings(image_assets: _ImageAssets): def get_llava_embeddings(image_assets: ImageTestAssets):
return [asset.image_embeds for asset in image_assets] return [asset.image_embeds for asset in image_assets]
####### Prompt path encoders for models that need models on disk ####### Prompt path encoders for models that need models on disk
def qwen_prompt_path_encoder( def qwen_prompt_path_encoder(
tmp_path: PosixPath, prompt: str, assets: Union[list[ImageAsset], tmp_path: PosixPath, prompt: str,
_ImageAssets]) -> str: assets: Union[list[ImageAsset], ImageTestAssets]) -> str:
"""Given a temporary dir path, export one or more image assets into the """Given a temporary dir path, export one or more image assets into the
tempdir & replace its contents with the local path to the string so that tempdir & replace its contents with the local path to the string so that
the HF version of Qwen-VL can resolve the path and load the image in its the HF version of Qwen-VL can resolve the path and load the image in its
...@@ -627,6 +647,17 @@ def minicpmv_26_patch_hf_runner(hf_model: HfRunner) -> HfRunner: ...@@ -627,6 +647,17 @@ def minicpmv_26_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
return hf_model return hf_model
def minimax_vl_01_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    """Patch ``hf_model.model.generate`` and return the same runner.

    The replacement forwards positional and keyword arguments to the
    original ``generate``, always adds ``decode_text=False`` and discards
    any ``image_sizes`` keyword instead of forwarding it.
    """
    model = hf_model.model
    wrapped_generate = model.generate

    def _generate(self, *args, image_sizes=None, **kwargs):
        # image_sizes is accepted only so that it is not passed through.
        return wrapped_generate(*args, decode_text=False, **kwargs)

    model.generate = types.MethodType(_generate, model)
    return hf_model
def molmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner: def molmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
"""Patches and returns an instance of the HfRunner to use for Molmo.""" """Patches and returns an instance of the HfRunner to use for Molmo."""
hf_processor = hf_model.processor hf_processor = hf_model.processor
...@@ -657,3 +688,46 @@ def molmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner: ...@@ -657,3 +688,46 @@ def molmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
hf_model.model.generate = types.MethodType(_generate, hf_model.model) hf_model.model.generate = types.MethodType(_generate, hf_model.model)
return hf_model return hf_model
def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    """Patches and returns an instance of the HfRunner to use for Ovis2.

    Two things are replaced on the runner:

    * ``hf_model.model.get_output_embeddings`` is pointed at the output
      embeddings of ``hf_model.model.llm``.
    * ``hf_model.processor`` becomes a function that strips the user-turn
      chat wrapper from the prompt (when one of the known templates is
      found), runs the model's own ``preprocess_inputs`` and packs the
      result into a ``BatchFeature``.
    """
    # Looked up on every call so it follows whatever model is attached.
    hf_model.model.get_output_embeddings = lambda: \
        hf_model.model.llm.get_output_embeddings()

    # (start, end) delimiters of the user turn, in lookup order:
    # qwen2, llama, gemma2.
    user_turn_delimiters = (
        ("<|im_start|>user\n", "<|im_end|>\n"),
        ("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
        ("<start_of_turn>user\n", "<end_of_turn>\n"),
    )

    def processor(*args, text="", images=None, **kwargs):
        tokenizer = hf_model.model.get_text_tokenizer()
        if isinstance(images, Image):
            images = [images]

        # Keep only the text between the first matching pair of
        # delimiters; prompts without a known wrapper are left untouched.
        for turn_start, turn_end in user_turn_delimiters:
            if turn_start in text and turn_end in text:
                text = text.split(turn_start)[1].split(turn_end)[0]
                break

        _, input_ids, pixel_values = hf_model.model.preprocess_inputs(
            text_or_conversations=text, images=images)
        attention_mask = torch.ne(input_ids, tokenizer.pad_token_id)

        # Each tensor gets a new leading dimension via unsqueeze(0).
        return BatchFeature(
            data={
                "inputs": input_ids.unsqueeze(0),
                "pixel_values": pixel_values.unsqueeze(0),
                "attention_mask": attention_mask.unsqueeze(0),
            },
            tensor_type="pt",
        )

    hf_model.processor = processor
    return hf_model
def qwen2_5_omni_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    """Patches and returns an instance of the HfRunner for Qwen2.5-Omni.

    The runner's model is replaced by its ``thinker`` sub-model, and that
    sub-model gets a ``get_output_embeddings`` that returns its
    ``lm_head``.
    """
    thinker_model = hf_model.model.thinker

    def _thinker_lm_head():
        # Read at call time so a later lm_head replacement is honoured.
        return thinker_model.lm_head

    thinker_model.get_output_embeddings = _thinker_lm_head
    hf_model.model = thinker_model
    return hf_model
...@@ -4,7 +4,8 @@ types / modalities. ...@@ -4,7 +4,8 @@ types / modalities.
""" """
from pathlib import PosixPath from pathlib import PosixPath
from .....conftest import HfRunner, VllmRunner, _ImageAssets, _VideoAssets from .....conftest import (AudioTestAssets, HfRunner, ImageTestAssets,
VideoTestAssets, VllmRunner)
from . import builders, core from . import builders, core
from .types import ExpandableVLMTestArgs, VLMTestInfo from .types import ExpandableVLMTestArgs, VLMTestInfo
...@@ -14,7 +15,7 @@ def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo, ...@@ -14,7 +15,7 @@ def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs, test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner], hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
image_assets: _ImageAssets): image_assets: ImageTestAssets):
assert test_case.size_wrapper is not None assert test_case.size_wrapper is not None
inputs = builders.build_single_image_inputs_from_test_info( inputs = builders.build_single_image_inputs_from_test_info(
model_test_info, image_assets, test_case.size_wrapper, tmp_path) model_test_info, image_assets, test_case.size_wrapper, tmp_path)
...@@ -29,7 +30,6 @@ def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo, ...@@ -29,7 +30,6 @@ def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
num_logprobs=test_case.num_logprobs, num_logprobs=test_case.num_logprobs,
limit_mm_per_prompt={"image": 1}, limit_mm_per_prompt={"image": 1},
distributed_executor_backend=test_case.distributed_executor_backend, distributed_executor_backend=test_case.distributed_executor_backend,
runner_mm_key="images",
**model_test_info.get_non_parametrized_runner_kwargs()) **model_test_info.get_non_parametrized_runner_kwargs())
...@@ -37,7 +37,7 @@ def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo, ...@@ -37,7 +37,7 @@ def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs, test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner], hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
image_assets: _ImageAssets): image_assets: ImageTestAssets):
assert test_case.size_wrapper is not None assert test_case.size_wrapper is not None
inputs = builders.build_multi_image_inputs_from_test_info( inputs = builders.build_multi_image_inputs_from_test_info(
model_test_info, image_assets, test_case.size_wrapper, tmp_path) model_test_info, image_assets, test_case.size_wrapper, tmp_path)
...@@ -52,7 +52,6 @@ def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo, ...@@ -52,7 +52,6 @@ def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
num_logprobs=test_case.num_logprobs, num_logprobs=test_case.num_logprobs,
limit_mm_per_prompt={"image": len(image_assets)}, limit_mm_per_prompt={"image": len(image_assets)},
distributed_executor_backend=test_case.distributed_executor_backend, distributed_executor_backend=test_case.distributed_executor_backend,
runner_mm_key="images",
**model_test_info.get_non_parametrized_runner_kwargs()) **model_test_info.get_non_parametrized_runner_kwargs())
...@@ -60,7 +59,7 @@ def run_embedding_test(*, model_test_info: VLMTestInfo, ...@@ -60,7 +59,7 @@ def run_embedding_test(*, model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs, test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner], hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
image_assets: _ImageAssets): image_assets: ImageTestAssets):
assert test_case.size_wrapper is not None assert test_case.size_wrapper is not None
inputs, vllm_embeddings = builders.build_embedding_inputs_from_test_info( inputs, vllm_embeddings = builders.build_embedding_inputs_from_test_info(
model_test_info, image_assets, test_case.size_wrapper) model_test_info, image_assets, test_case.size_wrapper)
...@@ -76,7 +75,6 @@ def run_embedding_test(*, model_test_info: VLMTestInfo, ...@@ -76,7 +75,6 @@ def run_embedding_test(*, model_test_info: VLMTestInfo,
limit_mm_per_prompt={"image": 1}, limit_mm_per_prompt={"image": 1},
vllm_embeddings=vllm_embeddings, vllm_embeddings=vllm_embeddings,
distributed_executor_backend=test_case.distributed_executor_backend, distributed_executor_backend=test_case.distributed_executor_backend,
runner_mm_key="images",
**model_test_info.get_non_parametrized_runner_kwargs()) **model_test_info.get_non_parametrized_runner_kwargs())
...@@ -86,7 +84,7 @@ def run_video_test( ...@@ -86,7 +84,7 @@ def run_video_test(
test_case: ExpandableVLMTestArgs, test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner], hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
video_assets: _VideoAssets, video_assets: VideoTestAssets,
): ):
assert test_case.size_wrapper is not None assert test_case.size_wrapper is not None
assert test_case.num_video_frames is not None assert test_case.num_video_frames is not None
...@@ -104,7 +102,30 @@ def run_video_test( ...@@ -104,7 +102,30 @@ def run_video_test(
num_logprobs=test_case.num_logprobs, num_logprobs=test_case.num_logprobs,
limit_mm_per_prompt={"video": len(video_assets)}, limit_mm_per_prompt={"video": len(video_assets)},
distributed_executor_backend=test_case.distributed_executor_backend, distributed_executor_backend=test_case.distributed_executor_backend,
runner_mm_key="videos", **model_test_info.get_non_parametrized_runner_kwargs())
def run_audio_test(
    *,
    model_test_info: VLMTestInfo,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    audio_assets: AudioTestAssets,
):
    """Run the audio test for one expanded test case.

    The audio inputs are built from ``model_test_info`` and ``audio_assets``
    and handed to ``core.run_test`` together with the per-case settings read
    from ``test_case`` and the non-parametrized runner kwargs exposed by
    ``model_test_info``.
    """
    audio_inputs = builders.build_audio_inputs_from_test_info(
        model_test_info, audio_assets)

    # Settings that come from the (parametrized) test case; the audio limit
    # is fixed at one item per prompt for this test type.
    case_kwargs = {
        "model": test_case.model,
        "dtype": test_case.dtype,
        "max_tokens": test_case.max_tokens,
        "num_logprobs": test_case.num_logprobs,
        "limit_mm_per_prompt": {"audio": 1},
        "distributed_executor_backend": test_case.distributed_executor_backend,
    }
    shared_kwargs = model_test_info.get_non_parametrized_runner_kwargs()

    # Both mappings are unpacked in the call (rather than merged beforehand)
    # so a duplicated key still raises instead of being silently overridden.
    core.run_test(
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        inputs=audio_inputs,
        **case_kwargs,
        **shared_kwargs,
    )
...@@ -119,11 +140,9 @@ def run_custom_inputs_test(*, model_test_info: VLMTestInfo, ...@@ -119,11 +140,9 @@ def run_custom_inputs_test(*, model_test_info: VLMTestInfo,
inputs = test_case.custom_test_opts.inputs inputs = test_case.custom_test_opts.inputs
limit_mm_per_prompt = test_case.custom_test_opts.limit_mm_per_prompt limit_mm_per_prompt = test_case.custom_test_opts.limit_mm_per_prompt
runner_mm_key = test_case.custom_test_opts.runner_mm_key # Inputs and limit_mm_per_prompt should all be set
# Inputs, limit_mm_per_prompt, and runner_mm_key should all be set
assert inputs is not None assert inputs is not None
assert limit_mm_per_prompt is not None assert limit_mm_per_prompt is not None
assert runner_mm_key is not None
core.run_test( core.run_test(
hf_runner=hf_runner, hf_runner=hf_runner,
...@@ -135,5 +154,4 @@ def run_custom_inputs_test(*, model_test_info: VLMTestInfo, ...@@ -135,5 +154,4 @@ def run_custom_inputs_test(*, model_test_info: VLMTestInfo,
num_logprobs=test_case.num_logprobs, num_logprobs=test_case.num_logprobs,
limit_mm_per_prompt=limit_mm_per_prompt, limit_mm_per_prompt=limit_mm_per_prompt,
distributed_executor_backend=test_case.distributed_executor_backend, distributed_executor_backend=test_case.distributed_executor_backend,
runner_mm_key=runner_mm_key,
**model_test_info.get_non_parametrized_runner_kwargs()) **model_test_info.get_non_parametrized_runner_kwargs())
...@@ -6,7 +6,6 @@ from pathlib import PosixPath ...@@ -6,7 +6,6 @@ from pathlib import PosixPath
from typing import Any, Callable, NamedTuple, Optional, Union from typing import Any, Callable, NamedTuple, Optional, Union
import torch import torch
from PIL.Image import Image
from pytest import MarkDecorator from pytest import MarkDecorator
from transformers import AutoModelForCausalLM from transformers import AutoModelForCausalLM
from transformers.models.auto.auto_factory import _BaseAutoModelClass from transformers.models.auto.auto_factory import _BaseAutoModelClass
...@@ -15,18 +14,25 @@ from vllm.config import TaskOption ...@@ -15,18 +14,25 @@ from vllm.config import TaskOption
from vllm.sequence import SampleLogprobs from vllm.sequence import SampleLogprobs
from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer import AnyTokenizer
from .....conftest import IMAGE_ASSETS, HfRunner, ImageAsset, _ImageAssets from .....conftest import (AUDIO_ASSETS, IMAGE_ASSETS, HfRunner, ImageAsset,
ImageTestAssets, PromptAudioInput, PromptImageInput,
PromptVideoInput)
from ....utils import check_logprobs_close from ....utils import check_logprobs_close
# meta image tag; will be replaced by the appropriate tag for the model # meta image tag; will be replaced by the appropriate tag for the model
TEST_IMG_PLACEHOLDER = "<vlm_image>" TEST_IMG_PLACEHOLDER = "<vlm_image>"
TEST_VIDEO_PLACEHOLDER = "<vlm_video>" TEST_VIDEO_PLACEHOLDER = "<vlm_video>"
TEST_AUDIO_PLACEHOLDER = "<lmm_audio>"
# yapf: disable # yapf: disable
SINGLE_IMAGE_BASE_PROMPTS = IMAGE_ASSETS.prompts({ SINGLE_IMAGE_BASE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign": f"{TEST_IMG_PLACEHOLDER}What's the content of the image?", "stop_sign": f"{TEST_IMG_PLACEHOLDER}What's the content of the image?",
"cherry_blossom": f"{TEST_IMG_PLACEHOLDER}What is the season?", "cherry_blossom": f"{TEST_IMG_PLACEHOLDER}What is the season?",
}) })
SINGLE_AUDIO_BASE_PROMPT = AUDIO_ASSETS.prompts({
"mary_had_lamb": f"{TEST_AUDIO_PLACEHOLDER}Transcribe this audio into English.", # noqa: E501
"winning_call": f"{TEST_AUDIO_PLACEHOLDER}What is happening in this audio clip?", # noqa: E501
})
MULTI_IMAGE_BASE_PROMPT = f"Image-1: {TEST_IMG_PLACEHOLDER}Image-2: {TEST_IMG_PLACEHOLDER}Describe the two images in detail.\n" # noqa: E501 MULTI_IMAGE_BASE_PROMPT = f"Image-1: {TEST_IMG_PLACEHOLDER}Image-2: {TEST_IMG_PLACEHOLDER}Describe the two images in detail.\n" # noqa: E501
VIDEO_BASE_PROMPT = f"{TEST_VIDEO_PLACEHOLDER}Why is this video funny?" VIDEO_BASE_PROMPT = f"{TEST_VIDEO_PLACEHOLDER}Why is this video funny?"
...@@ -38,12 +44,21 @@ RunnerOutput = tuple[list[int], str, Optional[SampleLogprobs]] ...@@ -38,12 +44,21 @@ RunnerOutput = tuple[list[int], str, Optional[SampleLogprobs]]
# yapf: enable # yapf: enable
class PromptWithMultiModalInput(NamedTuple):
    """Holds the multimodal input for a single test case.

    Only ``prompts`` is required; every modality field defaults to ``None``.
    """
    # Text prompts for the test case.
    prompts: list[str]
    # Optional per-modality payloads.
    # NOTE(review): presumably each payload lines up one-to-one with
    # `prompts` -- confirm against the builders that create these tuples.
    image_data: Optional[PromptImageInput] = None
    video_data: Optional[PromptVideoInput] = None
    audio_data: Optional[PromptAudioInput] = None
class VLMTestType(Enum): class VLMTestType(Enum):
IMAGE = 1 IMAGE = 1
MULTI_IMAGE = 2 MULTI_IMAGE = 2
EMBEDDING = 3 EMBEDDING = 3
VIDEO = 4 VIDEO = 4
CUSTOM_INPUTS = 5 AUDIO = 5
CUSTOM_INPUTS = 6
class SizeType(Enum): class SizeType(Enum):
...@@ -52,10 +67,8 @@ class SizeType(Enum): ...@@ -52,10 +67,8 @@ class SizeType(Enum):
class CustomTestOptions(NamedTuple): class CustomTestOptions(NamedTuple):
inputs: list[tuple[list[str], list[Union[list[Image], Image]]]] inputs: list[PromptWithMultiModalInput]
limit_mm_per_prompt: dict[str, int] limit_mm_per_prompt: dict[str, int]
# kwarg to pass multimodal data in as to vllm/hf runner instances.
runner_mm_key: str = "images"
class ImageSizeWrapper(NamedTuple): class ImageSizeWrapper(NamedTuple):
...@@ -75,6 +88,7 @@ class VLMTestInfo(NamedTuple): ...@@ -75,6 +88,7 @@ class VLMTestInfo(NamedTuple):
prompt_formatter: Optional[Callable[[str], str]] = None prompt_formatter: Optional[Callable[[str], str]] = None
img_idx_to_prompt: Callable[[int], str] = lambda idx: "<image>\n" img_idx_to_prompt: Callable[[int], str] = lambda idx: "<image>\n"
video_idx_to_prompt: Callable[[int], str] = lambda idx: "<video>\n" video_idx_to_prompt: Callable[[int], str] = lambda idx: "<video>\n"
audio_idx_to_prompt: Callable[[int], str] = lambda idx: "<audio>\n"
# Most models work on the single / multi-image prompts above, but in some # Most models work on the single / multi-image prompts above, but in some
# cases the log prob check fails, e.g., for paligemma. We allow passing # cases the log prob check fails, e.g., for paligemma. We allow passing
...@@ -85,7 +99,7 @@ class VLMTestInfo(NamedTuple): ...@@ -85,7 +99,7 @@ class VLMTestInfo(NamedTuple):
# Function for converting ImageAssets to image embeddings; # Function for converting ImageAssets to image embeddings;
# We need to define this explicitly for embedding tests # We need to define this explicitly for embedding tests
convert_assets_to_embeddings: Optional[Callable[[_ImageAssets], convert_assets_to_embeddings: Optional[Callable[[ImageTestAssets],
torch.Tensor]] = None torch.Tensor]] = None
# Exposed options for vLLM runner; we change these in several tests, # Exposed options for vLLM runner; we change these in several tests,
...@@ -141,7 +155,7 @@ class VLMTestInfo(NamedTuple): ...@@ -141,7 +155,7 @@ class VLMTestInfo(NamedTuple):
# for Qwen-VL, which requires encoding the image path / url into the prompt # for Qwen-VL, which requires encoding the image path / url into the prompt
# for HF runner # for HF runner
prompt_path_encoder: Optional[ prompt_path_encoder: Optional[
Callable[[PosixPath, str, Union[list[ImageAsset], _ImageAssets]], Callable[[PosixPath, str, Union[list[ImageAsset], ImageTestAssets]],
str]] = None # noqa: E501 str]] = None # noqa: E501
# Allows configuring a test to run with custom inputs # Allows configuring a test to run with custom inputs
......
...@@ -10,7 +10,7 @@ from transformers import Qwen2VLForConditionalGeneration ...@@ -10,7 +10,7 @@ from transformers import Qwen2VLForConditionalGeneration
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from ....utils import large_gpu_test from ....utils import large_gpu_test
from ..utils import check_embeddings_close from ...utils import check_embeddings_close
HF_TEXT_PROMPTS = [ HF_TEXT_PROMPTS = [
# T -> X # T -> X
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from typing import Optional
import pytest import pytest
import torch import torch
import torch.nn as nn import torch.nn as nn
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
from transformers import AutoConfig, AutoModel, CLIPImageProcessor from transformers import AutoConfig, AutoModel, CLIPImageProcessor
from ....conftest import _ImageAssets from vllm.distributed import cleanup_dist_env_and_memory
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
from ....conftest import ImageTestAssets
# we use snapshot_download to prevent conflicts between # we use snapshot_download to prevent conflicts between
# dynamic_module and trust_remote_code for hf_runner # dynamic_module and trust_remote_code for hf_runner
DOWNLOAD_PATTERN = ["*.json", "*.py", "*.safetensors", "*.txt", "*.model"] DOWNLOAD_PATTERN = ["*.json", "*.py", "*.safetensors", "*.txt", "*.model"]
@torch.inference_mode()
def run_intern_vit_test( def run_intern_vit_test(
image_assets: _ImageAssets, image_assets: ImageTestAssets,
model_id: str, model_id: str,
*, *,
dtype: str, dtype: str,
distributed_executor_backend: Optional[str] = None,
): ):
model = snapshot_download(model_id, allow_patterns=DOWNLOAD_PATTERN) model = snapshot_download(model_id, allow_patterns=DOWNLOAD_PATTERN)
torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
img_processor = CLIPImageProcessor.from_pretrained(model) img_processor = CLIPImageProcessor.from_pretrained(model)
images = [asset.pil_image for asset in image_assets] images = [asset.pil_image for asset in image_assets]
pixel_values = [ pixel_values = [
img_processor(images, return_tensors='pt').pixel_values.to(dtype) img_processor(images, return_tensors='pt').pixel_values.to(torch_dtype)
for images in images for images in images
] ]
...@@ -36,14 +37,13 @@ def run_intern_vit_test( ...@@ -36,14 +37,13 @@ def run_intern_vit_test(
config.norm_type = "rms_norm" config.norm_type = "rms_norm"
hf_model = AutoModel.from_pretrained(model, hf_model = AutoModel.from_pretrained(model,
torch_dtype=dtype, torch_dtype=torch_dtype,
trust_remote_code=True).to("cuda") trust_remote_code=True).to("cuda")
hf_outputs_per_image = [ hf_outputs_per_image = [
hf_model(pixel_value.to("cuda")).last_hidden_state hf_model(pixel_value.to("cuda")).last_hidden_state
for pixel_value in pixel_values for pixel_value in pixel_values
] ]
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.model_executor.models.intern_vit import InternVisionModel from vllm.model_executor.models.intern_vit import InternVisionModel
vllm_model = InternVisionModel(config) vllm_model = InternVisionModel(config)
vllm_model.load_weights(hf_model.state_dict().items()) vllm_model.load_weights(hf_model.state_dict().items())
...@@ -51,7 +51,7 @@ def run_intern_vit_test( ...@@ -51,7 +51,7 @@ def run_intern_vit_test(
del hf_model del hf_model
cleanup_dist_env_and_memory() cleanup_dist_env_and_memory()
vllm_model = vllm_model.to("cuda", dtype) vllm_model = vllm_model.to("cuda", torch_dtype)
vllm_outputs_per_image = [ vllm_outputs_per_image = [
vllm_model(pixel_values=pixel_value.to("cuda")) vllm_model(pixel_values=pixel_value.to("cuda"))
for pixel_value in pixel_values for pixel_value in pixel_values
...@@ -69,8 +69,7 @@ def run_intern_vit_test( ...@@ -69,8 +69,7 @@ def run_intern_vit_test(
"OpenGVLab/InternViT-300M-448px", "OpenGVLab/InternViT-300M-448px",
"OpenGVLab/InternViT-6B-448px-V1-5", "OpenGVLab/InternViT-6B-448px-V1-5",
]) ])
@pytest.mark.parametrize("dtype", [torch.half]) @pytest.mark.parametrize("dtype", ["half"])
@torch.inference_mode()
def test_models(dist_init, image_assets, model_id, dtype: str) -> None: def test_models(dist_init, image_assets, model_id, dtype: str) -> None:
run_intern_vit_test( run_intern_vit_test(
image_assets, image_assets,
......
...@@ -8,7 +8,7 @@ from vllm.platforms import current_platform ...@@ -8,7 +8,7 @@ from vllm.platforms import current_platform
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from ....utils import large_gpu_test from ....utils import large_gpu_test
from ..utils import check_embeddings_close from ...utils import check_embeddings_close
# Llava Next embedding implementation is only supported by CUDA. # Llava Next embedding implementation is only supported by CUDA.
# If run on ROCm, hf_model.model.resize_token_embeddings will # If run on ROCm, hf_model.model.resize_token_embeddings will
......
...@@ -9,7 +9,7 @@ from vllm.assets.image import VLM_IMAGES_DIR ...@@ -9,7 +9,7 @@ from vllm.assets.image import VLM_IMAGES_DIR
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from ....utils import large_gpu_test from ....utils import large_gpu_test
from ..utils import check_embeddings_close from ...utils import check_embeddings_close
HF_TEXT_PROMPTS = [ HF_TEXT_PROMPTS = [
# T -> X # T -> X
......
...@@ -146,7 +146,8 @@ def _test_processing_correctness_hf( ...@@ -146,7 +146,8 @@ def _test_processing_correctness_hf(
batch_idx: int, batch_idx: int,
ignore_mm_keys: Optional[set[str]] = None, ignore_mm_keys: Optional[set[str]] = None,
): ):
if model_config.hf_config.model_type in ("mllama", "whisper", "ultravox"): if model_config.hf_config.model_type in ("mllama", "ovis", "ultravox",
"whisper"):
# For some multimodal models, tokenizer will always add bos_token # For some multimodal models, tokenizer will always add bos_token
# at the beginning of prompt by default, causing hf_processor to output # at the beginning of prompt by default, causing hf_processor to output
# incorrect token ids. So we need to use `add_special_tokens=False` here # incorrect token ids. So we need to use `add_special_tokens=False` here
...@@ -270,9 +271,13 @@ def _test_processing_correctness_mistral( ...@@ -270,9 +271,13 @@ def _test_processing_correctness_mistral(
"openbmb/MiniCPM-Llama3-V-2_5", "openbmb/MiniCPM-Llama3-V-2_5",
"openbmb/MiniCPM-o-2_6", "openbmb/MiniCPM-o-2_6",
"openbmb/MiniCPM-V-2_6", "openbmb/MiniCPM-V-2_6",
"MiniMaxAI/MiniMax-VL-01",
"allenai/Molmo-7B-D-0924", "allenai/Molmo-7B-D-0924",
"allenai/Molmo-7B-O-0924", "allenai/Molmo-7B-O-0924",
"nvidia/NVLM-D-72B", "nvidia/NVLM-D-72B",
"AIDC-AI/Ovis1.6-Gemma2-9B",
"AIDC-AI/Ovis1.6-Llama3.2-3B",
"AIDC-AI/Ovis2-1B",
"google/paligemma-3b-mix-224", "google/paligemma-3b-mix-224",
"google/paligemma2-3b-ft-docci-448", "google/paligemma2-3b-ft-docci-448",
"microsoft/Phi-4-multimodal-instruct", "microsoft/Phi-4-multimodal-instruct",
...@@ -282,7 +287,7 @@ def _test_processing_correctness_mistral( ...@@ -282,7 +287,7 @@ def _test_processing_correctness_mistral(
"Qwen/Qwen2-VL-2B-Instruct", "Qwen/Qwen2-VL-2B-Instruct",
"Qwen/Qwen2.5-VL-3B-Instruct", "Qwen/Qwen2.5-VL-3B-Instruct",
"Qwen/Qwen2-Audio-7B-Instruct", "Qwen/Qwen2-Audio-7B-Instruct",
"Qwen/Qwen2.5-Omni-7B", "Qwen/Qwen2.5-Omni-3B",
"Skywork/Skywork-R1V-38B", "Skywork/Skywork-R1V-38B",
"fixie-ai/ultravox-v0_5-llama-3_2-1b", "fixie-ai/ultravox-v0_5-llama-3_2-1b",
"openai/whisper-large-v3", "openai/whisper-large-v3",
......
...@@ -11,7 +11,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY ...@@ -11,7 +11,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.image import rescale_image_size from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.processing import BaseMultiModalProcessor from vllm.multimodal.processing import BaseMultiModalProcessor
from ....conftest import _ImageAssets from ....conftest import ImageTestAssets
from ...utils import build_model_context from ...utils import build_model_context
...@@ -137,7 +137,7 @@ def _run_check( ...@@ -137,7 +137,7 @@ def _run_check(
@pytest.mark.parametrize("kwargs_on_init", [True, False]) @pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override( def test_processor_override(
model_id: str, model_id: str,
image_assets: _ImageAssets, image_assets: ImageTestAssets,
size_factors: list[int], size_factors: list[int],
min_dynamic_patch: int, min_dynamic_patch: int,
max_dynamic_patch: int, max_dynamic_patch: int,
......
...@@ -5,7 +5,7 @@ from transformers import Idefics3Config ...@@ -5,7 +5,7 @@ from transformers import Idefics3Config
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from ....conftest import _ImageAssets from ....conftest import ImageTestAssets
from ...utils import build_model_context from ...utils import build_model_context
...@@ -21,7 +21,7 @@ from ...utils import build_model_context ...@@ -21,7 +21,7 @@ from ...utils import build_model_context
@pytest.mark.parametrize("num_imgs", [1, 2]) @pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False]) @pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override( def test_processor_override(
image_assets: _ImageAssets, image_assets: ImageTestAssets,
model_id: str, model_id: str,
mm_processor_kwargs: dict[str, object], mm_processor_kwargs: dict[str, object],
expected_toks_per_img: int, expected_toks_per_img: int,
......
...@@ -11,7 +11,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY ...@@ -11,7 +11,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.image import rescale_image_size from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.processing import BaseMultiModalProcessor from vllm.multimodal.processing import BaseMultiModalProcessor
from ....conftest import _ImageAssets from ....conftest import ImageTestAssets
from ...utils import build_model_context from ...utils import build_model_context
...@@ -94,7 +94,7 @@ def _run_check( ...@@ -94,7 +94,7 @@ def _run_check(
@pytest.mark.parametrize("kwargs_on_init", [True, False]) @pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override( def test_processor_override(
model_id: str, model_id: str,
image_assets: _ImageAssets, image_assets: ImageTestAssets,
size_factors: list[int], size_factors: list[int],
min_dynamic_patch: int, min_dynamic_patch: int,
max_dynamic_patch: int, max_dynamic_patch: int,
......
...@@ -6,7 +6,7 @@ import pytest ...@@ -6,7 +6,7 @@ import pytest
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.transformers_utils.tokenizer import encode_tokens from vllm.transformers_utils.tokenizer import encode_tokens
from ....conftest import _ImageAssets from ....conftest import ImageTestAssets
from ...utils import build_model_context from ...utils import build_model_context
...@@ -17,7 +17,7 @@ from ...utils import build_model_context ...@@ -17,7 +17,7 @@ from ...utils import build_model_context
@pytest.mark.parametrize("disable_mm_preprocessor_cache", [True, False]) @pytest.mark.parametrize("disable_mm_preprocessor_cache", [True, False])
@pytest.mark.parametrize("tokenized_prompt", [True, False]) @pytest.mark.parametrize("tokenized_prompt", [True, False])
def test_processor_override( def test_processor_override(
image_assets: _ImageAssets, image_assets: ImageTestAssets,
model_id: str, model_id: str,
mm_processor_kwargs: dict, mm_processor_kwargs: dict,
num_imgs: int, num_imgs: int,
......
# SPDX-License-Identifier: Apache-2.0
import pytest
from PIL import Image
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.parse import ImageSize
from vllm.multimodal.processing import BaseMultiModalProcessor
from ....conftest import ImageTestAssets
from ...utils import build_model_context
@pytest.mark.parametrize("model_id", ["MiniMaxAI/MiniMax-VL-01"])
@pytest.mark.parametrize("num_imgs", [1, 2])
def test_processor_override(
    image_assets: ImageTestAssets,
    model_id: str,
    num_imgs: int,
):
    """Check that the processor reports one image placeholder per image.

    A prompt made of ``num_imgs`` ``<image>`` tags is processed together with
    the same number of blank 364x364 RGB images, and the number of image
    placeholders in the result must equal ``num_imgs``. The ``image_assets``
    fixture is requested but not read by the test body.
    """
    model_ctx = build_model_context(
        model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"image": num_imgs},
    )
    mm_processor = MULTIMODAL_REGISTRY.create_processor(
        model_ctx.model_config)

    text = "<image>" * num_imgs
    blank_image = Image.new("RGB", size=(364, 364))
    outputs = mm_processor.apply(
        text,
        {"image": [blank_image] * num_imgs},
        {},
    )

    placeholders = outputs["mm_placeholders"]["image"]
    assert len(placeholders) == num_imgs
def _validate_image_prompt_replacements_one(
    processor: BaseMultiModalProcessor,
    num_imgs: int,
    failed_size_excs: list[tuple[ImageSize, Exception]],
    image_size: ImageSize,
) -> None:
    """Process ``num_imgs`` blank images of one size and record any failure.

    The processor is applied to a prompt of ``num_imgs`` ``<image>`` tags and
    the same number of blank RGB images of ``image_size``. If processing
    raises, or the number of image placeholders differs from ``num_imgs``,
    the ``(image_size, exception)`` pair is appended to ``failed_size_excs``
    instead of propagating.
    """
    text = "<image>" * num_imgs
    blank_image = Image.new("RGB", size=image_size)
    payload = {"image": [blank_image] * num_imgs}

    try:
        outputs = processor.apply(text, payload, {})
        placeholders = outputs["mm_placeholders"]["image"]
        # Kept inside the try so a count mismatch is collected like any
        # other failure.
        assert len(placeholders) == num_imgs
    except Exception as err:
        # Collect rather than raise, so the caller can report every
        # failing size in one go.
        failed_size_excs.append((image_size, err))
def _test_image_prompt_replacements(
    processor,
    *,
    num_imgs: int,
    image_sizes: list[ImageSize],
) -> None:
    """Validate prompt replacement for every size and report all failures.

    Each entry of ``image_sizes`` is checked with
    ``_validate_image_prompt_replacements_one``; failures are collected
    rather than raised immediately so that a single run lists every
    failing size.

    Args:
        processor: The multimodal processor under test.
        num_imgs: Number of images (and ``<image>`` tags) per prompt.
        image_sizes: Image sizes to check.

    Raises:
        AssertionError: If at least one size failed. The message lists each
            failing size followed by its exception text.
    """
    failed_size_excs: list[tuple[ImageSize, Exception]] = []

    for size in image_sizes:
        _validate_image_prompt_replacements_one(processor, num_imgs,
                                                failed_size_excs, size)

    if failed_size_excs:
        # The header gets its own line; previously the first failing entry
        # was glued directly onto "Found failing image sizes:".
        report = "\n========\n".join(f"[{size}]\n{exc}"
                                     for size, exc in failed_size_excs)
        raise AssertionError(f"Found failing image sizes:\n{report}")
@pytest.mark.parametrize("model_id", ["MiniMaxAI/MiniMax-VL-01"])
@pytest.mark.parametrize("num_imgs", [1, 2])
def test_processor_prompt_replacements_regression(model_id, num_imgs):
    """Run the prompt-replacement check over a fixed list of image sizes.

    Every ``(w, h)`` pair below is checked in both orientations, i.e. as
    ``ImageSize(w, h)`` and as ``ImageSize(h, w)``.
    """
    model_ctx = build_model_context(
        model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"image": num_imgs},
    )
    mm_processor = MULTIMODAL_REGISTRY.create_processor(
        model_ctx.model_config)

    ratio_pairs = [(171, 152), (184, 161), (198, 176), (333, 296),
                   (369, 328), (488, 183), (2560, 1669)]

    # Each pair contributes its original and its swapped orientation,
    # in that order.
    sizes_to_check = []
    for w, h in ratio_pairs:
        sizes_to_check.append(ImageSize(w, h))
        sizes_to_check.append(ImageSize(h, w))

    _test_image_prompt_replacements(
        mm_processor,
        num_imgs=num_imgs,
        image_sizes=sizes_to_check,
    )
...@@ -4,7 +4,7 @@ import pytest ...@@ -4,7 +4,7 @@ import pytest
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from ....conftest import _ImageAssets from ....conftest import ImageTestAssets
from ...utils import build_model_context from ...utils import build_model_context
...@@ -22,7 +22,7 @@ from ...utils import build_model_context ...@@ -22,7 +22,7 @@ from ...utils import build_model_context
@pytest.mark.parametrize("num_imgs", [1, 2]) @pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False]) @pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override( def test_processor_override(
image_assets: _ImageAssets, image_assets: ImageTestAssets,
model_id: str, model_id: str,
mm_processor_kwargs: dict[str, int], mm_processor_kwargs: dict[str, int],
expected_toks_per_img: int, expected_toks_per_img: int,
......
...@@ -4,7 +4,7 @@ import pytest ...@@ -4,7 +4,7 @@ import pytest
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from ....conftest import _ImageAssets from ....conftest import ImageTestAssets
from ...utils import build_model_context from ...utils import build_model_context
...@@ -22,7 +22,7 @@ from ...utils import build_model_context ...@@ -22,7 +22,7 @@ from ...utils import build_model_context
@pytest.mark.parametrize("num_imgs", [1, 2]) @pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False]) @pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override( def test_processor_override(
image_assets: _ImageAssets, image_assets: ImageTestAssets,
model_id: str, model_id: str,
mm_processor_kwargs: dict[str, int], mm_processor_kwargs: dict[str, int],
expected_toks_per_img: int, expected_toks_per_img: int,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment