Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
7a985548
Commit
7a985548
authored
May 22, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.9.0' into v0.9.0-ori
parents
45d3785c
dc1440cf
Changes
486
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
425 additions
and
143 deletions
+425
-143
tests/models/multimodal/generation/vlm_utils/builders.py
tests/models/multimodal/generation/vlm_utils/builders.py
+98
-39
tests/models/multimodal/generation/vlm_utils/case_filtering.py
.../models/multimodal/generation/vlm_utils/case_filtering.py
+4
-4
tests/models/multimodal/generation/vlm_utils/core.py
tests/models/multimodal/generation/vlm_utils/core.py
+19
-12
tests/models/multimodal/generation/vlm_utils/custom_inputs.py
...s/models/multimodal/generation/vlm_utils/custom_inputs.py
+42
-34
tests/models/multimodal/generation/vlm_utils/model_utils.py
tests/models/multimodal/generation/vlm_utils/model_utils.py
+78
-4
tests/models/multimodal/generation/vlm_utils/runners.py
tests/models/multimodal/generation/vlm_utils/runners.py
+31
-13
tests/models/multimodal/generation/vlm_utils/types.py
tests/models/multimodal/generation/vlm_utils/types.py
+22
-8
tests/models/multimodal/pooling/__init__.py
tests/models/multimodal/pooling/__init__.py
+0
-0
tests/models/multimodal/pooling/test_dse_qwen2_vl.py
tests/models/multimodal/pooling/test_dse_qwen2_vl.py
+1
-1
tests/models/multimodal/pooling/test_intern_vit.py
tests/models/multimodal/pooling/test_intern_vit.py
+11
-12
tests/models/multimodal/pooling/test_llava_next.py
tests/models/multimodal/pooling/test_llava_next.py
+1
-1
tests/models/multimodal/pooling/test_phi3v.py
tests/models/multimodal/pooling/test_phi3v.py
+1
-1
tests/models/multimodal/processing/test_common.py
tests/models/multimodal/processing/test_common.py
+7
-2
tests/models/multimodal/processing/test_h2ovl.py
tests/models/multimodal/processing/test_h2ovl.py
+2
-2
tests/models/multimodal/processing/test_idefics3.py
tests/models/multimodal/processing/test_idefics3.py
+2
-2
tests/models/multimodal/processing/test_internvl.py
tests/models/multimodal/processing/test_internvl.py
+2
-2
tests/models/multimodal/processing/test_llama4.py
tests/models/multimodal/processing/test_llama4.py
+2
-2
tests/models/multimodal/processing/test_minimax_vl_01.py
tests/models/multimodal/processing/test_minimax_vl_01.py
+98
-0
tests/models/multimodal/processing/test_phi3v.py
tests/models/multimodal/processing/test_phi3v.py
+2
-2
tests/models/multimodal/processing/test_phi4mm.py
tests/models/multimodal/processing/test_phi4mm.py
+2
-2
No files found.
Too many changes to show.
To preserve performance only
486 of 486+
files are displayed.
Plain diff
Email patch
tests/models/
decoder_only/vision_language
/vlm_utils/builders.py
→
tests/models/
multimodal/generation
/vlm_utils/builders.py
View file @
7a985548
...
@@ -7,18 +7,21 @@ from typing import Callable, Optional, Union
...
@@ -7,18 +7,21 @@ from typing import Callable, Optional, Union
import
torch
import
torch
from
vllm.multimodal.audio
import
AudioResampler
from
vllm.multimodal.image
import
rescale_image_size
from
vllm.multimodal.image
import
rescale_image_size
from
vllm.multimodal.video
import
(
rescale_video_size
,
resize_video
,
from
vllm.multimodal.video
import
(
rescale_video_size
,
resize_video
,
sample_frames_from_video
)
sample_frames_from_video
)
from
.....conftest
import
_ImageAssets
,
_VideoAssets
from
.....conftest
import
AudioTestAssets
,
ImageTestAssets
,
VideoTestAssets
from
.types
import
(
SINGLE_IMAGE_BASE_PROMPTS
,
TEST_IMG_PLACEHOLDER
,
from
.types
import
(
SINGLE_AUDIO_BASE_PROMPT
,
SINGLE_IMAGE_BASE_PROMPTS
,
TEST_AUDIO_PLACEHOLDER
,
TEST_IMG_PLACEHOLDER
,
TEST_VIDEO_PLACEHOLDER
,
VIDEO_BASE_PROMPT
,
TEST_VIDEO_PLACEHOLDER
,
VIDEO_BASE_PROMPT
,
ImageSizeWrapper
,
SizeType
,
VLMTestInfo
)
ImageSizeWrapper
,
PromptWithMultiModalInput
,
SizeType
,
VLMTestInfo
)
def
replace_test_placeholder
(
prompt
:
str
,
img
_idx_to_prompt
:
Callable
[[
int
],
def
replace_test_placeholder
(
prompt
:
str
,
mm
_idx_to_prompt
:
Callable
[[
int
],
str
],
str
],
test_placeholder
:
str
)
->
str
:
test_placeholder
:
str
)
->
str
:
"""Given a prompt, replaces each test placeholder with the
"""Given a prompt, replaces each test placeholder with the
model-specific tag.
model-specific tag.
...
@@ -26,7 +29,7 @@ def replace_test_placeholder(prompt: str, img_idx_to_prompt: Callable[[int],
...
@@ -26,7 +29,7 @@ def replace_test_placeholder(prompt: str, img_idx_to_prompt: Callable[[int],
prompt_segments
=
prompt
.
split
(
test_placeholder
)
prompt_segments
=
prompt
.
split
(
test_placeholder
)
img_prompt
=
prompt_segments
[
0
]
img_prompt
=
prompt_segments
[
0
]
for
placeholder_idx
,
next_seg
in
enumerate
(
prompt_segments
[
1
:],
start
=
1
):
for
placeholder_idx
,
next_seg
in
enumerate
(
prompt_segments
[
1
:],
start
=
1
):
img_prompt
+=
img
_idx_to_prompt
(
placeholder_idx
)
img_prompt
+=
mm
_idx_to_prompt
(
placeholder_idx
)
img_prompt
+=
next_seg
img_prompt
+=
next_seg
return
img_prompt
return
img_prompt
...
@@ -34,6 +37,7 @@ def replace_test_placeholder(prompt: str, img_idx_to_prompt: Callable[[int],
...
@@ -34,6 +37,7 @@ def replace_test_placeholder(prompt: str, img_idx_to_prompt: Callable[[int],
def
get_model_prompts
(
base_prompts
:
Iterable
[
str
],
def
get_model_prompts
(
base_prompts
:
Iterable
[
str
],
img_idx_to_prompt
:
Optional
[
Callable
[[
int
],
str
]],
img_idx_to_prompt
:
Optional
[
Callable
[[
int
],
str
]],
video_idx_to_prompt
:
Optional
[
Callable
[[
int
],
str
]],
video_idx_to_prompt
:
Optional
[
Callable
[[
int
],
str
]],
audio_idx_to_prompt
:
Optional
[
Callable
[[
int
],
str
]],
prompt_formatter
:
Callable
[[
str
],
str
])
->
list
[
str
]:
prompt_formatter
:
Callable
[[
str
],
str
])
->
list
[
str
]:
"""Given a model-agnostic base prompt and test configuration for a model(s)
"""Given a model-agnostic base prompt and test configuration for a model(s)
to be tested, update the media placeholders and apply the prompt formatting
to be tested, update the media placeholders and apply the prompt formatting
...
@@ -60,6 +64,11 @@ def get_model_prompts(base_prompts: Iterable[str],
...
@@ -60,6 +64,11 @@ def get_model_prompts(base_prompts: Iterable[str],
video_idx_to_prompt
,
video_idx_to_prompt
,
TEST_VIDEO_PLACEHOLDER
)
TEST_VIDEO_PLACEHOLDER
)
if
audio_idx_to_prompt
:
base_prompt
=
replace_test_placeholder
(
base_prompt
,
audio_idx_to_prompt
,
TEST_AUDIO_PLACEHOLDER
)
# Apply the prompt formatter to wrap the base prompt with
# Apply the prompt formatter to wrap the base prompt with
# the correct media placeholders to get the model test prompt
# the correct media placeholders to get the model test prompt
model_prompt
=
prompt_formatter
(
base_prompt
)
model_prompt
=
prompt_formatter
(
base_prompt
)
...
@@ -68,10 +77,11 @@ def get_model_prompts(base_prompts: Iterable[str],
...
@@ -68,10 +77,11 @@ def get_model_prompts(base_prompts: Iterable[str],
def
build_single_image_inputs_from_test_info
(
def
build_single_image_inputs_from_test_info
(
test_info
:
VLMTestInfo
,
test_info
:
VLMTestInfo
,
image_assets
:
_ImageAssets
,
image_assets
:
ImageTestAssets
,
size_wrapper
:
ImageSizeWrapper
,
size_wrapper
:
ImageSizeWrapper
,
tmp_path
:
Optional
[
PosixPath
]
=
None
):
tmp_path
:
Optional
[
PosixPath
]
=
None
,
)
->
list
[
PromptWithMultiModalInput
]:
if
test_info
.
prompt_formatter
is
None
:
if
test_info
.
prompt_formatter
is
None
:
raise
ValueError
(
raise
ValueError
(
"Prompt formatter must be set to build single image inputs"
)
"Prompt formatter must be set to build single image inputs"
)
...
@@ -79,6 +89,7 @@ def build_single_image_inputs_from_test_info(
...
@@ -79,6 +89,7 @@ def build_single_image_inputs_from_test_info(
model_prompts
=
get_model_prompts
(
test_info
.
single_image_prompts
,
model_prompts
=
get_model_prompts
(
test_info
.
single_image_prompts
,
test_info
.
img_idx_to_prompt
,
test_info
.
img_idx_to_prompt
,
test_info
.
video_idx_to_prompt
,
test_info
.
video_idx_to_prompt
,
test_info
.
audio_idx_to_prompt
,
test_info
.
prompt_formatter
)
test_info
.
prompt_formatter
)
# For models that require a local path / URL encoded in the image; export
# For models that require a local path / URL encoded in the image; export
...
@@ -97,28 +108,32 @@ def build_single_image_inputs_from_test_info(
...
@@ -97,28 +108,32 @@ def build_single_image_inputs_from_test_info(
return
build_single_image_inputs
(
images
,
model_prompts
,
size_wrapper
)
return
build_single_image_inputs
(
images
,
model_prompts
,
size_wrapper
)
def
build_single_image_inputs
(
images
,
model_prompts
,
def
build_single_image_inputs
(
size_wrapper
:
ImageSizeWrapper
):
images
,
model_prompts
,
size_wrapper
:
ImageSizeWrapper
)
->
list
[
PromptWithMultiModalInput
]:
# For every image / prompt pair, get a pair containing two lists of
# For every image / prompt pair, get a pair containing two lists of
# length size_factors, where the first contains duplicates of the model
# length size_factors, where the first contains duplicates of the model
# prompt [str], and the second contains copies of the image after being
# prompt [str], and the second contains copies of the image after being
# scaled by one of the size factors.
# scaled by one of the size factors.
#
#
# NOTE: rescaling preserves the image aspect ratio.
# NOTE: rescaling preserves the image aspect ratio.
return
[(
return
[
[
prompt
for
_
in
size_wrapper
.
data
],
PromptWithMultiModalInput
(
[
prompts
=
[
prompt
for
_
in
size_wrapper
.
data
],
apply_image_size_scaling
(
image
,
size
,
size_wrapper
.
type
)
image_data
=
[
for
size
in
size_wrapper
.
data
apply_image_size_scaling
(
image
,
size
,
size_wrapper
.
type
)
],
for
size
in
size_wrapper
.
data
)
for
image
,
prompt
in
zip
(
images
,
model_prompts
)]
],
)
for
image
,
prompt
in
zip
(
images
,
model_prompts
)
]
def
build_multi_image_inputs_from_test_info
(
def
build_multi_image_inputs_from_test_info
(
test_info
:
VLMTestInfo
,
test_info
:
VLMTestInfo
,
image_assets
:
_ImageAssets
,
image_assets
:
ImageTestAssets
,
size_wrapper
:
ImageSizeWrapper
,
size_wrapper
:
ImageSizeWrapper
,
tmp_path
:
Optional
[
PosixPath
]
=
None
):
tmp_path
:
Optional
[
PosixPath
]
=
None
,
)
->
list
[
PromptWithMultiModalInput
]:
if
test_info
.
prompt_formatter
is
None
:
if
test_info
.
prompt_formatter
is
None
:
raise
ValueError
(
raise
ValueError
(
"Prompt formatter must be set to build multi image inputs"
)
"Prompt formatter must be set to build multi image inputs"
)
...
@@ -126,6 +141,7 @@ def build_multi_image_inputs_from_test_info(
...
@@ -126,6 +141,7 @@ def build_multi_image_inputs_from_test_info(
model_prompts
=
get_model_prompts
([
test_info
.
multi_image_prompt
],
model_prompts
=
get_model_prompts
([
test_info
.
multi_image_prompt
],
test_info
.
img_idx_to_prompt
,
test_info
.
img_idx_to_prompt
,
test_info
.
video_idx_to_prompt
,
test_info
.
video_idx_to_prompt
,
test_info
.
audio_idx_to_prompt
,
test_info
.
prompt_formatter
)
test_info
.
prompt_formatter
)
if
test_info
.
prompt_path_encoder
is
not
None
:
if
test_info
.
prompt_path_encoder
is
not
None
:
...
@@ -146,20 +162,23 @@ def build_multi_image_inputs_from_test_info(
...
@@ -146,20 +162,23 @@ def build_multi_image_inputs_from_test_info(
)
)
def
build_multi_image_inputs
(
image_lists
,
model_prompts
,
def
build_multi_image_inputs
(
size_wrapper
:
ImageSizeWrapper
):
image_lists
,
model_prompts
,
return
[(
size_wrapper
:
ImageSizeWrapper
)
->
list
[
PromptWithMultiModalInput
]:
[
prompt
for
_
in
size_wrapper
.
data
],
return
[
[[
PromptWithMultiModalInput
(
apply_image_size_scaling
(
image
,
size
,
size_wrapper
.
type
)
prompts
=
[
prompt
for
_
in
size_wrapper
.
data
],
for
image
in
images
image_data
=
[[
]
for
size
in
size_wrapper
.
data
],
apply_image_size_scaling
(
image
,
size
,
size_wrapper
.
type
)
)
for
images
,
prompt
in
zip
(
image_lists
,
model_prompts
)]
for
image
in
images
]
for
size
in
size_wrapper
.
data
],
)
for
images
,
prompt
in
zip
(
image_lists
,
model_prompts
)
]
def
build_embedding_inputs_from_test_info
(
def
build_embedding_inputs_from_test_info
(
test_info
:
VLMTestInfo
,
test_info
:
VLMTestInfo
,
image_assets
:
_
ImageAssets
,
image_assets
:
Image
Test
Assets
,
size_wrapper
:
ImageSizeWrapper
,
size_wrapper
:
ImageSizeWrapper
,
):
):
# These conditions will always be true if invoked through filtering,
# These conditions will always be true if invoked through filtering,
...
@@ -177,6 +196,7 @@ def build_embedding_inputs_from_test_info(
...
@@ -177,6 +196,7 @@ def build_embedding_inputs_from_test_info(
SINGLE_IMAGE_BASE_PROMPTS
,
SINGLE_IMAGE_BASE_PROMPTS
,
test_info
.
img_idx_to_prompt
,
test_info
.
img_idx_to_prompt
,
test_info
.
video_idx_to_prompt
,
test_info
.
video_idx_to_prompt
,
test_info
.
audio_idx_to_prompt
,
test_info
.
prompt_formatter
,
test_info
.
prompt_formatter
,
)
)
...
@@ -192,16 +212,17 @@ def build_embedding_inputs_from_test_info(
...
@@ -192,16 +212,17 @@ def build_embedding_inputs_from_test_info(
def
build_video_inputs_from_test_info
(
def
build_video_inputs_from_test_info
(
test_info
:
VLMTestInfo
,
test_info
:
VLMTestInfo
,
video_assets
:
_
VideoAssets
,
video_assets
:
Video
Test
Assets
,
size_wrapper
:
ImageSizeWrapper
,
size_wrapper
:
ImageSizeWrapper
,
num_frames
:
int
,
num_frames
:
int
,
):
)
->
list
[
PromptWithMultiModalInput
]
:
if
test_info
.
prompt_formatter
is
None
:
if
test_info
.
prompt_formatter
is
None
:
raise
ValueError
(
"Prompt formatter must be set to build video inputs"
)
raise
ValueError
(
"Prompt formatter must be set to build video inputs"
)
model_prompts
=
get_model_prompts
(
model_prompts
=
get_model_prompts
(
[
VIDEO_BASE_PROMPT
],
[
VIDEO_BASE_PROMPT
],
test_info
.
img_idx_to_prompt
,
test_info
.
img_idx_to_prompt
,
test_info
.
video_idx_to_prompt
,
test_info
.
video_idx_to_prompt
,
test_info
.
audio_idx_to_prompt
,
test_info
.
prompt_formatter
,
test_info
.
prompt_formatter
,
)
)
...
@@ -213,10 +234,14 @@ def build_video_inputs_from_test_info(
...
@@ -213,10 +234,14 @@ def build_video_inputs_from_test_info(
video_scaler
=
(
resize_video
if
size_wrapper
.
type
==
SizeType
.
FIXED_SIZE
video_scaler
=
(
resize_video
if
size_wrapper
.
type
==
SizeType
.
FIXED_SIZE
else
rescale_video_size
)
else
rescale_video_size
)
return
[(
return
[
[
prompt
for
_
in
size_wrapper
.
data
],
PromptWithMultiModalInput
(
[
video_scaler
(
video
,
size
)
for
size
in
size_wrapper
.
data
],
prompts
=
[
prompt
for
_
in
size_wrapper
.
data
],
)
for
video
,
prompt
in
zip
(
sampled_vids
,
model_prompts
)]
video_data
=
[
video_scaler
(
video
,
size
)
for
size
in
size_wrapper
.
data
],
)
for
video
,
prompt
in
zip
(
sampled_vids
,
model_prompts
)
]
def
apply_image_size_scaling
(
image
,
size
:
Union
[
float
,
tuple
[
int
,
int
]],
def
apply_image_size_scaling
(
image
,
size
:
Union
[
float
,
tuple
[
int
,
int
]],
...
@@ -236,3 +261,37 @@ def apply_image_size_scaling(image, size: Union[float, tuple[int, int]],
...
@@ -236,3 +261,37 @@ def apply_image_size_scaling(image, size: Union[float, tuple[int, int]],
# We have a list of fixed sizes
# We have a list of fixed sizes
return
image
.
resize
(
size
)
return
image
.
resize
(
size
)
raise
ValueError
(
"ImageSizeWrapper type must be FIXED_SIZE or SIZE_FACTOR"
)
raise
ValueError
(
"ImageSizeWrapper type must be FIXED_SIZE or SIZE_FACTOR"
)
def
build_audio_inputs_from_test_info
(
test_info
:
VLMTestInfo
,
audio_assets
:
AudioTestAssets
,
)
->
list
[
PromptWithMultiModalInput
]:
if
test_info
.
prompt_formatter
is
None
:
raise
ValueError
(
"Prompt formatter must be set to build audio inputs"
)
model_prompts
=
get_model_prompts
(
SINGLE_AUDIO_BASE_PROMPT
,
test_info
.
img_idx_to_prompt
,
test_info
.
video_idx_to_prompt
,
test_info
.
audio_idx_to_prompt
,
test_info
.
prompt_formatter
,
)
resampler
=
AudioResampler
(
target_sr
=
16000
,
method
=
"librosa"
,
)
audios
=
[
asset
.
audio_and_sample_rate
for
asset
in
audio_assets
]
resampled_audios
=
[(
resampler
.
resample
(
audio
,
orig_sr
=
sr
,
),
int
(
resampler
.
target_sr
),
)
for
audio
,
sr
in
audios
]
return
[
PromptWithMultiModalInput
(
prompts
=
model_prompts
,
audio_data
=
resampled_audios
,
)
]
tests/models/
decoder_only/vision_language
/vlm_utils/case_filtering.py
→
tests/models/
multimodal/generation
/vlm_utils/case_filtering.py
View file @
7a985548
...
@@ -83,7 +83,7 @@ def get_parametrized_options(test_settings: dict[str, VLMTestInfo],
...
@@ -83,7 +83,7 @@ def get_parametrized_options(test_settings: dict[str, VLMTestInfo],
test_info
.
num_video_frames
)
test_info
.
num_video_frames
)
# No sizes passed for custom inputs, since inputs are directly provided
# No sizes passed for custom inputs, since inputs are directly provided
if
test_type
!=
VLMTestType
.
CUSTOM_INPUTS
:
if
test_type
not
in
(
VLMTestType
.
CUSTOM_INPUTS
,
VLMTestType
.
AUDIO
)
:
wrapped_sizes
=
get_wrapped_test_sizes
(
test_info
,
test_type
)
wrapped_sizes
=
get_wrapped_test_sizes
(
test_info
,
test_type
)
if
wrapped_sizes
is
None
:
if
wrapped_sizes
is
None
:
raise
ValueError
(
raise
ValueError
(
...
@@ -91,7 +91,7 @@ def get_parametrized_options(test_settings: dict[str, VLMTestInfo],
...
@@ -91,7 +91,7 @@ def get_parametrized_options(test_settings: dict[str, VLMTestInfo],
iter_kwargs
[
"size_wrapper"
]
=
wrapped_sizes
iter_kwargs
[
"size_wrapper"
]
=
wrapped_sizes
#Otherwise expand the custom test options instead
#Otherwise expand the custom test options instead
el
se
:
el
if
test_type
==
VLMTestType
.
CUSTOM_INPUTS
:
if
test_info
.
custom_test_opts
is
None
:
if
test_info
.
custom_test_opts
is
None
:
raise
ValueError
(
"Test has type CUSTOM_INPUTS, but none given"
)
raise
ValueError
(
"Test has type CUSTOM_INPUTS, but none given"
)
iter_kwargs
[
"custom_test_opts"
]
=
test_info
.
custom_test_opts
iter_kwargs
[
"custom_test_opts"
]
=
test_info
.
custom_test_opts
...
@@ -136,8 +136,8 @@ def get_wrapped_test_sizes(
...
@@ -136,8 +136,8 @@ def get_wrapped_test_sizes(
ImageSizeWrapper
(
type
=
SizeType
.
SIZE_FACTOR
,
data
=
factor
)
ImageSizeWrapper
(
type
=
SizeType
.
SIZE_FACTOR
,
data
=
factor
)
for
factor
in
EMBEDDING_SIZE_FACTORS
for
factor
in
EMBEDDING_SIZE_FACTORS
])
])
# Custom inputs have preprocessed inputs
#
Audio and
Custom inputs have preprocessed inputs
elif
test_type
==
VLMTestType
.
CUSTOM_INPUTS
:
elif
test_type
in
(
VLMTestType
.
AUDIO
,
VLMTestType
.
CUSTOM_INPUTS
)
:
return
tuple
()
return
tuple
()
size_factors
=
test_info
.
image_size_factors
\
size_factors
=
test_info
.
image_size_factors
\
...
...
tests/models/
decoder_only/vision_language
/vlm_utils/core.py
→
tests/models/
multimodal/generation
/vlm_utils/core.py
View file @
7a985548
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
"""Core test implementation to be shared across modalities."""
"""Core test implementation to be shared across modalities."""
from
typing
import
Any
,
Callable
,
Optional
,
Union
from
typing
import
Any
,
Callable
,
Optional
import
torch
import
torch
from
PIL.Image
import
Image
from
transformers.models.auto.auto_factory
import
_BaseAutoModelClass
from
transformers.models.auto.auto_factory
import
_BaseAutoModelClass
from
vllm.config
import
TaskOption
from
vllm.config
import
TaskOption
...
@@ -11,14 +10,14 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer
...
@@ -11,14 +10,14 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer
from
.....conftest
import
HfRunner
,
VllmRunner
from
.....conftest
import
HfRunner
,
VllmRunner
from
....registry
import
HF_EXAMPLE_MODELS
from
....registry
import
HF_EXAMPLE_MODELS
from
.types
import
RunnerOutput
from
.types
import
PromptWithMultiModalInput
,
RunnerOutput
def
run_test
(
def
run_test
(
*
,
*
,
hf_runner
:
type
[
HfRunner
],
hf_runner
:
type
[
HfRunner
],
vllm_runner
:
type
[
VllmRunner
],
vllm_runner
:
type
[
VllmRunner
],
inputs
:
list
[
tuple
[
list
[
str
],
list
[
Union
[
list
[
Image
],
Image
]]]
],
inputs
:
list
[
PromptWithMultiModalInput
],
model
:
str
,
model
:
str
,
dtype
:
str
,
dtype
:
str
,
max_tokens
:
int
,
max_tokens
:
int
,
...
@@ -38,7 +37,6 @@ def run_test(
...
@@ -38,7 +37,6 @@ def run_test(
hf_model_kwargs
:
Optional
[
dict
[
str
,
Any
]],
hf_model_kwargs
:
Optional
[
dict
[
str
,
Any
]],
patch_hf_runner
:
Optional
[
Callable
[[
HfRunner
],
HfRunner
]],
patch_hf_runner
:
Optional
[
Callable
[[
HfRunner
],
HfRunner
]],
task
:
TaskOption
=
"auto"
,
task
:
TaskOption
=
"auto"
,
runner_mm_key
:
str
=
"images"
,
distributed_executor_backend
:
Optional
[
str
]
=
None
,
distributed_executor_backend
:
Optional
[
str
]
=
None
,
tensor_parallel_size
:
int
=
1
,
tensor_parallel_size
:
int
=
1
,
vllm_embeddings
:
Optional
[
torch
.
Tensor
]
=
None
,
vllm_embeddings
:
Optional
[
torch
.
Tensor
]
=
None
,
...
@@ -67,7 +65,7 @@ def run_test(
...
@@ -67,7 +65,7 @@ def run_test(
"disable_mm_preprocessor_cache"
:
True
,
"disable_mm_preprocessor_cache"
:
True
,
}
}
if
model_info
.
tokenizer
:
if
model_info
.
tokenizer
:
vllm_runner_kwargs_
[
"tokenizer"
]
=
model_info
.
tokenizer
vllm_runner_kwargs_
[
"tokenizer
_name
"
]
=
model_info
.
tokenizer
if
model_info
.
tokenizer_mode
:
if
model_info
.
tokenizer_mode
:
vllm_runner_kwargs_
[
"tokenizer_mode"
]
=
model_info
.
tokenizer_mode
vllm_runner_kwargs_
[
"tokenizer_mode"
]
=
model_info
.
tokenizer_mode
if
model_info
.
hf_overrides
:
if
model_info
.
hf_overrides
:
...
@@ -94,10 +92,16 @@ def run_test(
...
@@ -94,10 +92,16 @@ def run_test(
if
stop_str
:
if
stop_str
:
vllm_kwargs
[
"stop"
]
=
stop_str
vllm_kwargs
[
"stop"
]
=
stop_str
for
prompts
,
media
in
vllm_inputs
:
for
prompts
,
image_data
,
video_data
,
audio_data
in
vllm_inputs
:
vllm_kwargs
[
runner_mm_key
]
=
media
mm_data
=
dict
(
images
=
image_data
,
videos
=
video_data
,
audios
=
audio_data
)
vllm_kwargs_with_mm_data
=
vllm_kwargs
|
mm_data
vllm_output
=
vllm_model
.
generate_greedy_logprobs
(
vllm_output
=
vllm_model
.
generate_greedy_logprobs
(
prompts
,
max_tokens
,
num_logprobs
=
num_logprobs
,
**
vllm_kwargs
)
prompts
,
max_tokens
,
num_logprobs
=
num_logprobs
,
**
vllm_kwargs_with_mm_data
)
vllm_outputs_per_mm
.
append
(
vllm_output
)
vllm_outputs_per_mm
.
append
(
vllm_output
)
hf_model
=
hf_runner
(
model
,
hf_model
=
hf_runner
(
model
,
...
@@ -122,14 +126,17 @@ def run_test(
...
@@ -122,14 +126,17 @@ def run_test(
if
stop_str
:
if
stop_str
:
hf_kwargs
[
"stop_strings"
]
=
stop_str
hf_kwargs
[
"stop_strings"
]
=
stop_str
for
prompts
,
media
in
inputs
:
for
prompts
,
image_data
,
video_data
,
audio_data
in
inputs
:
hf_kwargs
[
runner_mm_key
]
=
media
mm_data
=
dict
(
images
=
image_data
,
videos
=
video_data
,
audios
=
audio_data
)
hf_kwargs_with_mm_data
=
hf_kwargs
|
mm_data
hf_output
=
hf_model
.
generate_greedy_logprobs_limit
(
hf_output
=
hf_model
.
generate_greedy_logprobs_limit
(
prompts
,
prompts
,
max_tokens
,
max_tokens
,
num_logprobs
=
num_logprobs
,
num_logprobs
=
num_logprobs
,
tokenizer
=
tokenizer
,
tokenizer
=
tokenizer
,
**
hf_kwargs
)
**
hf_kwargs
_with_mm_data
)
hf_outputs_per_mm
.
append
(
hf_output
)
hf_outputs_per_mm
.
append
(
hf_output
)
# Apply output processing / sanitation to the vLLM and HF runner results
# Apply output processing / sanitation to the vLLM and HF runner results
...
...
tests/models/
decoder_only/vision_language
/vlm_utils/custom_inputs.py
→
tests/models/
multimodal/generation
/vlm_utils/custom_inputs.py
View file @
7a985548
...
@@ -12,7 +12,7 @@ from vllm.multimodal.video import (rescale_video_size, resize_video,
...
@@ -12,7 +12,7 @@ from vllm.multimodal.video import (rescale_video_size, resize_video,
from
.....conftest
import
IMAGE_ASSETS
,
VIDEO_ASSETS
from
.....conftest
import
IMAGE_ASSETS
,
VIDEO_ASSETS
from
.builders
import
build_multi_image_inputs
,
build_single_image_inputs
from
.builders
import
build_multi_image_inputs
,
build_single_image_inputs
from
.types
import
ImageSizeWrapper
,
SizeType
from
.types
import
ImageSizeWrapper
,
PromptWithMultiModalInput
,
SizeType
def
multi_image_multi_aspect_ratio_inputs
(
formatter
:
Callable
[[
str
],
str
]):
def
multi_image_multi_aspect_ratio_inputs
(
formatter
:
Callable
[[
str
],
str
]):
...
@@ -32,24 +32,28 @@ def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
...
@@ -32,24 +32,28 @@ def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
"<image>
\n
What is the season?"
,
"<image>
\n
What is the season?"
,
]
]
formatted_prompts
=
[
formatter
(
prompt
)
for
prompt
in
img_prompts
]
formatted_prompts
=
[
formatter
(
prompt
)
for
prompt
in
img_prompts
]
aspect_ratio_images
=
[
return
[(
[
stop_sign
,
cherry_blossom
],
formatted_prompts
,
# Images with different sizes and aspect-ratios
[
rescale_image_size
(
stop_sign
,
0.1
),
stop_sign
,
],
[
[
[
stop_sign
,
cherry_blossom
],
stop_sign
,
# Images with different sizes and aspect-ratios
rescale_image_size
(
stop_sign
,
0.25
),
[
cherry_blossom
.
resize
((
183
,
488
)),
rescale_image_size
(
stop_sign
,
0.1
),
cherry_blossom
.
resize
((
488
,
183
))
stop_sign
,
]
,
]
,
cherry_blossom
,
[
]
stop_sign
,
rescale_image_size
(
stop_sign
,
0.25
),
return
[
cherry_blossom
.
resize
((
183
,
488
)),
PromptWithMultiModalInput
(
cherry_blossom
.
resize
((
488
,
183
))
prompts
=
formatted_prompts
,
]
,
image_data
=
aspect_ratio_images
,
cherry_blossom
,
)
])
]
]
def
multi_video_multi_aspect_ratio_inputs
(
formatter
:
Callable
[[
str
],
str
],
def
multi_video_multi_aspect_ratio_inputs
(
formatter
:
Callable
[[
str
],
str
],
...
@@ -68,24 +72,28 @@ def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str],
...
@@ -68,24 +72,28 @@ def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str],
"<video>
\n
Why is this video funny?"
,
"<video>
\n
Why is this video funny?"
,
]
]
formatted_prompts
=
[
formatter
(
prompt
)
for
prompt
in
video_prompts
]
formatted_prompts
=
[
formatter
(
prompt
)
for
prompt
in
video_prompts
]
aspect_ratio_videos
=
[
return
[(
[
video
,
video
],
formatted_prompts
,
# Videos with different sizes and aspect-ratios
[
[
[
video
,
video
],
rescale_video_size
(
video
,
0.1
),
# Videos with different sizes and aspect-ratios
[
rescale_video_size
(
video
,
0.1
),
video
,
],
[
video
,
rescale_video_size
(
video
,
0.25
),
resize_video
(
video
,
(
183
,
488
)),
resize_video
(
video
,
(
488
,
183
))
],
video
,
video
,
])]
],
[
video
,
rescale_video_size
(
video
,
0.25
),
resize_video
(
video
,
(
183
,
488
)),
resize_video
(
video
,
(
488
,
183
))
],
video
,
]
return
[
PromptWithMultiModalInput
(
prompts
=
formatted_prompts
,
video_data
=
aspect_ratio_videos
,
)
]
def
different_patch_input_cases_internvl
():
def
different_patch_input_cases_internvl
():
...
...
tests/models/
decoder_only/vision_language
/vlm_utils/model_utils.py
→
tests/models/
multimodal/generation
/vlm_utils/model_utils.py
View file @
7a985548
...
@@ -16,7 +16,7 @@ from transformers import (AutoConfig, AutoTokenizer, BatchFeature,
...
@@ -16,7 +16,7 @@ from transformers import (AutoConfig, AutoTokenizer, BatchFeature,
from
vllm.sequence
import
SampleLogprobs
from
vllm.sequence
import
SampleLogprobs
from
vllm.transformers_utils.tokenizer
import
patch_padding_side
from
vllm.transformers_utils.tokenizer
import
patch_padding_side
from
.....conftest
import
HfRunner
,
ImageAsset
,
_
ImageAssets
from
.....conftest
import
HfRunner
,
ImageAsset
,
Image
Test
Assets
from
.types
import
RunnerOutput
from
.types
import
RunnerOutput
...
@@ -229,15 +229,35 @@ def minicpmv_trunc_hf_output(hf_output: RunnerOutput,
...
@@ -229,15 +229,35 @@ def minicpmv_trunc_hf_output(hf_output: RunnerOutput,
return
output_ids
,
output_str
,
out_logprobs
return
output_ids
,
output_str
,
out_logprobs
def
minimax_vl_01_hf_output
(
hf_output
:
RunnerOutput
,
model
:
str
)
->
RunnerOutput
:
output_ids
,
output_str
,
out_logprobs
=
hf_output
if
output_str
.
endswith
(
"<end_of_sentence>"
):
output_str
=
output_str
.
split
(
"<end_of_sentence>"
)[
0
]
return
output_ids
,
output_str
,
out_logprobs
def
ultravox_trunc_hf_output
(
hf_output
:
RunnerOutput
,
model
:
str
)
->
RunnerOutput
:
output_ids
,
output_str
,
out_logprobs
=
hf_output
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model
)
eos_token_id
=
tokenizer
.
eos_token_id
eos_token
=
tokenizer
.
decode
(
eos_token_id
)
if
output_str
.
endswith
(
eos_token
):
output_str
=
output_str
.
split
(
eos_token
)[
0
]
return
output_ids
,
output_str
,
out_logprobs
####### Functions for converting image assets to embeddings
####### Functions for converting image assets to embeddings
def
get_llava_embeddings
(
image_assets
:
_
ImageAssets
):
def
get_llava_embeddings
(
image_assets
:
Image
Test
Assets
):
return
[
asset
.
image_embeds
for
asset
in
image_assets
]
return
[
asset
.
image_embeds
for
asset
in
image_assets
]
####### Prompt path encoders for models that need models on disk
####### Prompt path encoders for models that need models on disk
def
qwen_prompt_path_encoder
(
def
qwen_prompt_path_encoder
(
tmp_path
:
PosixPath
,
prompt
:
str
,
assets
:
Union
[
list
[
ImageAsset
],
tmp_path
:
PosixPath
,
prompt
:
str
,
_
ImageAssets
])
->
str
:
assets
:
Union
[
list
[
ImageAsset
],
Image
Test
Assets
])
->
str
:
"""Given a temporary dir path, export one or more image assets into the
"""Given a temporary dir path, export one or more image assets into the
tempdir & replace its contents with the local path to the string so that
tempdir & replace its contents with the local path to the string so that
the HF version of Qwen-VL can resolve the path and load the image in its
the HF version of Qwen-VL can resolve the path and load the image in its
...
@@ -627,6 +647,17 @@ def minicpmv_26_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
...
@@ -627,6 +647,17 @@ def minicpmv_26_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
return
hf_model
return
hf_model
def
minimax_vl_01_patch_hf_runner
(
hf_model
:
HfRunner
)
->
HfRunner
:
orig_generate
=
hf_model
.
model
.
generate
def
_generate
(
self
,
*
args
,
image_sizes
=
None
,
**
kwargs
):
return
orig_generate
(
*
args
,
decode_text
=
False
,
**
kwargs
)
hf_model
.
model
.
generate
=
types
.
MethodType
(
_generate
,
hf_model
.
model
)
return
hf_model
def
molmo_patch_hf_runner
(
hf_model
:
HfRunner
)
->
HfRunner
:
def
molmo_patch_hf_runner
(
hf_model
:
HfRunner
)
->
HfRunner
:
"""Patches and returns an instance of the HfRunner to use for Molmo."""
"""Patches and returns an instance of the HfRunner to use for Molmo."""
hf_processor
=
hf_model
.
processor
hf_processor
=
hf_model
.
processor
...
@@ -657,3 +688,46 @@ def molmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
...
@@ -657,3 +688,46 @@ def molmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
hf_model
.
model
.
generate
=
types
.
MethodType
(
_generate
,
hf_model
.
model
)
hf_model
.
model
.
generate
=
types
.
MethodType
(
_generate
,
hf_model
.
model
)
return
hf_model
return
hf_model
def
ovis_patch_hf_runner
(
hf_model
:
HfRunner
)
->
HfRunner
:
"""Patches and returns an instance of the HfRunner to use for Ovis2."""
hf_model
.
model
.
get_output_embeddings
=
lambda
:
\
hf_model
.
model
.
llm
.
get_output_embeddings
()
def
processor
(
*
args
,
text
=
""
,
images
=
None
,
**
kwargs
):
text_tokenizer
=
hf_model
.
model
.
get_text_tokenizer
()
images
=
[
images
]
if
isinstance
(
images
,
Image
)
else
images
prompt_start_and_end
=
{
"qwen2"
:
(
"<|im_start|>user
\n
"
,
"<|im_end|>
\n
"
),
"llama"
:
(
"<|start_header_id|>user<|end_header_id|>
\n\n
"
,
"<|eot_id|>"
),
"gemma2"
:
(
"<start_of_turn>user
\n
"
,
"<end_of_turn>
\n
"
),
}
for
start
,
end
in
prompt_start_and_end
.
values
():
if
start
in
text
and
end
in
text
:
text
=
text
.
split
(
start
)[
1
].
split
(
end
)[
0
]
break
prompt
,
input_ids
,
pixel_values
=
hf_model
.
model
.
preprocess_inputs
(
text_or_conversations
=
text
,
images
=
images
)
attention_mask
=
torch
.
ne
(
input_ids
,
text_tokenizer
.
pad_token_id
)
inputs
=
{
"inputs"
:
input_ids
.
unsqueeze
(
0
),
"pixel_values"
:
pixel_values
.
unsqueeze
(
0
),
"attention_mask"
:
attention_mask
.
unsqueeze
(
0
),
}
return
BatchFeature
(
data
=
inputs
,
tensor_type
=
"pt"
)
hf_model
.
processor
=
processor
return
hf_model
def
qwen2_5_omni_patch_hf_runner
(
hf_model
:
HfRunner
)
->
HfRunner
:
"""Patches and returns an instance of the HfRunner for Qwen2.5-Omni."""
thinker
=
hf_model
.
model
.
thinker
thinker
.
get_output_embeddings
=
lambda
:
thinker
.
lm_head
hf_model
.
model
=
thinker
return
hf_model
tests/models/
decoder_only/vision_language
/vlm_utils/runners.py
→
tests/models/
multimodal/generation
/vlm_utils/runners.py
View file @
7a985548
...
@@ -4,7 +4,8 @@ types / modalities.
...
@@ -4,7 +4,8 @@ types / modalities.
"""
"""
from
pathlib
import
PosixPath
from
pathlib
import
PosixPath
from
.....conftest
import
HfRunner
,
VllmRunner
,
_ImageAssets
,
_VideoAssets
from
.....conftest
import
(
AudioTestAssets
,
HfRunner
,
ImageTestAssets
,
VideoTestAssets
,
VllmRunner
)
from
.
import
builders
,
core
from
.
import
builders
,
core
from
.types
import
ExpandableVLMTestArgs
,
VLMTestInfo
from
.types
import
ExpandableVLMTestArgs
,
VLMTestInfo
...
@@ -14,7 +15,7 @@ def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
...
@@ -14,7 +15,7 @@ def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
test_case
:
ExpandableVLMTestArgs
,
test_case
:
ExpandableVLMTestArgs
,
hf_runner
:
type
[
HfRunner
],
hf_runner
:
type
[
HfRunner
],
vllm_runner
:
type
[
VllmRunner
],
vllm_runner
:
type
[
VllmRunner
],
image_assets
:
_
ImageAssets
):
image_assets
:
Image
Test
Assets
):
assert
test_case
.
size_wrapper
is
not
None
assert
test_case
.
size_wrapper
is
not
None
inputs
=
builders
.
build_single_image_inputs_from_test_info
(
inputs
=
builders
.
build_single_image_inputs_from_test_info
(
model_test_info
,
image_assets
,
test_case
.
size_wrapper
,
tmp_path
)
model_test_info
,
image_assets
,
test_case
.
size_wrapper
,
tmp_path
)
...
@@ -29,7 +30,6 @@ def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
...
@@ -29,7 +30,6 @@ def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
num_logprobs
=
test_case
.
num_logprobs
,
num_logprobs
=
test_case
.
num_logprobs
,
limit_mm_per_prompt
=
{
"image"
:
1
},
limit_mm_per_prompt
=
{
"image"
:
1
},
distributed_executor_backend
=
test_case
.
distributed_executor_backend
,
distributed_executor_backend
=
test_case
.
distributed_executor_backend
,
runner_mm_key
=
"images"
,
**
model_test_info
.
get_non_parametrized_runner_kwargs
())
**
model_test_info
.
get_non_parametrized_runner_kwargs
())
...
@@ -37,7 +37,7 @@ def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
...
@@ -37,7 +37,7 @@ def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
test_case
:
ExpandableVLMTestArgs
,
test_case
:
ExpandableVLMTestArgs
,
hf_runner
:
type
[
HfRunner
],
hf_runner
:
type
[
HfRunner
],
vllm_runner
:
type
[
VllmRunner
],
vllm_runner
:
type
[
VllmRunner
],
image_assets
:
_
ImageAssets
):
image_assets
:
Image
Test
Assets
):
assert
test_case
.
size_wrapper
is
not
None
assert
test_case
.
size_wrapper
is
not
None
inputs
=
builders
.
build_multi_image_inputs_from_test_info
(
inputs
=
builders
.
build_multi_image_inputs_from_test_info
(
model_test_info
,
image_assets
,
test_case
.
size_wrapper
,
tmp_path
)
model_test_info
,
image_assets
,
test_case
.
size_wrapper
,
tmp_path
)
...
@@ -52,7 +52,6 @@ def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
...
@@ -52,7 +52,6 @@ def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
num_logprobs
=
test_case
.
num_logprobs
,
num_logprobs
=
test_case
.
num_logprobs
,
limit_mm_per_prompt
=
{
"image"
:
len
(
image_assets
)},
limit_mm_per_prompt
=
{
"image"
:
len
(
image_assets
)},
distributed_executor_backend
=
test_case
.
distributed_executor_backend
,
distributed_executor_backend
=
test_case
.
distributed_executor_backend
,
runner_mm_key
=
"images"
,
**
model_test_info
.
get_non_parametrized_runner_kwargs
())
**
model_test_info
.
get_non_parametrized_runner_kwargs
())
...
@@ -60,7 +59,7 @@ def run_embedding_test(*, model_test_info: VLMTestInfo,
...
@@ -60,7 +59,7 @@ def run_embedding_test(*, model_test_info: VLMTestInfo,
test_case
:
ExpandableVLMTestArgs
,
test_case
:
ExpandableVLMTestArgs
,
hf_runner
:
type
[
HfRunner
],
hf_runner
:
type
[
HfRunner
],
vllm_runner
:
type
[
VllmRunner
],
vllm_runner
:
type
[
VllmRunner
],
image_assets
:
_
ImageAssets
):
image_assets
:
Image
Test
Assets
):
assert
test_case
.
size_wrapper
is
not
None
assert
test_case
.
size_wrapper
is
not
None
inputs
,
vllm_embeddings
=
builders
.
build_embedding_inputs_from_test_info
(
inputs
,
vllm_embeddings
=
builders
.
build_embedding_inputs_from_test_info
(
model_test_info
,
image_assets
,
test_case
.
size_wrapper
)
model_test_info
,
image_assets
,
test_case
.
size_wrapper
)
...
@@ -76,7 +75,6 @@ def run_embedding_test(*, model_test_info: VLMTestInfo,
...
@@ -76,7 +75,6 @@ def run_embedding_test(*, model_test_info: VLMTestInfo,
limit_mm_per_prompt
=
{
"image"
:
1
},
limit_mm_per_prompt
=
{
"image"
:
1
},
vllm_embeddings
=
vllm_embeddings
,
vllm_embeddings
=
vllm_embeddings
,
distributed_executor_backend
=
test_case
.
distributed_executor_backend
,
distributed_executor_backend
=
test_case
.
distributed_executor_backend
,
runner_mm_key
=
"images"
,
**
model_test_info
.
get_non_parametrized_runner_kwargs
())
**
model_test_info
.
get_non_parametrized_runner_kwargs
())
...
@@ -86,7 +84,7 @@ def run_video_test(
...
@@ -86,7 +84,7 @@ def run_video_test(
test_case
:
ExpandableVLMTestArgs
,
test_case
:
ExpandableVLMTestArgs
,
hf_runner
:
type
[
HfRunner
],
hf_runner
:
type
[
HfRunner
],
vllm_runner
:
type
[
VllmRunner
],
vllm_runner
:
type
[
VllmRunner
],
video_assets
:
_
VideoAssets
,
video_assets
:
Video
Test
Assets
,
):
):
assert
test_case
.
size_wrapper
is
not
None
assert
test_case
.
size_wrapper
is
not
None
assert
test_case
.
num_video_frames
is
not
None
assert
test_case
.
num_video_frames
is
not
None
...
@@ -104,7 +102,30 @@ def run_video_test(
...
@@ -104,7 +102,30 @@ def run_video_test(
num_logprobs
=
test_case
.
num_logprobs
,
num_logprobs
=
test_case
.
num_logprobs
,
limit_mm_per_prompt
=
{
"video"
:
len
(
video_assets
)},
limit_mm_per_prompt
=
{
"video"
:
len
(
video_assets
)},
distributed_executor_backend
=
test_case
.
distributed_executor_backend
,
distributed_executor_backend
=
test_case
.
distributed_executor_backend
,
runner_mm_key
=
"videos"
,
**
model_test_info
.
get_non_parametrized_runner_kwargs
())
def
run_audio_test
(
*
,
model_test_info
:
VLMTestInfo
,
test_case
:
ExpandableVLMTestArgs
,
hf_runner
:
type
[
HfRunner
],
vllm_runner
:
type
[
VllmRunner
],
audio_assets
:
AudioTestAssets
,
):
inputs
=
builders
.
build_audio_inputs_from_test_info
(
model_test_info
,
audio_assets
)
core
.
run_test
(
hf_runner
=
hf_runner
,
vllm_runner
=
vllm_runner
,
inputs
=
inputs
,
model
=
test_case
.
model
,
dtype
=
test_case
.
dtype
,
max_tokens
=
test_case
.
max_tokens
,
num_logprobs
=
test_case
.
num_logprobs
,
limit_mm_per_prompt
=
{
"audio"
:
1
},
distributed_executor_backend
=
test_case
.
distributed_executor_backend
,
**
model_test_info
.
get_non_parametrized_runner_kwargs
())
**
model_test_info
.
get_non_parametrized_runner_kwargs
())
...
@@ -119,11 +140,9 @@ def run_custom_inputs_test(*, model_test_info: VLMTestInfo,
...
@@ -119,11 +140,9 @@ def run_custom_inputs_test(*, model_test_info: VLMTestInfo,
inputs
=
test_case
.
custom_test_opts
.
inputs
inputs
=
test_case
.
custom_test_opts
.
inputs
limit_mm_per_prompt
=
test_case
.
custom_test_opts
.
limit_mm_per_prompt
limit_mm_per_prompt
=
test_case
.
custom_test_opts
.
limit_mm_per_prompt
runner_mm_key
=
test_case
.
custom_test_opts
.
runner_mm_key
# Inputs and limit_mm_per_prompt should all be set
# Inputs, limit_mm_per_prompt, and runner_mm_key should all be set
assert
inputs
is
not
None
assert
inputs
is
not
None
assert
limit_mm_per_prompt
is
not
None
assert
limit_mm_per_prompt
is
not
None
assert
runner_mm_key
is
not
None
core
.
run_test
(
core
.
run_test
(
hf_runner
=
hf_runner
,
hf_runner
=
hf_runner
,
...
@@ -135,5 +154,4 @@ def run_custom_inputs_test(*, model_test_info: VLMTestInfo,
...
@@ -135,5 +154,4 @@ def run_custom_inputs_test(*, model_test_info: VLMTestInfo,
num_logprobs
=
test_case
.
num_logprobs
,
num_logprobs
=
test_case
.
num_logprobs
,
limit_mm_per_prompt
=
limit_mm_per_prompt
,
limit_mm_per_prompt
=
limit_mm_per_prompt
,
distributed_executor_backend
=
test_case
.
distributed_executor_backend
,
distributed_executor_backend
=
test_case
.
distributed_executor_backend
,
runner_mm_key
=
runner_mm_key
,
**
model_test_info
.
get_non_parametrized_runner_kwargs
())
**
model_test_info
.
get_non_parametrized_runner_kwargs
())
tests/models/
decoder_only/vision_language
/vlm_utils/types.py
→
tests/models/
multimodal/generation
/vlm_utils/types.py
View file @
7a985548
...
@@ -6,7 +6,6 @@ from pathlib import PosixPath
...
@@ -6,7 +6,6 @@ from pathlib import PosixPath
from
typing
import
Any
,
Callable
,
NamedTuple
,
Optional
,
Union
from
typing
import
Any
,
Callable
,
NamedTuple
,
Optional
,
Union
import
torch
import
torch
from
PIL.Image
import
Image
from
pytest
import
MarkDecorator
from
pytest
import
MarkDecorator
from
transformers
import
AutoModelForCausalLM
from
transformers
import
AutoModelForCausalLM
from
transformers.models.auto.auto_factory
import
_BaseAutoModelClass
from
transformers.models.auto.auto_factory
import
_BaseAutoModelClass
...
@@ -15,18 +14,25 @@ from vllm.config import TaskOption
...
@@ -15,18 +14,25 @@ from vllm.config import TaskOption
from
vllm.sequence
import
SampleLogprobs
from
vllm.sequence
import
SampleLogprobs
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
from
.....conftest
import
IMAGE_ASSETS
,
HfRunner
,
ImageAsset
,
_ImageAssets
from
.....conftest
import
(
AUDIO_ASSETS
,
IMAGE_ASSETS
,
HfRunner
,
ImageAsset
,
ImageTestAssets
,
PromptAudioInput
,
PromptImageInput
,
PromptVideoInput
)
from
....utils
import
check_logprobs_close
from
....utils
import
check_logprobs_close
# meta image tag; will be replaced by the appropriate tag for the model
# meta image tag; will be replaced by the appropriate tag for the model
TEST_IMG_PLACEHOLDER
=
"<vlm_image>"
TEST_IMG_PLACEHOLDER
=
"<vlm_image>"
TEST_VIDEO_PLACEHOLDER
=
"<vlm_video>"
TEST_VIDEO_PLACEHOLDER
=
"<vlm_video>"
TEST_AUDIO_PLACEHOLDER
=
"<lmm_audio>"
# yapf: disable
# yapf: disable
SINGLE_IMAGE_BASE_PROMPTS
=
IMAGE_ASSETS
.
prompts
({
SINGLE_IMAGE_BASE_PROMPTS
=
IMAGE_ASSETS
.
prompts
({
"stop_sign"
:
f
"
{
TEST_IMG_PLACEHOLDER
}
What's the content of the image?"
,
"stop_sign"
:
f
"
{
TEST_IMG_PLACEHOLDER
}
What's the content of the image?"
,
"cherry_blossom"
:
f
"
{
TEST_IMG_PLACEHOLDER
}
What is the season?"
,
"cherry_blossom"
:
f
"
{
TEST_IMG_PLACEHOLDER
}
What is the season?"
,
})
})
SINGLE_AUDIO_BASE_PROMPT
=
AUDIO_ASSETS
.
prompts
({
"mary_had_lamb"
:
f
"
{
TEST_AUDIO_PLACEHOLDER
}
Transcribe this audio into English."
,
# noqa: E501
"winning_call"
:
f
"
{
TEST_AUDIO_PLACEHOLDER
}
What is happening in this audio clip?"
,
# noqa: E501
})
MULTI_IMAGE_BASE_PROMPT
=
f
"Image-1:
{
TEST_IMG_PLACEHOLDER
}
Image-2:
{
TEST_IMG_PLACEHOLDER
}
Describe the two images in detail.
\n
"
# noqa: E501
MULTI_IMAGE_BASE_PROMPT
=
f
"Image-1:
{
TEST_IMG_PLACEHOLDER
}
Image-2:
{
TEST_IMG_PLACEHOLDER
}
Describe the two images in detail.
\n
"
# noqa: E501
VIDEO_BASE_PROMPT
=
f
"
{
TEST_VIDEO_PLACEHOLDER
}
Why is this video funny?"
VIDEO_BASE_PROMPT
=
f
"
{
TEST_VIDEO_PLACEHOLDER
}
Why is this video funny?"
...
@@ -38,12 +44,21 @@ RunnerOutput = tuple[list[int], str, Optional[SampleLogprobs]]
...
@@ -38,12 +44,21 @@ RunnerOutput = tuple[list[int], str, Optional[SampleLogprobs]]
# yapf: enable
# yapf: enable
class
PromptWithMultiModalInput
(
NamedTuple
):
"""Holds the multimodal input for a single test case."""
prompts
:
list
[
str
]
image_data
:
Optional
[
PromptImageInput
]
=
None
video_data
:
Optional
[
PromptVideoInput
]
=
None
audio_data
:
Optional
[
PromptAudioInput
]
=
None
class
VLMTestType
(
Enum
):
class
VLMTestType
(
Enum
):
IMAGE
=
1
IMAGE
=
1
MULTI_IMAGE
=
2
MULTI_IMAGE
=
2
EMBEDDING
=
3
EMBEDDING
=
3
VIDEO
=
4
VIDEO
=
4
CUSTOM_INPUTS
=
5
AUDIO
=
5
CUSTOM_INPUTS
=
6
class
SizeType
(
Enum
):
class
SizeType
(
Enum
):
...
@@ -52,10 +67,8 @@ class SizeType(Enum):
...
@@ -52,10 +67,8 @@ class SizeType(Enum):
class
CustomTestOptions
(
NamedTuple
):
class
CustomTestOptions
(
NamedTuple
):
inputs
:
list
[
tuple
[
list
[
str
],
list
[
Union
[
list
[
Image
],
Image
]]]
]
inputs
:
list
[
PromptWithMultiModalInput
]
limit_mm_per_prompt
:
dict
[
str
,
int
]
limit_mm_per_prompt
:
dict
[
str
,
int
]
# kwarg to pass multimodal data in as to vllm/hf runner instances.
runner_mm_key
:
str
=
"images"
class
ImageSizeWrapper
(
NamedTuple
):
class
ImageSizeWrapper
(
NamedTuple
):
...
@@ -75,6 +88,7 @@ class VLMTestInfo(NamedTuple):
...
@@ -75,6 +88,7 @@ class VLMTestInfo(NamedTuple):
prompt_formatter
:
Optional
[
Callable
[[
str
],
str
]]
=
None
prompt_formatter
:
Optional
[
Callable
[[
str
],
str
]]
=
None
img_idx_to_prompt
:
Callable
[[
int
],
str
]
=
lambda
idx
:
"<image>
\n
"
img_idx_to_prompt
:
Callable
[[
int
],
str
]
=
lambda
idx
:
"<image>
\n
"
video_idx_to_prompt
:
Callable
[[
int
],
str
]
=
lambda
idx
:
"<video>
\n
"
video_idx_to_prompt
:
Callable
[[
int
],
str
]
=
lambda
idx
:
"<video>
\n
"
audio_idx_to_prompt
:
Callable
[[
int
],
str
]
=
lambda
idx
:
"<audio>
\n
"
# Most models work on the single / multi-image prompts above, but in some
# Most models work on the single / multi-image prompts above, but in some
# cases the log prob check fails, e.g., for paligemma. We allow passing
# cases the log prob check fails, e.g., for paligemma. We allow passing
...
@@ -85,7 +99,7 @@ class VLMTestInfo(NamedTuple):
...
@@ -85,7 +99,7 @@ class VLMTestInfo(NamedTuple):
# Function for converting ImageAssets to image embeddings;
# Function for converting ImageAssets to image embeddings;
# We need to define this explicitly for embedding tests
# We need to define this explicitly for embedding tests
convert_assets_to_embeddings
:
Optional
[
Callable
[[
_
ImageAssets
],
convert_assets_to_embeddings
:
Optional
[
Callable
[[
Image
Test
Assets
],
torch
.
Tensor
]]
=
None
torch
.
Tensor
]]
=
None
# Exposed options for vLLM runner; we change these in a several tests,
# Exposed options for vLLM runner; we change these in a several tests,
...
@@ -141,7 +155,7 @@ class VLMTestInfo(NamedTuple):
...
@@ -141,7 +155,7 @@ class VLMTestInfo(NamedTuple):
# for Qwen-VL, which requires encoding the image path / url into the prompt
# for Qwen-VL, which requires encoding the image path / url into the prompt
# for HF runner
# for HF runner
prompt_path_encoder
:
Optional
[
prompt_path_encoder
:
Optional
[
Callable
[[
PosixPath
,
str
,
Union
[
list
[
ImageAsset
],
_
ImageAssets
]],
Callable
[[
PosixPath
,
str
,
Union
[
list
[
ImageAsset
],
Image
Test
Assets
]],
str
]]
=
None
# noqa: E501
str
]]
=
None
# noqa: E501
# Allows configuring a test to run with custom inputs
# Allows configuring a test to run with custom inputs
...
...
tests/models/
embedd
ing/__init__.py
→
tests/models/
multimodal/pool
ing/__init__.py
View file @
7a985548
File moved
tests/models/
embedding/vision_language
/test_dse_qwen2_vl.py
→
tests/models/
multimodal/pooling
/test_dse_qwen2_vl.py
View file @
7a985548
...
@@ -10,7 +10,7 @@ from transformers import Qwen2VLForConditionalGeneration
...
@@ -10,7 +10,7 @@ from transformers import Qwen2VLForConditionalGeneration
from
....conftest
import
IMAGE_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
from
....conftest
import
IMAGE_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
from
....utils
import
large_gpu_test
from
....utils
import
large_gpu_test
from
..utils
import
check_embeddings_close
from
..
.
utils
import
check_embeddings_close
HF_TEXT_PROMPTS
=
[
HF_TEXT_PROMPTS
=
[
# T -> X
# T -> X
...
...
tests/models/
decoder_only/vision_language
/test_intern_vit.py
→
tests/models/
multimodal/pooling
/test_intern_vit.py
View file @
7a985548
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
from
typing
import
Optional
import
pytest
import
pytest
import
torch
import
torch
import
torch.nn
as
nn
import
torch.nn
as
nn
from
huggingface_hub
import
snapshot_download
from
huggingface_hub
import
snapshot_download
from
transformers
import
AutoConfig
,
AutoModel
,
CLIPImageProcessor
from
transformers
import
AutoConfig
,
AutoModel
,
CLIPImageProcessor
from
....conftest
import
_ImageAssets
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.utils
import
STR_DTYPE_TO_TORCH_DTYPE
from
....conftest
import
ImageTestAssets
# we use snapshot_download to prevent conflicts between
# we use snapshot_download to prevent conflicts between
# dynamic_module and trust_remote_code for hf_runner
# dynamic_module and trust_remote_code for hf_runner
DOWNLOAD_PATTERN
=
[
"*.json"
,
"*.py"
,
"*.safetensors"
,
"*.txt"
,
"*.model"
]
DOWNLOAD_PATTERN
=
[
"*.json"
,
"*.py"
,
"*.safetensors"
,
"*.txt"
,
"*.model"
]
@
torch
.
inference_mode
()
def
run_intern_vit_test
(
def
run_intern_vit_test
(
image_assets
:
_
ImageAssets
,
image_assets
:
Image
Test
Assets
,
model_id
:
str
,
model_id
:
str
,
*
,
*
,
dtype
:
str
,
dtype
:
str
,
distributed_executor_backend
:
Optional
[
str
]
=
None
,
):
):
model
=
snapshot_download
(
model_id
,
allow_patterns
=
DOWNLOAD_PATTERN
)
model
=
snapshot_download
(
model_id
,
allow_patterns
=
DOWNLOAD_PATTERN
)
torch_dtype
=
STR_DTYPE_TO_TORCH_DTYPE
[
dtype
]
img_processor
=
CLIPImageProcessor
.
from_pretrained
(
model
)
img_processor
=
CLIPImageProcessor
.
from_pretrained
(
model
)
images
=
[
asset
.
pil_image
for
asset
in
image_assets
]
images
=
[
asset
.
pil_image
for
asset
in
image_assets
]
pixel_values
=
[
pixel_values
=
[
img_processor
(
images
,
return_tensors
=
'pt'
).
pixel_values
.
to
(
dtype
)
img_processor
(
images
,
return_tensors
=
'pt'
).
pixel_values
.
to
(
torch_
dtype
)
for
images
in
images
for
images
in
images
]
]
...
@@ -36,14 +37,13 @@ def run_intern_vit_test(
...
@@ -36,14 +37,13 @@ def run_intern_vit_test(
config
.
norm_type
=
"rms_norm"
config
.
norm_type
=
"rms_norm"
hf_model
=
AutoModel
.
from_pretrained
(
model
,
hf_model
=
AutoModel
.
from_pretrained
(
model
,
torch_dtype
=
dtype
,
torch_dtype
=
torch_
dtype
,
trust_remote_code
=
True
).
to
(
"cuda"
)
trust_remote_code
=
True
).
to
(
"cuda"
)
hf_outputs_per_image
=
[
hf_outputs_per_image
=
[
hf_model
(
pixel_value
.
to
(
"cuda"
)).
last_hidden_state
hf_model
(
pixel_value
.
to
(
"cuda"
)).
last_hidden_state
for
pixel_value
in
pixel_values
for
pixel_value
in
pixel_values
]
]
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.model_executor.models.intern_vit
import
InternVisionModel
from
vllm.model_executor.models.intern_vit
import
InternVisionModel
vllm_model
=
InternVisionModel
(
config
)
vllm_model
=
InternVisionModel
(
config
)
vllm_model
.
load_weights
(
hf_model
.
state_dict
().
items
())
vllm_model
.
load_weights
(
hf_model
.
state_dict
().
items
())
...
@@ -51,7 +51,7 @@ def run_intern_vit_test(
...
@@ -51,7 +51,7 @@ def run_intern_vit_test(
del
hf_model
del
hf_model
cleanup_dist_env_and_memory
()
cleanup_dist_env_and_memory
()
vllm_model
=
vllm_model
.
to
(
"cuda"
,
dtype
)
vllm_model
=
vllm_model
.
to
(
"cuda"
,
torch_
dtype
)
vllm_outputs_per_image
=
[
vllm_outputs_per_image
=
[
vllm_model
(
pixel_values
=
pixel_value
.
to
(
"cuda"
))
vllm_model
(
pixel_values
=
pixel_value
.
to
(
"cuda"
))
for
pixel_value
in
pixel_values
for
pixel_value
in
pixel_values
...
@@ -69,8 +69,7 @@ def run_intern_vit_test(
...
@@ -69,8 +69,7 @@ def run_intern_vit_test(
"OpenGVLab/InternViT-300M-448px"
,
"OpenGVLab/InternViT-300M-448px"
,
"OpenGVLab/InternViT-6B-448px-V1-5"
,
"OpenGVLab/InternViT-6B-448px-V1-5"
,
])
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
torch
.
half
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
torch
.
inference_mode
()
def
test_models
(
dist_init
,
image_assets
,
model_id
,
dtype
:
str
)
->
None
:
def
test_models
(
dist_init
,
image_assets
,
model_id
,
dtype
:
str
)
->
None
:
run_intern_vit_test
(
run_intern_vit_test
(
image_assets
,
image_assets
,
...
...
tests/models/
embedding/vision_language
/test_llava_next.py
→
tests/models/
multimodal/pooling
/test_llava_next.py
View file @
7a985548
...
@@ -8,7 +8,7 @@ from vllm.platforms import current_platform
...
@@ -8,7 +8,7 @@ from vllm.platforms import current_platform
from
....conftest
import
IMAGE_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
from
....conftest
import
IMAGE_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
from
....utils
import
large_gpu_test
from
....utils
import
large_gpu_test
from
..utils
import
check_embeddings_close
from
..
.
utils
import
check_embeddings_close
# Llava Next embedding implementation is only supported by CUDA.
# Llava Next embedding implementation is only supported by CUDA.
# If run on ROCm, hf_model.model.resize_token_embeddings will
# If run on ROCm, hf_model.model.resize_token_embeddings will
...
...
tests/models/
embedding/vision_language
/test_phi3v.py
→
tests/models/
multimodal/pooling
/test_phi3v.py
View file @
7a985548
...
@@ -9,7 +9,7 @@ from vllm.assets.image import VLM_IMAGES_DIR
...
@@ -9,7 +9,7 @@ from vllm.assets.image import VLM_IMAGES_DIR
from
....conftest
import
IMAGE_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
from
....conftest
import
IMAGE_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
from
....utils
import
large_gpu_test
from
....utils
import
large_gpu_test
from
..utils
import
check_embeddings_close
from
..
.
utils
import
check_embeddings_close
HF_TEXT_PROMPTS
=
[
HF_TEXT_PROMPTS
=
[
# T -> X
# T -> X
...
...
tests/models/multimodal/processing/test_common.py
View file @
7a985548
...
@@ -146,7 +146,8 @@ def _test_processing_correctness_hf(
...
@@ -146,7 +146,8 @@ def _test_processing_correctness_hf(
batch_idx
:
int
,
batch_idx
:
int
,
ignore_mm_keys
:
Optional
[
set
[
str
]]
=
None
,
ignore_mm_keys
:
Optional
[
set
[
str
]]
=
None
,
):
):
if
model_config
.
hf_config
.
model_type
in
(
"mllama"
,
"whisper"
,
"ultravox"
):
if
model_config
.
hf_config
.
model_type
in
(
"mllama"
,
"ovis"
,
"ultravox"
,
"whisper"
):
# For some multimodal models, tokenizer will always add bos_token
# For some multimodal models, tokenizer will always add bos_token
# at the beginning of prompt by default, causing hf_processor outputs
# at the beginning of prompt by default, causing hf_processor outputs
# incorrect token ids. So we need use `add_special_tokens=False` here
# incorrect token ids. So we need use `add_special_tokens=False` here
...
@@ -270,9 +271,13 @@ def _test_processing_correctness_mistral(
...
@@ -270,9 +271,13 @@ def _test_processing_correctness_mistral(
"openbmb/MiniCPM-Llama3-V-2_5"
,
"openbmb/MiniCPM-Llama3-V-2_5"
,
"openbmb/MiniCPM-o-2_6"
,
"openbmb/MiniCPM-o-2_6"
,
"openbmb/MiniCPM-V-2_6"
,
"openbmb/MiniCPM-V-2_6"
,
"MiniMaxAI/MiniMax-VL-01"
,
"allenai/Molmo-7B-D-0924"
,
"allenai/Molmo-7B-D-0924"
,
"allenai/Molmo-7B-O-0924"
,
"allenai/Molmo-7B-O-0924"
,
"nvidia/NVLM-D-72B"
,
"nvidia/NVLM-D-72B"
,
"AIDC-AI/Ovis1.6-Gemma2-9B"
,
"AIDC-AI/Ovis1.6-Llama3.2-3B"
,
"AIDC-AI/Ovis2-1B"
,
"google/paligemma-3b-mix-224"
,
"google/paligemma-3b-mix-224"
,
"google/paligemma2-3b-ft-docci-448"
,
"google/paligemma2-3b-ft-docci-448"
,
"microsoft/Phi-4-multimodal-instruct"
,
"microsoft/Phi-4-multimodal-instruct"
,
...
@@ -282,7 +287,7 @@ def _test_processing_correctness_mistral(
...
@@ -282,7 +287,7 @@ def _test_processing_correctness_mistral(
"Qwen/Qwen2-VL-2B-Instruct"
,
"Qwen/Qwen2-VL-2B-Instruct"
,
"Qwen/Qwen2.5-VL-3B-Instruct"
,
"Qwen/Qwen2.5-VL-3B-Instruct"
,
"Qwen/Qwen2-Audio-7B-Instruct"
,
"Qwen/Qwen2-Audio-7B-Instruct"
,
"Qwen/Qwen2.5-Omni-
7
B"
,
"Qwen/Qwen2.5-Omni-
3
B"
,
"Skywork/Skywork-R1V-38B"
,
"Skywork/Skywork-R1V-38B"
,
"fixie-ai/ultravox-v0_5-llama-3_2-1b"
,
"fixie-ai/ultravox-v0_5-llama-3_2-1b"
,
"openai/whisper-large-v3"
,
"openai/whisper-large-v3"
,
...
...
tests/models/multimodal/processing/test_h2ovl.py
View file @
7a985548
...
@@ -11,7 +11,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
...
@@ -11,7 +11,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
from
vllm.multimodal.image
import
rescale_image_size
from
vllm.multimodal.image
import
rescale_image_size
from
vllm.multimodal.processing
import
BaseMultiModalProcessor
from
vllm.multimodal.processing
import
BaseMultiModalProcessor
from
....conftest
import
_
ImageAssets
from
....conftest
import
Image
Test
Assets
from
...utils
import
build_model_context
from
...utils
import
build_model_context
...
@@ -137,7 +137,7 @@ def _run_check(
...
@@ -137,7 +137,7 @@ def _run_check(
@
pytest
.
mark
.
parametrize
(
"kwargs_on_init"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"kwargs_on_init"
,
[
True
,
False
])
def
test_processor_override
(
def
test_processor_override
(
model_id
:
str
,
model_id
:
str
,
image_assets
:
_
ImageAssets
,
image_assets
:
Image
Test
Assets
,
size_factors
:
list
[
int
],
size_factors
:
list
[
int
],
min_dynamic_patch
:
int
,
min_dynamic_patch
:
int
,
max_dynamic_patch
:
int
,
max_dynamic_patch
:
int
,
...
...
tests/models/multimodal/processing/test_idefics3.py
View file @
7a985548
...
@@ -5,7 +5,7 @@ from transformers import Idefics3Config
...
@@ -5,7 +5,7 @@ from transformers import Idefics3Config
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
....conftest
import
_
ImageAssets
from
....conftest
import
Image
Test
Assets
from
...utils
import
build_model_context
from
...utils
import
build_model_context
...
@@ -21,7 +21,7 @@ from ...utils import build_model_context
...
@@ -21,7 +21,7 @@ from ...utils import build_model_context
@
pytest
.
mark
.
parametrize
(
"num_imgs"
,
[
1
,
2
])
@
pytest
.
mark
.
parametrize
(
"num_imgs"
,
[
1
,
2
])
@
pytest
.
mark
.
parametrize
(
"kwargs_on_init"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"kwargs_on_init"
,
[
True
,
False
])
def
test_processor_override
(
def
test_processor_override
(
image_assets
:
_
ImageAssets
,
image_assets
:
Image
Test
Assets
,
model_id
:
str
,
model_id
:
str
,
mm_processor_kwargs
:
dict
[
str
,
object
],
mm_processor_kwargs
:
dict
[
str
,
object
],
expected_toks_per_img
:
int
,
expected_toks_per_img
:
int
,
...
...
tests/models/multimodal/processing/test_internvl.py
View file @
7a985548
...
@@ -11,7 +11,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
...
@@ -11,7 +11,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
from
vllm.multimodal.image
import
rescale_image_size
from
vllm.multimodal.image
import
rescale_image_size
from
vllm.multimodal.processing
import
BaseMultiModalProcessor
from
vllm.multimodal.processing
import
BaseMultiModalProcessor
from
....conftest
import
_
ImageAssets
from
....conftest
import
Image
Test
Assets
from
...utils
import
build_model_context
from
...utils
import
build_model_context
...
@@ -94,7 +94,7 @@ def _run_check(
...
@@ -94,7 +94,7 @@ def _run_check(
@
pytest
.
mark
.
parametrize
(
"kwargs_on_init"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"kwargs_on_init"
,
[
True
,
False
])
def
test_processor_override
(
def
test_processor_override
(
model_id
:
str
,
model_id
:
str
,
image_assets
:
_
ImageAssets
,
image_assets
:
Image
Test
Assets
,
size_factors
:
list
[
int
],
size_factors
:
list
[
int
],
min_dynamic_patch
:
int
,
min_dynamic_patch
:
int
,
max_dynamic_patch
:
int
,
max_dynamic_patch
:
int
,
...
...
tests/models/multimodal/processing/test_llama4.py
View file @
7a985548
...
@@ -6,7 +6,7 @@ import pytest
...
@@ -6,7 +6,7 @@ import pytest
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.transformers_utils.tokenizer
import
encode_tokens
from
vllm.transformers_utils.tokenizer
import
encode_tokens
from
....conftest
import
_
ImageAssets
from
....conftest
import
Image
Test
Assets
from
...utils
import
build_model_context
from
...utils
import
build_model_context
...
@@ -17,7 +17,7 @@ from ...utils import build_model_context
...
@@ -17,7 +17,7 @@ from ...utils import build_model_context
@
pytest
.
mark
.
parametrize
(
"disable_mm_preprocessor_cache"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"disable_mm_preprocessor_cache"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"tokenized_prompt"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"tokenized_prompt"
,
[
True
,
False
])
def
test_processor_override
(
def
test_processor_override
(
image_assets
:
_
ImageAssets
,
image_assets
:
Image
Test
Assets
,
model_id
:
str
,
model_id
:
str
,
mm_processor_kwargs
:
dict
,
mm_processor_kwargs
:
dict
,
num_imgs
:
int
,
num_imgs
:
int
,
...
...
tests/models/multimodal/processing/test_minimax_vl_01.py
0 → 100644
View file @
7a985548
# SPDX-License-Identifier: Apache-2.0
import
pytest
from
PIL
import
Image
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.parse
import
ImageSize
from
vllm.multimodal.processing
import
BaseMultiModalProcessor
from
....conftest
import
ImageTestAssets
from
...utils
import
build_model_context
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
"MiniMaxAI/MiniMax-VL-01"
])
@
pytest
.
mark
.
parametrize
(
"num_imgs"
,
[
1
,
2
])
def
test_processor_override
(
image_assets
:
ImageTestAssets
,
model_id
:
str
,
num_imgs
:
int
,
):
ctx
=
build_model_context
(
model_id
,
mm_processor_kwargs
=
None
,
limit_mm_per_prompt
=
{
"image"
:
num_imgs
},
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
)
prompt
=
"<image>"
*
num_imgs
image
=
Image
.
new
(
"RGB"
,
size
=
(
364
,
364
))
mm_data
=
{
"image"
:
[
image
]
*
num_imgs
}
processed_inputs
=
processor
.
apply
(
prompt
,
mm_data
,
{})
image_placeholders
=
processed_inputs
[
"mm_placeholders"
][
"image"
]
assert
len
(
image_placeholders
)
==
num_imgs
def
_validate_image_prompt_replacements_one
(
processor
:
BaseMultiModalProcessor
,
num_imgs
:
int
,
failed_size_excs
:
list
[
tuple
[
ImageSize
,
Exception
]],
image_size
:
ImageSize
,
)
->
None
:
prompt
=
"<image>"
*
num_imgs
image
=
Image
.
new
(
"RGB"
,
size
=
image_size
)
mm_data
=
{
"image"
:
[
image
]
*
num_imgs
}
try
:
processed_inputs
=
processor
.
apply
(
prompt
,
mm_data
,
{})
image_placeholders
=
processed_inputs
[
"mm_placeholders"
][
"image"
]
assert
len
(
image_placeholders
)
==
num_imgs
except
Exception
as
exc
:
failed_size_excs
.
append
((
image_size
,
exc
))
def
_test_image_prompt_replacements
(
processor
,
*
,
num_imgs
:
int
,
image_sizes
:
list
[
ImageSize
],
)
->
None
:
failed_size_excs
=
list
[
tuple
[
ImageSize
,
Exception
]]()
for
size
in
image_sizes
:
_validate_image_prompt_replacements_one
(
processor
,
num_imgs
,
failed_size_excs
,
size
)
if
failed_size_excs
:
msg
=
"Found failing image sizes:"
\
+
"
\n
========
\n
"
.
join
(
f
"[
{
size
}
]
\n
{
exc
}
"
for
size
,
exc
in
failed_size_excs
)
raise
AssertionError
(
msg
)
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
"MiniMaxAI/MiniMax-VL-01"
])
@
pytest
.
mark
.
parametrize
(
"num_imgs"
,
[
1
,
2
])
def
test_processor_prompt_replacements_regression
(
model_id
,
num_imgs
):
ctx
=
build_model_context
(
model_id
,
mm_processor_kwargs
=
None
,
limit_mm_per_prompt
=
{
"image"
:
num_imgs
},
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
)
image_ratios
=
[(
171
,
152
),
(
184
,
161
),
(
198
,
176
),
(
333
,
296
),
(
369
,
328
),
(
488
,
183
),
(
2560
,
1669
)]
image_sizes
=
[
size
for
w
,
h
in
image_ratios
for
size
in
[
ImageSize
(
w
,
h
),
ImageSize
(
h
,
w
)]
]
_test_image_prompt_replacements
(
processor
,
num_imgs
=
num_imgs
,
image_sizes
=
image_sizes
,
)
tests/models/multimodal/processing/test_phi3v.py
View file @
7a985548
...
@@ -4,7 +4,7 @@ import pytest
...
@@ -4,7 +4,7 @@ import pytest
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
....conftest
import
_
ImageAssets
from
....conftest
import
Image
Test
Assets
from
...utils
import
build_model_context
from
...utils
import
build_model_context
...
@@ -22,7 +22,7 @@ from ...utils import build_model_context
...
@@ -22,7 +22,7 @@ from ...utils import build_model_context
@
pytest
.
mark
.
parametrize
(
"num_imgs"
,
[
1
,
2
])
@
pytest
.
mark
.
parametrize
(
"num_imgs"
,
[
1
,
2
])
@
pytest
.
mark
.
parametrize
(
"kwargs_on_init"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"kwargs_on_init"
,
[
True
,
False
])
def
test_processor_override
(
def
test_processor_override
(
image_assets
:
_
ImageAssets
,
image_assets
:
Image
Test
Assets
,
model_id
:
str
,
model_id
:
str
,
mm_processor_kwargs
:
dict
[
str
,
int
],
mm_processor_kwargs
:
dict
[
str
,
int
],
expected_toks_per_img
:
int
,
expected_toks_per_img
:
int
,
...
...
tests/models/multimodal/processing/test_phi4mm.py
View file @
7a985548
...
@@ -4,7 +4,7 @@ import pytest
...
@@ -4,7 +4,7 @@ import pytest
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
....conftest
import
_
ImageAssets
from
....conftest
import
Image
Test
Assets
from
...utils
import
build_model_context
from
...utils
import
build_model_context
...
@@ -22,7 +22,7 @@ from ...utils import build_model_context
...
@@ -22,7 +22,7 @@ from ...utils import build_model_context
@
pytest
.
mark
.
parametrize
(
"num_imgs"
,
[
1
,
2
])
@
pytest
.
mark
.
parametrize
(
"num_imgs"
,
[
1
,
2
])
@
pytest
.
mark
.
parametrize
(
"kwargs_on_init"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"kwargs_on_init"
,
[
True
,
False
])
def
test_processor_override
(
def
test_processor_override
(
image_assets
:
_
ImageAssets
,
image_assets
:
Image
Test
Assets
,
model_id
:
str
,
model_id
:
str
,
mm_processor_kwargs
:
dict
[
str
,
int
],
mm_processor_kwargs
:
dict
[
str
,
int
],
expected_toks_per_img
:
int
,
expected_toks_per_img
:
int
,
...
...
Prev
1
…
17
18
19
20
21
22
23
24
25
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment