Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
390ec889
Unverified
Commit
390ec889
authored
May 16, 2025
by
Isotr0py
Committed by
GitHub
May 16, 2025
Browse files
[Misc] Consolidate Audio tests into multimodal common generation tests (#18214)
Signed-off-by:
Isotr0py
<
2037008807@qq.com
>
parent
54181767
Changes
9
Show whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
282 additions
and
215 deletions
+282
-215
tests/models/multimodal/generation/test_common.py
tests/models/multimodal/generation/test_common.py
+60
-4
tests/models/multimodal/generation/test_ultravox.py
tests/models/multimodal/generation/test_ultravox.py
+3
-109
tests/models/multimodal/generation/vlm_utils/builders.py
tests/models/multimodal/generation/vlm_utils/builders.py
+96
-37
tests/models/multimodal/generation/vlm_utils/case_filtering.py
.../models/multimodal/generation/vlm_utils/case_filtering.py
+4
-4
tests/models/multimodal/generation/vlm_utils/core.py
tests/models/multimodal/generation/vlm_utils/core.py
+18
-11
tests/models/multimodal/generation/vlm_utils/custom_inputs.py
...s/models/multimodal/generation/vlm_utils/custom_inputs.py
+42
-34
tests/models/multimodal/generation/vlm_utils/model_utils.py
tests/models/multimodal/generation/vlm_utils/model_utils.py
+12
-0
tests/models/multimodal/generation/vlm_utils/runners.py
tests/models/multimodal/generation/vlm_utils/runners.py
+27
-10
tests/models/multimodal/generation/vlm_utils/types.py
tests/models/multimodal/generation/vlm_utils/types.py
+20
-6
No files found.
tests/models/multimodal/generation/test_common.py
View file @
390ec889
...
...
@@ -8,14 +8,14 @@ from collections import defaultdict
from
pathlib
import
PosixPath
import
pytest
from
transformers
import
(
AutoModelForImageTextToText
,
from
transformers
import
(
AutoModel
,
AutoModelForImageTextToText
,
AutoModelForTextToWaveform
,
AutoModelForVision2Seq
)
from
vllm.platforms
import
current_platform
from
vllm.utils
import
identity
from
....conftest
import
(
IMAGE_ASSETS
,
HfRunner
,
Image
TestAssets
,
VideoTestAssets
,
VllmRunner
)
from
....conftest
import
(
IMAGE_ASSETS
,
Audio
TestAssets
,
HfRunner
,
ImageTestAssets
,
VideoTestAssets
,
VllmRunner
)
from
....utils
import
(
create_new_process_for_each_test
,
large_gpu_mark
,
multi_gpu_marks
)
from
...utils
import
check_outputs_equal
...
...
@@ -158,6 +158,17 @@ VLM_TEST_SETTINGS = {
image_size_factors
=
[(),
(
0.25
,),
(
0.25
,
0.25
,
0.25
),
(
0.25
,
0.2
,
0.15
)],
marks
=
[
pytest
.
mark
.
core_model
,
pytest
.
mark
.
cpu_model
],
),
"ultravox"
:
VLMTestInfo
(
models
=
[
"fixie-ai/ultravox-v0_5-llama-3_2-1b"
],
test_type
=
VLMTestType
.
AUDIO
,
prompt_formatter
=
lambda
audio_prompt
:
f
"<|begin_of_text|><|start_header_id|>user<|end_header_id|>
\n\n
{
audio_prompt
}
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
\n\n
"
,
# noqa: E501
audio_idx_to_prompt
=
lambda
idx
:
"<|audio|>"
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
auto_cls
=
AutoModel
,
hf_output_post_proc
=
model_utils
.
ultravox_trunc_hf_output
,
marks
=
[
pytest
.
mark
.
core_model
,
pytest
.
mark
.
cpu_model
],
),
#### Extended model tests
"aria"
:
VLMTestInfo
(
models
=
[
"rhymes-ai/Aria"
],
...
...
@@ -393,7 +404,6 @@ VLM_TEST_SETTINGS = {
formatter
=
lambda
vid_prompt
:
f
"<|im_start|>user
\n
{
vid_prompt
}
<|im_end|>
\n
<|im_start|>assistant
\n
"
,
# noqa: E501
),
limit_mm_per_prompt
=
{
"video"
:
4
},
runner_mm_key
=
"videos"
,
)],
),
"llava_next_video"
:
VLMTestInfo
(
...
...
@@ -706,6 +716,7 @@ VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2)
# - multi-image
# - image embeddings
# - video
# - audio
# - custom inputs
@
pytest
.
mark
.
parametrize
(
"model_type,test_case"
,
...
...
@@ -803,6 +814,28 @@ def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
)
@
pytest
.
mark
.
parametrize
(
"model_type,test_case"
,
get_parametrized_options
(
VLM_TEST_SETTINGS
,
test_type
=
VLMTestType
.
AUDIO
,
create_new_process_for_each_test
=
False
,
))
def
test_audio_models
(
model_type
:
str
,
test_case
:
ExpandableVLMTestArgs
,
hf_runner
:
type
[
HfRunner
],
vllm_runner
:
type
[
VllmRunner
],
audio_assets
:
AudioTestAssets
,
monkeypatch
):
if
model_type
in
REQUIRES_V0_MODELS
:
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"0"
)
model_test_info
=
VLM_TEST_SETTINGS
[
model_type
]
runners
.
run_audio_test
(
model_test_info
=
model_test_info
,
test_case
=
test_case
,
hf_runner
=
hf_runner
,
vllm_runner
=
vllm_runner
,
audio_assets
=
audio_assets
,
)
@
pytest
.
mark
.
parametrize
(
"model_type,test_case"
,
get_parametrized_options
(
...
...
@@ -930,6 +963,29 @@ def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
)
@
pytest
.
mark
.
parametrize
(
"model_type,test_case"
,
get_parametrized_options
(
VLM_TEST_SETTINGS
,
test_type
=
VLMTestType
.
AUDIO
,
create_new_process_for_each_test
=
True
,
))
def
test_audio_models_heavy
(
model_type
:
str
,
test_case
:
ExpandableVLMTestArgs
,
hf_runner
:
type
[
HfRunner
],
vllm_runner
:
type
[
VllmRunner
],
audio_assets
:
AudioTestAssets
,
monkeypatch
):
if
model_type
in
REQUIRES_V0_MODELS
:
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"0"
)
model_test_info
=
VLM_TEST_SETTINGS
[
model_type
]
runners
.
run_audio_test
(
model_test_info
=
model_test_info
,
test_case
=
test_case
,
hf_runner
=
hf_runner
,
vllm_runner
=
vllm_runner
,
audio_assets
=
audio_assets
,
)
@
pytest
.
mark
.
parametrize
(
"model_type,test_case"
,
get_parametrized_options
(
...
...
tests/models/multimodal/generation/test_ultravox.py
View file @
390ec889
# SPDX-License-Identifier: Apache-2.0
import
json
from
typing
import
Any
,
Optional
from
typing
import
Any
import
numpy
as
np
import
pytest
import
pytest_asyncio
from
transformers
import
AutoModel
,
AutoTokenizer
from
transformers
import
AutoTokenizer
from
vllm.multimodal.audio
import
resample_audio_librosa
from
vllm.sequence
import
SampleLogprobs
from
....conftest
import
AUDIO_ASSETS
,
AudioTestAssets
,
HfRunner
,
VllmRunner
from
....conftest
import
AUDIO_ASSETS
,
AudioTestAssets
,
VllmRunner
from
....utils
import
RemoteOpenAIServer
from
...registry
import
HF_EXAMPLE_MODELS
from
...utils
import
check_logprobs_close
MODEL_NAME
=
"fixie-ai/ultravox-v0_5-llama-3_2-1b"
...
...
@@ -88,79 +84,6 @@ def _get_prompt(audio_count, question, placeholder):
add_generation_prompt
=
True
)
def
vllm_to_hf_output
(
vllm_output
:
tuple
[
list
[
int
],
str
,
Optional
[
SampleLogprobs
]],
model
:
str
):
"""Sanitize vllm output to be comparable with hf output."""
output_ids
,
output_str
,
out_logprobs
=
vllm_output
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model
)
eos_token_id
=
tokenizer
.
eos_token_id
hf_output_ids
=
output_ids
[:]
hf_output_str
=
output_str
if
hf_output_ids
[
-
1
]
==
eos_token_id
:
hf_output_str
=
hf_output_str
+
tokenizer
.
decode
(
eos_token_id
)
return
hf_output_ids
,
hf_output_str
,
out_logprobs
def
run_test
(
hf_runner
:
type
[
HfRunner
],
vllm_runner
:
type
[
VllmRunner
],
prompts_and_audios
:
list
[
tuple
[
str
,
str
,
AudioTuple
]],
model
:
str
,
*
,
dtype
:
str
,
max_tokens
:
int
,
num_logprobs
:
int
,
**
kwargs
,
):
"""Inference result should be the same between hf and vllm."""
model_info
=
HF_EXAMPLE_MODELS
.
find_hf_info
(
model
)
model_info
.
check_available_online
(
on_fail
=
"skip"
)
model_info
.
check_transformers_version
(
on_fail
=
"skip"
)
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
with
vllm_runner
(
model
,
dtype
=
dtype
,
enforce_eager
=
True
,
**
kwargs
)
as
vllm_model
:
vllm_outputs_per_audio
=
[
vllm_model
.
generate_greedy_logprobs
([
vllm_prompt
],
max_tokens
,
num_logprobs
=
num_logprobs
,
audios
=
[
audio
])
for
vllm_prompt
,
_
,
audio
in
prompts_and_audios
]
with
hf_runner
(
model
,
dtype
=
dtype
,
auto_cls
=
AutoModel
)
as
hf_model
:
hf_outputs_per_audio
=
[
hf_model
.
generate_greedy_logprobs_limit
(
[
hf_prompt
],
max_tokens
,
num_logprobs
=
num_logprobs
,
audios
=
[(
resample_audio_librosa
(
audio
[
0
],
orig_sr
=
audio
[
1
],
target_sr
=
16000
),
16000
)])
for
_
,
hf_prompt
,
audio
in
prompts_and_audios
]
for
hf_outputs
,
vllm_outputs
in
zip
(
hf_outputs_per_audio
,
vllm_outputs_per_audio
):
check_logprobs_close
(
outputs_0_lst
=
hf_outputs
,
outputs_1_lst
=
[
vllm_to_hf_output
(
vllm_output
,
model
)
for
vllm_output
in
vllm_outputs
],
name_0
=
"hf"
,
name_1
=
"vllm"
,
)
def
run_multi_audio_test
(
vllm_runner
:
type
[
VllmRunner
],
prompts_and_audios
:
list
[
tuple
[
str
,
list
[
AudioTuple
]]],
...
...
@@ -194,35 +117,6 @@ def run_multi_audio_test(
assert
all
(
tokens
for
tokens
,
*
_
in
vllm_outputs
)
@
pytest
.
mark
.
core_model
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"vllm_kwargs"
,
[
pytest
.
param
({},
marks
=
pytest
.
mark
.
cpu_model
),
pytest
.
param
(
CHUNKED_PREFILL_KWARGS
),
])
def
test_models
(
hf_runner
,
vllm_runner
,
audio_assets
:
AudioTestAssets
,
dtype
:
str
,
max_tokens
:
int
,
num_logprobs
:
int
,
vllm_kwargs
:
dict
)
->
None
:
audio_inputs
=
[(
_get_prompt
(
1
,
audio
,
VLLM_PLACEHOLDER
),
_get_prompt
(
1
,
audio
,
HF_PLACEHOLDER
),
audio
.
audio_and_sample_rate
,
)
for
audio
in
audio_assets
]
run_test
(
hf_runner
,
vllm_runner
,
audio_inputs
,
MODEL_NAME
,
dtype
=
dtype
,
max_tokens
=
max_tokens
,
num_logprobs
=
num_logprobs
,
**
vllm_kwargs
,
)
@
pytest
.
mark
.
core_model
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
...
...
tests/models/multimodal/generation/vlm_utils/builders.py
View file @
390ec889
...
...
@@ -7,17 +7,20 @@ from typing import Callable, Optional, Union
import
torch
from
vllm.multimodal.audio
import
AudioResampler
from
vllm.multimodal.image
import
rescale_image_size
from
vllm.multimodal.video
import
(
rescale_video_size
,
resize_video
,
sample_frames_from_video
)
from
.....conftest
import
ImageTestAssets
,
VideoTestAssets
from
.types
import
(
SINGLE_IMAGE_BASE_PROMPTS
,
TEST_IMG_PLACEHOLDER
,
from
.....conftest
import
AudioTestAssets
,
ImageTestAssets
,
VideoTestAssets
from
.types
import
(
SINGLE_AUDIO_BASE_PROMPT
,
SINGLE_IMAGE_BASE_PROMPTS
,
TEST_AUDIO_PLACEHOLDER
,
TEST_IMG_PLACEHOLDER
,
TEST_VIDEO_PLACEHOLDER
,
VIDEO_BASE_PROMPT
,
ImageSizeWrapper
,
SizeType
,
VLMTestInfo
)
ImageSizeWrapper
,
PromptWithMultiModalInput
,
SizeType
,
VLMTestInfo
)
def
replace_test_placeholder
(
prompt
:
str
,
img
_idx_to_prompt
:
Callable
[[
int
],
def
replace_test_placeholder
(
prompt
:
str
,
mm
_idx_to_prompt
:
Callable
[[
int
],
str
],
test_placeholder
:
str
)
->
str
:
"""Given a prompt, replaces each test placeholder with the
...
...
@@ -26,7 +29,7 @@ def replace_test_placeholder(prompt: str, img_idx_to_prompt: Callable[[int],
prompt_segments
=
prompt
.
split
(
test_placeholder
)
img_prompt
=
prompt_segments
[
0
]
for
placeholder_idx
,
next_seg
in
enumerate
(
prompt_segments
[
1
:],
start
=
1
):
img_prompt
+=
img
_idx_to_prompt
(
placeholder_idx
)
img_prompt
+=
mm
_idx_to_prompt
(
placeholder_idx
)
img_prompt
+=
next_seg
return
img_prompt
...
...
@@ -34,6 +37,7 @@ def replace_test_placeholder(prompt: str, img_idx_to_prompt: Callable[[int],
def
get_model_prompts
(
base_prompts
:
Iterable
[
str
],
img_idx_to_prompt
:
Optional
[
Callable
[[
int
],
str
]],
video_idx_to_prompt
:
Optional
[
Callable
[[
int
],
str
]],
audio_idx_to_prompt
:
Optional
[
Callable
[[
int
],
str
]],
prompt_formatter
:
Callable
[[
str
],
str
])
->
list
[
str
]:
"""Given a model-agnostic base prompt and test configuration for a model(s)
to be tested, update the media placeholders and apply the prompt formatting
...
...
@@ -60,6 +64,11 @@ def get_model_prompts(base_prompts: Iterable[str],
video_idx_to_prompt
,
TEST_VIDEO_PLACEHOLDER
)
if
audio_idx_to_prompt
:
base_prompt
=
replace_test_placeholder
(
base_prompt
,
audio_idx_to_prompt
,
TEST_AUDIO_PLACEHOLDER
)
# Apply the prompt formatter to wrap the base prompt with
# the correct media placeholders to get the model test prompt
model_prompt
=
prompt_formatter
(
base_prompt
)
...
...
@@ -71,7 +80,8 @@ def build_single_image_inputs_from_test_info(
test_info
:
VLMTestInfo
,
image_assets
:
ImageTestAssets
,
size_wrapper
:
ImageSizeWrapper
,
tmp_path
:
Optional
[
PosixPath
]
=
None
):
tmp_path
:
Optional
[
PosixPath
]
=
None
,
)
->
list
[
PromptWithMultiModalInput
]:
if
test_info
.
prompt_formatter
is
None
:
raise
ValueError
(
"Prompt formatter must be set to build single image inputs"
)
...
...
@@ -79,6 +89,7 @@ def build_single_image_inputs_from_test_info(
model_prompts
=
get_model_prompts
(
test_info
.
single_image_prompts
,
test_info
.
img_idx_to_prompt
,
test_info
.
video_idx_to_prompt
,
test_info
.
audio_idx_to_prompt
,
test_info
.
prompt_formatter
)
# For models that require a local path / URL encoded in the image; export
...
...
@@ -97,28 +108,32 @@ def build_single_image_inputs_from_test_info(
return
build_single_image_inputs
(
images
,
model_prompts
,
size_wrapper
)
def
build_single_image_inputs
(
images
,
model_prompts
,
size_wrapper
:
ImageSizeWrapper
):
def
build_single_image_inputs
(
images
,
model_prompts
,
size_wrapper
:
ImageSizeWrapper
)
->
list
[
PromptWithMultiModalInput
]:
# For every image / prompt pair, get a pair containing two lists of
# length size_factors, where the first contains duplicates of the model
# prompt [str], and the second contains copies of the image after being
# scaled by one of the size factors.
#
# NOTE: rescaling preserves the image aspect ratio.
return
[(
[
prompt
for
_
in
size_wrapper
.
data
],
[
return
[
PromptWithMultiModalInput
(
prompts
=
[
prompt
for
_
in
size_wrapper
.
data
],
image_data
=
[
apply_image_size_scaling
(
image
,
size
,
size_wrapper
.
type
)
for
size
in
size_wrapper
.
data
],
)
for
image
,
prompt
in
zip
(
images
,
model_prompts
)]
)
for
image
,
prompt
in
zip
(
images
,
model_prompts
)
]
def
build_multi_image_inputs_from_test_info
(
test_info
:
VLMTestInfo
,
image_assets
:
ImageTestAssets
,
size_wrapper
:
ImageSizeWrapper
,
tmp_path
:
Optional
[
PosixPath
]
=
None
):
tmp_path
:
Optional
[
PosixPath
]
=
None
,
)
->
list
[
PromptWithMultiModalInput
]:
if
test_info
.
prompt_formatter
is
None
:
raise
ValueError
(
"Prompt formatter must be set to build multi image inputs"
)
...
...
@@ -126,6 +141,7 @@ def build_multi_image_inputs_from_test_info(
model_prompts
=
get_model_prompts
([
test_info
.
multi_image_prompt
],
test_info
.
img_idx_to_prompt
,
test_info
.
video_idx_to_prompt
,
test_info
.
audio_idx_to_prompt
,
test_info
.
prompt_formatter
)
if
test_info
.
prompt_path_encoder
is
not
None
:
...
...
@@ -146,15 +162,18 @@ def build_multi_image_inputs_from_test_info(
)
def
build_multi_image_inputs
(
image_lists
,
model_prompts
,
size_wrapper
:
ImageSizeWrapper
):
return
[(
[
prompt
for
_
in
size_wrapper
.
data
],
[[
def
build_multi_image_inputs
(
image_lists
,
model_prompts
,
size_wrapper
:
ImageSizeWrapper
)
->
list
[
PromptWithMultiModalInput
]:
return
[
PromptWithMultiModalInput
(
prompts
=
[
prompt
for
_
in
size_wrapper
.
data
],
image_data
=
[[
apply_image_size_scaling
(
image
,
size
,
size_wrapper
.
type
)
for
image
in
images
]
for
size
in
size_wrapper
.
data
],
)
for
images
,
prompt
in
zip
(
image_lists
,
model_prompts
)]
)
for
images
,
prompt
in
zip
(
image_lists
,
model_prompts
)
]
def
build_embedding_inputs_from_test_info
(
...
...
@@ -177,6 +196,7 @@ def build_embedding_inputs_from_test_info(
SINGLE_IMAGE_BASE_PROMPTS
,
test_info
.
img_idx_to_prompt
,
test_info
.
video_idx_to_prompt
,
test_info
.
audio_idx_to_prompt
,
test_info
.
prompt_formatter
,
)
...
...
@@ -195,13 +215,14 @@ def build_video_inputs_from_test_info(
video_assets
:
VideoTestAssets
,
size_wrapper
:
ImageSizeWrapper
,
num_frames
:
int
,
):
)
->
list
[
PromptWithMultiModalInput
]
:
if
test_info
.
prompt_formatter
is
None
:
raise
ValueError
(
"Prompt formatter must be set to build video inputs"
)
model_prompts
=
get_model_prompts
(
[
VIDEO_BASE_PROMPT
],
test_info
.
img_idx_to_prompt
,
test_info
.
video_idx_to_prompt
,
test_info
.
audio_idx_to_prompt
,
test_info
.
prompt_formatter
,
)
...
...
@@ -213,10 +234,14 @@ def build_video_inputs_from_test_info(
video_scaler
=
(
resize_video
if
size_wrapper
.
type
==
SizeType
.
FIXED_SIZE
else
rescale_video_size
)
return
[(
[
prompt
for
_
in
size_wrapper
.
data
],
[
video_scaler
(
video
,
size
)
for
size
in
size_wrapper
.
data
],
)
for
video
,
prompt
in
zip
(
sampled_vids
,
model_prompts
)]
return
[
PromptWithMultiModalInput
(
prompts
=
[
prompt
for
_
in
size_wrapper
.
data
],
video_data
=
[
video_scaler
(
video
,
size
)
for
size
in
size_wrapper
.
data
],
)
for
video
,
prompt
in
zip
(
sampled_vids
,
model_prompts
)
]
def
apply_image_size_scaling
(
image
,
size
:
Union
[
float
,
tuple
[
int
,
int
]],
...
...
@@ -236,3 +261,37 @@ def apply_image_size_scaling(image, size: Union[float, tuple[int, int]],
# We have a list of fixed sizes
return
image
.
resize
(
size
)
raise
ValueError
(
"ImageSizeWrapper type must be FIXED_SIZE or SIZE_FACTOR"
)
def
build_audio_inputs_from_test_info
(
test_info
:
VLMTestInfo
,
audio_assets
:
AudioTestAssets
,
)
->
list
[
PromptWithMultiModalInput
]:
if
test_info
.
prompt_formatter
is
None
:
raise
ValueError
(
"Prompt formatter must be set to build audio inputs"
)
model_prompts
=
get_model_prompts
(
SINGLE_AUDIO_BASE_PROMPT
,
test_info
.
img_idx_to_prompt
,
test_info
.
video_idx_to_prompt
,
test_info
.
audio_idx_to_prompt
,
test_info
.
prompt_formatter
,
)
resampler
=
AudioResampler
(
target_sr
=
16000
,
method
=
"librosa"
,
)
audios
=
[
asset
.
audio_and_sample_rate
for
asset
in
audio_assets
]
resampled_audios
=
[(
resampler
.
resample
(
audio
,
orig_sr
=
sr
,
),
int
(
resampler
.
target_sr
),
)
for
audio
,
sr
in
audios
]
return
[
PromptWithMultiModalInput
(
prompts
=
model_prompts
,
audio_data
=
resampled_audios
,
)
]
tests/models/multimodal/generation/vlm_utils/case_filtering.py
View file @
390ec889
...
...
@@ -83,7 +83,7 @@ def get_parametrized_options(test_settings: dict[str, VLMTestInfo],
test_info
.
num_video_frames
)
# No sizes passed for custom inputs, since inputs are directly provided
if
test_type
!=
VLMTestType
.
CUSTOM_INPUTS
:
if
test_type
not
in
(
VLMTestType
.
CUSTOM_INPUTS
,
VLMTestType
.
AUDIO
)
:
wrapped_sizes
=
get_wrapped_test_sizes
(
test_info
,
test_type
)
if
wrapped_sizes
is
None
:
raise
ValueError
(
...
...
@@ -91,7 +91,7 @@ def get_parametrized_options(test_settings: dict[str, VLMTestInfo],
iter_kwargs
[
"size_wrapper"
]
=
wrapped_sizes
#Otherwise expand the custom test options instead
el
se
:
el
if
test_type
==
VLMTestType
.
CUSTOM_INPUTS
:
if
test_info
.
custom_test_opts
is
None
:
raise
ValueError
(
"Test has type CUSTOM_INPUTS, but none given"
)
iter_kwargs
[
"custom_test_opts"
]
=
test_info
.
custom_test_opts
...
...
@@ -136,8 +136,8 @@ def get_wrapped_test_sizes(
ImageSizeWrapper
(
type
=
SizeType
.
SIZE_FACTOR
,
data
=
factor
)
for
factor
in
EMBEDDING_SIZE_FACTORS
])
# Custom inputs have preprocessed inputs
elif
test_type
==
VLMTestType
.
CUSTOM_INPUTS
:
#
Audio and
Custom inputs have preprocessed inputs
elif
test_type
in
(
VLMTestType
.
AUDIO
,
VLMTestType
.
CUSTOM_INPUTS
)
:
return
tuple
()
size_factors
=
test_info
.
image_size_factors
\
...
...
tests/models/multimodal/generation/vlm_utils/core.py
View file @
390ec889
# SPDX-License-Identifier: Apache-2.0
"""Core test implementation to be shared across modalities."""
from
typing
import
Any
,
Callable
,
Optional
,
Union
from
typing
import
Any
,
Callable
,
Optional
import
torch
from
PIL.Image
import
Image
from
transformers.models.auto.auto_factory
import
_BaseAutoModelClass
from
vllm.config
import
TaskOption
...
...
@@ -11,14 +10,14 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer
from
.....conftest
import
HfRunner
,
VllmRunner
from
....registry
import
HF_EXAMPLE_MODELS
from
.types
import
RunnerOutput
from
.types
import
PromptWithMultiModalInput
,
RunnerOutput
def
run_test
(
*
,
hf_runner
:
type
[
HfRunner
],
vllm_runner
:
type
[
VllmRunner
],
inputs
:
list
[
tuple
[
list
[
str
],
list
[
Union
[
list
[
Image
],
Image
]]]
],
inputs
:
list
[
PromptWithMultiModalInput
],
model
:
str
,
dtype
:
str
,
max_tokens
:
int
,
...
...
@@ -38,7 +37,6 @@ def run_test(
hf_model_kwargs
:
Optional
[
dict
[
str
,
Any
]],
patch_hf_runner
:
Optional
[
Callable
[[
HfRunner
],
HfRunner
]],
task
:
TaskOption
=
"auto"
,
runner_mm_key
:
str
=
"images"
,
distributed_executor_backend
:
Optional
[
str
]
=
None
,
tensor_parallel_size
:
int
=
1
,
vllm_embeddings
:
Optional
[
torch
.
Tensor
]
=
None
,
...
...
@@ -94,10 +92,16 @@ def run_test(
if
stop_str
:
vllm_kwargs
[
"stop"
]
=
stop_str
for
prompts
,
media
in
vllm_inputs
:
vllm_kwargs
[
runner_mm_key
]
=
media
for
prompts
,
image_data
,
video_data
,
audio_data
in
vllm_inputs
:
mm_data
=
dict
(
images
=
image_data
,
videos
=
video_data
,
audios
=
audio_data
)
vllm_kwargs_with_mm_data
=
vllm_kwargs
|
mm_data
vllm_output
=
vllm_model
.
generate_greedy_logprobs
(
prompts
,
max_tokens
,
num_logprobs
=
num_logprobs
,
**
vllm_kwargs
)
prompts
,
max_tokens
,
num_logprobs
=
num_logprobs
,
**
vllm_kwargs_with_mm_data
)
vllm_outputs_per_mm
.
append
(
vllm_output
)
hf_model
=
hf_runner
(
model
,
...
...
@@ -122,14 +126,17 @@ def run_test(
if
stop_str
:
hf_kwargs
[
"stop_strings"
]
=
stop_str
for
prompts
,
media
in
inputs
:
hf_kwargs
[
runner_mm_key
]
=
media
for
prompts
,
image_data
,
video_data
,
audio_data
in
inputs
:
mm_data
=
dict
(
images
=
image_data
,
videos
=
video_data
,
audios
=
audio_data
)
hf_kwargs_with_mm_data
=
hf_kwargs
|
mm_data
hf_output
=
hf_model
.
generate_greedy_logprobs_limit
(
prompts
,
max_tokens
,
num_logprobs
=
num_logprobs
,
tokenizer
=
tokenizer
,
**
hf_kwargs
)
**
hf_kwargs
_with_mm_data
)
hf_outputs_per_mm
.
append
(
hf_output
)
# Apply output processing / sanitation to the vLLM and HF runner results
...
...
tests/models/multimodal/generation/vlm_utils/custom_inputs.py
View file @
390ec889
...
...
@@ -12,7 +12,7 @@ from vllm.multimodal.video import (rescale_video_size, resize_video,
from
.....conftest
import
IMAGE_ASSETS
,
VIDEO_ASSETS
from
.builders
import
build_multi_image_inputs
,
build_single_image_inputs
from
.types
import
ImageSizeWrapper
,
SizeType
from
.types
import
ImageSizeWrapper
,
PromptWithMultiModalInput
,
SizeType
def
multi_image_multi_aspect_ratio_inputs
(
formatter
:
Callable
[[
str
],
str
]):
...
...
@@ -32,10 +32,7 @@ def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
"<image>
\n
What is the season?"
,
]
formatted_prompts
=
[
formatter
(
prompt
)
for
prompt
in
img_prompts
]
return
[(
formatted_prompts
,
[
aspect_ratio_images
=
[
[
stop_sign
,
cherry_blossom
],
# Images with different sizes and aspect-ratios
[
...
...
@@ -49,7 +46,14 @@ def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
cherry_blossom
.
resize
((
488
,
183
))
],
cherry_blossom
,
])]
]
return
[
PromptWithMultiModalInput
(
prompts
=
formatted_prompts
,
image_data
=
aspect_ratio_images
,
)
]
def
multi_video_multi_aspect_ratio_inputs
(
formatter
:
Callable
[[
str
],
str
],
...
...
@@ -68,10 +72,7 @@ def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str],
"<video>
\n
Why is this video funny?"
,
]
formatted_prompts
=
[
formatter
(
prompt
)
for
prompt
in
video_prompts
]
return
[(
formatted_prompts
,
[
aspect_ratio_videos
=
[
[
video
,
video
],
# Videos with different sizes and aspect-ratios
[
...
...
@@ -85,7 +86,14 @@ def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str],
resize_video
(
video
,
(
488
,
183
))
],
video
,
])]
]
return
[
PromptWithMultiModalInput
(
prompts
=
formatted_prompts
,
video_data
=
aspect_ratio_videos
,
)
]
def
different_patch_input_cases_internvl
():
...
...
tests/models/multimodal/generation/vlm_utils/model_utils.py
View file @
390ec889
...
...
@@ -237,6 +237,18 @@ def minimax_vl_01_hf_output(hf_output: RunnerOutput,
return
output_ids
,
output_str
,
out_logprobs
def
ultravox_trunc_hf_output
(
hf_output
:
RunnerOutput
,
model
:
str
)
->
RunnerOutput
:
output_ids
,
output_str
,
out_logprobs
=
hf_output
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model
)
eos_token_id
=
tokenizer
.
eos_token_id
eos_token
=
tokenizer
.
decode
(
eos_token_id
)
if
output_str
.
endswith
(
eos_token
):
output_str
=
output_str
.
split
(
eos_token
)[
0
]
return
output_ids
,
output_str
,
out_logprobs
####### Functions for converting image assets to embeddings
def
get_llava_embeddings
(
image_assets
:
ImageTestAssets
):
return
[
asset
.
image_embeds
for
asset
in
image_assets
]
...
...
tests/models/multimodal/generation/vlm_utils/runners.py
View file @
390ec889
...
...
@@ -4,8 +4,8 @@ types / modalities.
"""
from
pathlib
import
PosixPath
from
.....conftest
import
(
HfRunner
,
ImageTestAssets
,
VideoTestAssets
,
VllmRunner
)
from
.....conftest
import
(
AudioTestAssets
,
HfRunner
,
ImageTestAssets
,
VideoTestAssets
,
VllmRunner
)
from
.
import
builders
,
core
from
.types
import
ExpandableVLMTestArgs
,
VLMTestInfo
...
...
@@ -30,7 +30,6 @@ def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
num_logprobs
=
test_case
.
num_logprobs
,
limit_mm_per_prompt
=
{
"image"
:
1
},
distributed_executor_backend
=
test_case
.
distributed_executor_backend
,
runner_mm_key
=
"images"
,
**
model_test_info
.
get_non_parametrized_runner_kwargs
())
...
...
@@ -53,7 +52,6 @@ def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
num_logprobs
=
test_case
.
num_logprobs
,
limit_mm_per_prompt
=
{
"image"
:
len
(
image_assets
)},
distributed_executor_backend
=
test_case
.
distributed_executor_backend
,
runner_mm_key
=
"images"
,
**
model_test_info
.
get_non_parametrized_runner_kwargs
())
...
...
@@ -77,7 +75,6 @@ def run_embedding_test(*, model_test_info: VLMTestInfo,
limit_mm_per_prompt
=
{
"image"
:
1
},
vllm_embeddings
=
vllm_embeddings
,
distributed_executor_backend
=
test_case
.
distributed_executor_backend
,
runner_mm_key
=
"images"
,
**
model_test_info
.
get_non_parametrized_runner_kwargs
())
...
...
@@ -105,7 +102,30 @@ def run_video_test(
num_logprobs
=
test_case
.
num_logprobs
,
limit_mm_per_prompt
=
{
"video"
:
len
(
video_assets
)},
distributed_executor_backend
=
test_case
.
distributed_executor_backend
,
runner_mm_key
=
"videos"
,
**
model_test_info
.
get_non_parametrized_runner_kwargs
())
def
run_audio_test
(
*
,
model_test_info
:
VLMTestInfo
,
test_case
:
ExpandableVLMTestArgs
,
hf_runner
:
type
[
HfRunner
],
vllm_runner
:
type
[
VllmRunner
],
audio_assets
:
AudioTestAssets
,
):
inputs
=
builders
.
build_audio_inputs_from_test_info
(
model_test_info
,
audio_assets
)
core
.
run_test
(
hf_runner
=
hf_runner
,
vllm_runner
=
vllm_runner
,
inputs
=
inputs
,
model
=
test_case
.
model
,
dtype
=
test_case
.
dtype
,
max_tokens
=
test_case
.
max_tokens
,
num_logprobs
=
test_case
.
num_logprobs
,
limit_mm_per_prompt
=
{
"audio"
:
1
},
distributed_executor_backend
=
test_case
.
distributed_executor_backend
,
**
model_test_info
.
get_non_parametrized_runner_kwargs
())
...
...
@@ -120,11 +140,9 @@ def run_custom_inputs_test(*, model_test_info: VLMTestInfo,
inputs
=
test_case
.
custom_test_opts
.
inputs
limit_mm_per_prompt
=
test_case
.
custom_test_opts
.
limit_mm_per_prompt
runner_mm_key
=
test_case
.
custom_test_opts
.
runner_mm_key
# Inputs, limit_mm_per_prompt, and runner_mm_key should all be set
# Inputs and limit_mm_per_prompt should all be set
assert
inputs
is
not
None
assert
limit_mm_per_prompt
is
not
None
assert
runner_mm_key
is
not
None
core
.
run_test
(
hf_runner
=
hf_runner
,
...
...
@@ -136,5 +154,4 @@ def run_custom_inputs_test(*, model_test_info: VLMTestInfo,
num_logprobs
=
test_case
.
num_logprobs
,
limit_mm_per_prompt
=
limit_mm_per_prompt
,
distributed_executor_backend
=
test_case
.
distributed_executor_backend
,
runner_mm_key
=
runner_mm_key
,
**
model_test_info
.
get_non_parametrized_runner_kwargs
())
tests/models/multimodal/generation/vlm_utils/types.py
View file @
390ec889
...
...
@@ -6,7 +6,6 @@ from pathlib import PosixPath
from
typing
import
Any
,
Callable
,
NamedTuple
,
Optional
,
Union
import
torch
from
PIL.Image
import
Image
from
pytest
import
MarkDecorator
from
transformers
import
AutoModelForCausalLM
from
transformers.models.auto.auto_factory
import
_BaseAutoModelClass
...
...
@@ -15,18 +14,25 @@ from vllm.config import TaskOption
from
vllm.sequence
import
SampleLogprobs
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
from
.....conftest
import
IMAGE_ASSETS
,
HfRunner
,
ImageAsset
,
ImageTestAssets
from
.....conftest
import
(
AUDIO_ASSETS
,
IMAGE_ASSETS
,
HfRunner
,
ImageAsset
,
ImageTestAssets
,
PromptAudioInput
,
PromptImageInput
,
PromptVideoInput
)
from
....utils
import
check_logprobs_close
# meta image tag; will be replaced by the appropriate tag for the model
TEST_IMG_PLACEHOLDER
=
"<vlm_image>"
TEST_VIDEO_PLACEHOLDER
=
"<vlm_video>"
TEST_AUDIO_PLACEHOLDER
=
"<lmm_audio>"
# yapf: disable
SINGLE_IMAGE_BASE_PROMPTS
=
IMAGE_ASSETS
.
prompts
({
"stop_sign"
:
f
"
{
TEST_IMG_PLACEHOLDER
}
What's the content of the image?"
,
"cherry_blossom"
:
f
"
{
TEST_IMG_PLACEHOLDER
}
What is the season?"
,
})
SINGLE_AUDIO_BASE_PROMPT
=
AUDIO_ASSETS
.
prompts
({
"mary_had_lamb"
:
f
"
{
TEST_AUDIO_PLACEHOLDER
}
Transcribe this audio into English."
,
# noqa: E501
"winning_call"
:
f
"
{
TEST_AUDIO_PLACEHOLDER
}
What is happening in this audio clip?"
,
# noqa: E501
})
MULTI_IMAGE_BASE_PROMPT
=
f
"Image-1:
{
TEST_IMG_PLACEHOLDER
}
Image-2:
{
TEST_IMG_PLACEHOLDER
}
Describe the two images in detail.
\n
"
# noqa: E501
VIDEO_BASE_PROMPT
=
f
"
{
TEST_VIDEO_PLACEHOLDER
}
Why is this video funny?"
...
...
@@ -38,12 +44,21 @@ RunnerOutput = tuple[list[int], str, Optional[SampleLogprobs]]
# yapf: enable
class
PromptWithMultiModalInput
(
NamedTuple
):
"""Holds the multimodal input for a single test case."""
prompts
:
list
[
str
]
image_data
:
Optional
[
PromptImageInput
]
=
None
video_data
:
Optional
[
PromptVideoInput
]
=
None
audio_data
:
Optional
[
PromptAudioInput
]
=
None
class
VLMTestType
(
Enum
):
IMAGE
=
1
MULTI_IMAGE
=
2
EMBEDDING
=
3
VIDEO
=
4
CUSTOM_INPUTS
=
5
AUDIO
=
5
CUSTOM_INPUTS
=
6
class
SizeType
(
Enum
):
...
...
@@ -52,10 +67,8 @@ class SizeType(Enum):
class
CustomTestOptions
(
NamedTuple
):
inputs
:
list
[
tuple
[
list
[
str
],
list
[
Union
[
list
[
Image
],
Image
]]]
]
inputs
:
list
[
PromptWithMultiModalInput
]
limit_mm_per_prompt
:
dict
[
str
,
int
]
# kwarg to pass multimodal data in as to vllm/hf runner instances.
runner_mm_key
:
str
=
"images"
class
ImageSizeWrapper
(
NamedTuple
):
...
...
@@ -75,6 +88,7 @@ class VLMTestInfo(NamedTuple):
prompt_formatter
:
Optional
[
Callable
[[
str
],
str
]]
=
None
img_idx_to_prompt
:
Callable
[[
int
],
str
]
=
lambda
idx
:
"<image>
\n
"
video_idx_to_prompt
:
Callable
[[
int
],
str
]
=
lambda
idx
:
"<video>
\n
"
audio_idx_to_prompt
:
Callable
[[
int
],
str
]
=
lambda
idx
:
"<audio>
\n
"
# Most models work on the single / multi-image prompts above, but in some
# cases the log prob check fails, e.g., for paligemma. We allow passing
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment