Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
afd0da21
Commit
afd0da21
authored
Feb 03, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.7.1' into v0.7.1-dev
parents
1a11f127
4f4d427a
Changes
587
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1389 additions
and
269 deletions
+1389
-269
tests/models/decoder_only/language/test_jamba.py
tests/models/decoder_only/language/test_jamba.py
+5
-2
tests/models/decoder_only/language/test_mamba.py
tests/models/decoder_only/language/test_mamba.py
+5
-2
tests/models/decoder_only/language/test_models.py
tests/models/decoder_only/language/test_models.py
+9
-2
tests/models/decoder_only/vision_language/test_models.py
tests/models/decoder_only/vision_language/test_models.py
+127
-77
tests/models/decoder_only/vision_language/test_pixtral.py
tests/models/decoder_only/vision_language/test_pixtral.py
+8
-9
tests/models/decoder_only/vision_language/test_qwen2_vl.py
tests/models/decoder_only/vision_language/test_qwen2_vl.py
+34
-160
tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
...els/decoder_only/vision_language/vlm_utils/model_utils.py
+143
-3
tests/models/embedding/language/test_cls_models.py
tests/models/embedding/language/test_cls_models.py
+5
-2
tests/models/embedding/language/test_embedding.py
tests/models/embedding/language/test_embedding.py
+12
-6
tests/models/embedding/language/test_scoring.py
tests/models/embedding/language/test_scoring.py
+100
-0
tests/models/encoder_decoder/audio_language/__init__.py
tests/models/encoder_decoder/audio_language/__init__.py
+0
-0
tests/models/encoder_decoder/audio_language/test_whisper.py
tests/models/encoder_decoder/audio_language/test_whisper.py
+136
-0
tests/models/encoder_decoder/vision_language/test_mllama.py
tests/models/encoder_decoder/vision_language/test_mllama.py
+208
-0
tests/models/multimodal/__init__.py
tests/models/multimodal/__init__.py
+0
-0
tests/models/multimodal/processing/__init__.py
tests/models/multimodal/processing/__init__.py
+0
-0
tests/models/multimodal/processing/test_common.py
tests/models/multimodal/processing/test_common.py
+204
-0
tests/models/multimodal/processing/test_idefics3.py
tests/models/multimodal/processing/test_idefics3.py
+3
-3
tests/models/multimodal/processing/test_internvl.py
tests/models/multimodal/processing/test_internvl.py
+3
-3
tests/models/multimodal/processing/test_llava_next.py
tests/models/multimodal/processing/test_llava_next.py
+193
-0
tests/models/multimodal/processing/test_llava_onevision.py
tests/models/multimodal/processing/test_llava_onevision.py
+194
-0
No files found.
Too many changes to show.
To preserve performance only
587 of 587+
files are displayed.
Plain diff
Email patch
tests/models/decoder_only/language/test_jamba.py
View file @
afd0da21
...
@@ -35,10 +35,13 @@ def test_models(
...
@@ -35,10 +35,13 @@ def test_models(
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
# This test is for verifying whether the model's extra_repr
# This test is for verifying whether the model's extra_repr
# can be printed correctly.
# can be printed correctly.
print
(
vllm_model
.
model
.
llm_engine
.
model_executor
.
driver_worker
.
def
print_model
(
model
):
model_runner
.
model
)
print
(
model
)
vllm_model
.
apply_model
(
print_model
)
for
i
in
range
(
len
(
example_prompts
)):
for
i
in
range
(
len
(
example_prompts
)):
hf_output_ids
,
hf_output_str
=
hf_outputs
[
i
]
hf_output_ids
,
hf_output_str
=
hf_outputs
[
i
]
...
...
tests/models/decoder_only/language/test_mamba.py
View file @
afd0da21
...
@@ -53,10 +53,13 @@ def test_models(
...
@@ -53,10 +53,13 @@ def test_models(
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
# This test is for verifying whether the model's extra_repr
# This test is for verifying whether the model's extra_repr
# can be printed correctly.
# can be printed correctly.
print
(
vllm_model
.
model
.
llm_engine
.
model_executor
.
driver_worker
.
def
print_model
(
model
):
model_runner
.
model
)
print
(
model
)
vllm_model
.
apply_model
(
print_model
)
for
i
in
range
(
len
(
example_prompts
)):
for
i
in
range
(
len
(
example_prompts
)):
hf_output_ids
,
hf_output_str
=
hf_outputs
[
i
]
hf_output_ids
,
hf_output_str
=
hf_outputs
[
i
]
...
...
tests/models/decoder_only/language/test_models.py
View file @
afd0da21
...
@@ -50,6 +50,10 @@ from ....utils import models_path_prefix
...
@@ -50,6 +50,10 @@ from ....utils import models_path_prefix
),
),
pytest
.
param
(
os
.
path
.
join
(
models_path_prefix
,
"stabilityai/stablelm-3b-4e1t"
)),
# stablelm
pytest
.
param
(
os
.
path
.
join
(
models_path_prefix
,
"stabilityai/stablelm-3b-4e1t"
)),
# stablelm
pytest
.
param
(
os
.
path
.
join
(
models_path_prefix
,
"bigcode/starcoder2-3b"
)),
# starcoder2
pytest
.
param
(
os
.
path
.
join
(
models_path_prefix
,
"bigcode/starcoder2-3b"
)),
# starcoder2
pytest
.
param
(
os
.
path
.
join
(
models_path_prefix
,
"ehristoforu/Falcon3-MoE-2x7B-Insruct"
),
# mixtral
marks
=
[
pytest
.
mark
.
cpu_model
],
)
])
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
32
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
32
])
...
@@ -71,10 +75,13 @@ def test_models(
...
@@ -71,10 +75,13 @@ def test_models(
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy_logprobs
(
vllm_outputs
=
vllm_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
num_logprobs
)
example_prompts
,
max_tokens
,
num_logprobs
)
# This test is for verifying whether the model's extra_repr
# This test is for verifying whether the model's extra_repr
# can be printed correctly.
# can be printed correctly.
print
(
vllm_model
.
model
.
llm_engine
.
model_executor
.
driver_worker
.
def
print_model
(
model
):
model_runner
.
model
)
print
(
model
)
vllm_model
.
apply_model
(
print_model
)
check_logprobs_close
(
check_logprobs_close
(
outputs_0_lst
=
hf_outputs
,
outputs_0_lst
=
hf_outputs
,
...
...
tests/models/decoder_only/vision_language/test_models.py
View file @
afd0da21
...
@@ -10,7 +10,7 @@ from typing import Type
...
@@ -10,7 +10,7 @@ from typing import Type
import
os
import
os
import
pytest
import
pytest
from
transformers
import
AutoModelForVision2Seq
from
transformers
import
AutoModelForVision2Seq
from
transformers
.utils
import
is_flash_attn_2_available
from
transformers
import
__version__
as
TRANSFORMERS_VERSION
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils
import
identity
from
vllm.utils
import
identity
...
@@ -141,12 +141,7 @@ VLM_TEST_SETTINGS = {
...
@@ -141,12 +141,7 @@ VLM_TEST_SETTINGS = {
#### Extended model tests
#### Extended model tests
"aria"
:
VLMTestInfo
(
"aria"
:
VLMTestInfo
(
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"rhymes-ai/Aria"
)],
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"rhymes-ai/Aria"
)],
tokenizer_mode
=
"slow"
,
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
MULTI_IMAGE
),
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
MULTI_IMAGE
,
),
dtype
=
"bfloat16"
,
prompt_formatter
=
lambda
img_prompt
:
f
"<|im_start|>user
\n
{
img_prompt
}
<|im_end|>
\n
<|im_start|>assistant
\n
"
,
# noqa: E501
prompt_formatter
=
lambda
img_prompt
:
f
"<|im_start|>user
\n
{
img_prompt
}
<|im_end|>
\n
<|im_start|>assistant
\n
"
,
# noqa: E501
img_idx_to_prompt
=
lambda
idx
:
"<fim_prefix><|img|><fim_suffix>
\n
"
,
img_idx_to_prompt
=
lambda
idx
:
"<fim_prefix><|img|><fim_suffix>
\n
"
,
max_model_len
=
4096
,
max_model_len
=
4096
,
...
@@ -162,8 +157,8 @@ VLM_TEST_SETTINGS = {
...
@@ -162,8 +157,8 @@ VLM_TEST_SETTINGS = {
max_tokens
=
64
,
max_tokens
=
64
,
marks
=
[
marks
=
[
pytest
.
mark
.
skipif
(
pytest
.
mark
.
skipif
(
not
is_flash_attn_2_available
()
,
TRANSFORMERS_VERSION
<
"4.48.0"
,
reason
=
"
M
odel
needs flash-attn for numeric convergence.
"
,
reason
=
"
HF m
odel
requires transformers>=4.48.0
"
,
),
),
large_gpu_mark
(
min_gb
=
64
),
large_gpu_mark
(
min_gb
=
64
),
],
],
...
@@ -181,6 +176,7 @@ VLM_TEST_SETTINGS = {
...
@@ -181,6 +176,7 @@ VLM_TEST_SETTINGS = {
test_type
=
VLMTestType
.
IMAGE
,
test_type
=
VLMTestType
.
IMAGE
,
prompt_formatter
=
lambda
img_prompt
:
f
"USER:
{
img_prompt
}
\n
ASSISTANT:"
,
prompt_formatter
=
lambda
img_prompt
:
f
"USER:
{
img_prompt
}
\n
ASSISTANT:"
,
max_model_len
=
4096
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
auto_cls
=
AutoModelForVision2Seq
,
auto_cls
=
AutoModelForVision2Seq
,
postprocess_inputs
=
model_utils
.
cast_dtype_post_processor
(
postprocess_inputs
=
model_utils
.
cast_dtype_post_processor
(
"pixel_values"
"pixel_values"
...
@@ -192,6 +188,30 @@ VLM_TEST_SETTINGS = {
...
@@ -192,6 +188,30 @@ VLM_TEST_SETTINGS = {
max_tokens
=
8
,
max_tokens
=
8
,
dtype
=
"bfloat16"
,
dtype
=
"bfloat16"
,
),
),
"deepseek_vl_v2"
:
VLMTestInfo
(
models
=
[
"Isotr0py/deepseek-vl2-tiny"
],
# model repo using dynamic module
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
MULTI_IMAGE
),
prompt_formatter
=
lambda
img_prompt
:
f
"<|User|>:
{
img_prompt
}
\n\n
<|Assistant|>: "
,
# noqa: E501
max_model_len
=
4096
,
max_num_seqs
=
2
,
single_image_prompts
=
IMAGE_ASSETS
.
prompts
({
"stop_sign"
:
"<image>
\n
What's the content in the center of the image?"
,
# noqa: E501
"cherry_blossom"
:
"<image>
\n
Please infer the season with reason in details."
,
# noqa: E501
}),
multi_image_prompt
=
"image_1:<image>
\n
image_2:<image>
\n
Which image can we see the car and the tower?"
,
# noqa: E501
vllm_runner_kwargs
=
{
"hf_overrides"
:
{
"architectures"
:
[
"DeepseekVLV2ForCausalLM"
]}},
# noqa: E501
patch_hf_runner
=
model_utils
.
deepseekvl2_patch_hf_runner
,
postprocess_inputs
=
model_utils
.
cast_dtype_post_processor
(
"images"
),
hf_output_post_proc
=
model_utils
.
deepseekvl2_trunc_hf_output
,
stop_str
=
[
"<|end▁of▁sentence|>"
,
"<|begin▁of▁sentence|>"
],
# noqa: E501
image_size_factors
=
[(),
(
1.0
,
),
(
1.0
,
1.0
,
1.0
),
(
0.1
,
0.5
,
1.0
)],
marks
=
[
pytest
.
mark
.
skipif
(
TRANSFORMERS_VERSION
>=
"4.48.0"
,
reason
=
"HF model is not compatible with transformers>=4.48.0"
,
)
],
),
"fuyu"
:
VLMTestInfo
(
"fuyu"
:
VLMTestInfo
(
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"adept/fuyu-8b"
)],
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"adept/fuyu-8b"
)],
test_type
=
VLMTestType
.
IMAGE
,
test_type
=
VLMTestType
.
IMAGE
,
...
@@ -214,7 +234,7 @@ VLM_TEST_SETTINGS = {
...
@@ -214,7 +234,7 @@ VLM_TEST_SETTINGS = {
dtype
=
"bfloat16"
,
dtype
=
"bfloat16"
,
get_stop_token_ids
=
lambda
tok
:
[
151329
,
151336
,
151338
],
get_stop_token_ids
=
lambda
tok
:
[
151329
,
151336
,
151338
],
patch_hf_runner
=
model_utils
.
glm_patch_hf_runner
,
patch_hf_runner
=
model_utils
.
glm_patch_hf_runner
,
marks
=
[
large_gpu_mark
(
min_gb
=
48
)],
marks
=
[
large_gpu_mark
(
min_gb
=
32
)],
),
),
"h2ovl"
:
VLMTestInfo
(
"h2ovl"
:
VLMTestInfo
(
models
=
[
models
=
[
...
@@ -263,6 +283,7 @@ VLM_TEST_SETTINGS = {
...
@@ -263,6 +283,7 @@ VLM_TEST_SETTINGS = {
dtype
=
"bfloat16"
,
dtype
=
"bfloat16"
,
use_tokenizer_eos
=
True
,
use_tokenizer_eos
=
True
,
patch_hf_runner
=
model_utils
.
internvl_patch_hf_runner
,
patch_hf_runner
=
model_utils
.
internvl_patch_hf_runner
,
marks
=
[
large_gpu_mark
(
min_gb
=
32
)],
),
),
"llava_next"
:
VLMTestInfo
(
"llava_next"
:
VLMTestInfo
(
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/llava-v1.6-mistral-7b-hf"
)],
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/llava-v1.6-mistral-7b-hf"
)],
...
@@ -277,10 +298,8 @@ VLM_TEST_SETTINGS = {
...
@@ -277,10 +298,8 @@ VLM_TEST_SETTINGS = {
),
),
limit_mm_per_prompt
=
{
"image"
:
4
},
limit_mm_per_prompt
=
{
"image"
:
4
},
)],
)],
# Llava-next tests fixed sizes & the default size factors
image_sizes
=
[((
1669
,
2560
),
(
2560
,
1669
),
(
183
,
488
),
(
488
,
183
))],
),
),
"llava_one
_
vision"
:
VLMTestInfo
(
"llava_onevision"
:
VLMTestInfo
(
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
)],
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
)],
test_type
=
VLMTestType
.
CUSTOM_INPUTS
,
test_type
=
VLMTestType
.
CUSTOM_INPUTS
,
prompt_formatter
=
lambda
vid_prompt
:
f
"<|im_start|>user
\n
{
vid_prompt
}
<|im_end|>
\n
<|im_start|>assistant
\n
"
,
# noqa: E501
prompt_formatter
=
lambda
vid_prompt
:
f
"<|im_start|>user
\n
{
vid_prompt
}
<|im_end|>
\n
<|im_start|>assistant
\n
"
,
# noqa: E501
...
@@ -291,8 +310,6 @@ VLM_TEST_SETTINGS = {
...
@@ -291,8 +310,6 @@ VLM_TEST_SETTINGS = {
),
),
auto_cls
=
AutoModelForVision2Seq
,
auto_cls
=
AutoModelForVision2Seq
,
vllm_output_post_proc
=
model_utils
.
llava_onevision_vllm_to_hf_output
,
vllm_output_post_proc
=
model_utils
.
llava_onevision_vllm_to_hf_output
,
# Llava-one-vision tests fixed sizes & the default size factors
image_sizes
=
[((
1669
,
2560
),
(
2560
,
1669
),
(
183
,
488
),
(
488
,
183
))],
custom_test_opts
=
[
CustomTestOptions
(
custom_test_opts
=
[
CustomTestOptions
(
inputs
=
custom_inputs
.
multi_video_multi_aspect_ratio_inputs
(
inputs
=
custom_inputs
.
multi_video_multi_aspect_ratio_inputs
(
formatter
=
lambda
vid_prompt
:
f
"<|im_start|>user
\n
{
vid_prompt
}
<|im_end|>
\n
<|im_start|>assistant
\n
"
,
# noqa: E501
formatter
=
lambda
vid_prompt
:
f
"<|im_start|>user
\n
{
vid_prompt
}
<|im_end|>
\n
<|im_start|>assistant
\n
"
,
# noqa: E501
...
@@ -309,7 +326,6 @@ VLM_TEST_SETTINGS = {
...
@@ -309,7 +326,6 @@ VLM_TEST_SETTINGS = {
max_model_len
=
4096
,
max_model_len
=
4096
,
auto_cls
=
AutoModelForVision2Seq
,
auto_cls
=
AutoModelForVision2Seq
,
vllm_output_post_proc
=
model_utils
.
llava_video_vllm_to_hf_output
,
vllm_output_post_proc
=
model_utils
.
llava_video_vllm_to_hf_output
,
image_sizes
=
[((
1669
,
2560
),
(
2560
,
1669
),
(
183
,
488
),
(
488
,
183
))],
),
),
"mantis"
:
VLMTestInfo
(
"mantis"
:
VLMTestInfo
(
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"TIGER-Lab/Mantis-8B-siglip-llama3"
)],
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"TIGER-Lab/Mantis-8B-siglip-llama3"
)],
...
@@ -336,6 +352,20 @@ VLM_TEST_SETTINGS = {
...
@@ -336,6 +352,20 @@ VLM_TEST_SETTINGS = {
postprocess_inputs
=
model_utils
.
wrap_inputs_post_processor
,
postprocess_inputs
=
model_utils
.
wrap_inputs_post_processor
,
hf_output_post_proc
=
model_utils
.
minicpmv_trunc_hf_output
,
hf_output_post_proc
=
model_utils
.
minicpmv_trunc_hf_output
,
),
),
"minicpmo_26"
:
VLMTestInfo
(
models
=
[
"openbmb/MiniCPM-o-2_6"
],
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
MULTI_IMAGE
),
prompt_formatter
=
lambda
img_prompt
:
f
"<|begin_of_text|><|start_header_id|>user<|end_header_id|>
\n\n
{
img_prompt
}
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
\n\n
"
,
# noqa: E501
img_idx_to_prompt
=
lambda
idx
:
"(<image>./</image>)
\n
"
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
get_stop_token_ids
=
lambda
tok
:
tok
.
convert_tokens_to_ids
([
'<|im_end|>'
,
'<|endoftext|>'
]),
# noqa: E501
postprocess_inputs
=
model_utils
.
ignore_inputs_post_processor
(
"image_sizes"
),
hf_output_post_proc
=
model_utils
.
minicpmv_trunc_hf_output
,
patch_hf_runner
=
model_utils
.
minicpmo_patch_hf_runner
),
"minicpmv_26"
:
VLMTestInfo
(
"minicpmv_26"
:
VLMTestInfo
(
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"openbmb/MiniCPM-V-2_6"
)],
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"openbmb/MiniCPM-V-2_6"
)],
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
MULTI_IMAGE
),
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
MULTI_IMAGE
),
...
@@ -349,6 +379,16 @@ VLM_TEST_SETTINGS = {
...
@@ -349,6 +379,16 @@ VLM_TEST_SETTINGS = {
),
),
hf_output_post_proc
=
model_utils
.
minicpmv_trunc_hf_output
,
hf_output_post_proc
=
model_utils
.
minicpmv_trunc_hf_output
,
),
),
"molmo"
:
VLMTestInfo
(
models
=
[
"allenai/Molmo-7B-D-0924"
],
test_type
=
(
VLMTestType
.
IMAGE
),
prompt_formatter
=
lambda
img_prompt
:
"User: "
+
img_prompt
+
" Assistant:"
,
# noqa: E501
max_model_len
=
4096
,
max_num_seqs
=
2
,
image_size_factors
=
[(),(
1.0
,
1.0
,
1.0
)],
patch_hf_runner
=
model_utils
.
mlomo_patch_hf_runner
,
postprocess_inputs
=
model_utils
.
molmo_post_processor
,
),
# Tests for phi3v currently live in another file because of a bug in
# Tests for phi3v currently live in another file because of a bug in
# transformers. Once this issue is fixed, we can enable them here instead.
# transformers. Once this issue is fixed, we can enable them here instead.
# https://github.com/huggingface/transformers/issues/34307
# https://github.com/huggingface/transformers/issues/34307
...
@@ -434,7 +474,7 @@ VLM_TEST_SETTINGS = {
...
@@ -434,7 +474,7 @@ VLM_TEST_SETTINGS = {
)
for
inp
in
custom_inputs
.
different_patch_input_cases_internvl
()
)
for
inp
in
custom_inputs
.
different_patch_input_cases_internvl
()
],
],
),
),
"llava_one
_
vision-multiple-images"
:
VLMTestInfo
(
"llava_onevision-multiple-images"
:
VLMTestInfo
(
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
)],
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
)],
test_type
=
VLMTestType
.
CUSTOM_INPUTS
,
test_type
=
VLMTestType
.
CUSTOM_INPUTS
,
max_model_len
=
16384
,
max_model_len
=
16384
,
...
@@ -497,12 +537,13 @@ VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2)
...
@@ -497,12 +537,13 @@ VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2)
# - image embeddings
# - image embeddings
# - video
# - video
# - custom inputs
# - custom inputs
@
pytest
.
mark
.
parametrize
(
"model_type,test_case"
,
@
pytest
.
mark
.
parametrize
(
get_parametrized_options
(
"model_type,test_case"
,
VLM_TEST_SETTINGS
,
get_parametrized_options
(
test_type
=
VLMTestType
.
IMAGE
,
VLM_TEST_SETTINGS
,
fork_new_process_for_each_test
=
False
,
test_type
=
VLMTestType
.
IMAGE
,
))
fork_new_process_for_each_test
=
False
,
))
def
test_single_image_models
(
tmp_path
:
PosixPath
,
model_type
:
str
,
def
test_single_image_models
(
tmp_path
:
PosixPath
,
model_type
:
str
,
test_case
:
ExpandableVLMTestArgs
,
test_case
:
ExpandableVLMTestArgs
,
hf_runner
:
Type
[
HfRunner
],
hf_runner
:
Type
[
HfRunner
],
...
@@ -519,12 +560,13 @@ def test_single_image_models(tmp_path: PosixPath, model_type: str,
...
@@ -519,12 +560,13 @@ def test_single_image_models(tmp_path: PosixPath, model_type: str,
)
)
@
pytest
.
mark
.
parametrize
(
"model_type,test_case"
,
@
pytest
.
mark
.
parametrize
(
get_parametrized_options
(
"model_type,test_case"
,
VLM_TEST_SETTINGS
,
get_parametrized_options
(
test_type
=
VLMTestType
.
MULTI_IMAGE
,
VLM_TEST_SETTINGS
,
fork_new_process_for_each_test
=
False
,
test_type
=
VLMTestType
.
MULTI_IMAGE
,
))
fork_new_process_for_each_test
=
False
,
))
def
test_multi_image_models
(
tmp_path
:
PosixPath
,
model_type
:
str
,
def
test_multi_image_models
(
tmp_path
:
PosixPath
,
model_type
:
str
,
test_case
:
ExpandableVLMTestArgs
,
test_case
:
ExpandableVLMTestArgs
,
hf_runner
:
Type
[
HfRunner
],
hf_runner
:
Type
[
HfRunner
],
...
@@ -541,12 +583,13 @@ def test_multi_image_models(tmp_path: PosixPath, model_type: str,
...
@@ -541,12 +583,13 @@ def test_multi_image_models(tmp_path: PosixPath, model_type: str,
)
)
@
pytest
.
mark
.
parametrize
(
"model_type,test_case"
,
@
pytest
.
mark
.
parametrize
(
get_parametrized_options
(
"model_type,test_case"
,
VLM_TEST_SETTINGS
,
get_parametrized_options
(
test_type
=
VLMTestType
.
EMBEDDING
,
VLM_TEST_SETTINGS
,
fork_new_process_for_each_test
=
False
,
test_type
=
VLMTestType
.
EMBEDDING
,
))
fork_new_process_for_each_test
=
False
,
))
def
test_image_embedding_models
(
model_type
:
str
,
def
test_image_embedding_models
(
model_type
:
str
,
test_case
:
ExpandableVLMTestArgs
,
test_case
:
ExpandableVLMTestArgs
,
hf_runner
:
Type
[
HfRunner
],
hf_runner
:
Type
[
HfRunner
],
...
@@ -562,12 +605,13 @@ def test_image_embedding_models(model_type: str,
...
@@ -562,12 +605,13 @@ def test_image_embedding_models(model_type: str,
)
)
@
pytest
.
mark
.
parametrize
(
"model_type,test_case"
,
@
pytest
.
mark
.
parametrize
(
get_parametrized_options
(
"model_type,test_case"
,
VLM_TEST_SETTINGS
,
get_parametrized_options
(
test_type
=
VLMTestType
.
VIDEO
,
VLM_TEST_SETTINGS
,
fork_new_process_for_each_test
=
False
,
test_type
=
VLMTestType
.
VIDEO
,
))
fork_new_process_for_each_test
=
False
,
))
def
test_video_models
(
model_type
:
str
,
test_case
:
ExpandableVLMTestArgs
,
def
test_video_models
(
model_type
:
str
,
test_case
:
ExpandableVLMTestArgs
,
hf_runner
:
Type
[
HfRunner
],
vllm_runner
:
Type
[
VllmRunner
],
hf_runner
:
Type
[
HfRunner
],
vllm_runner
:
Type
[
VllmRunner
],
video_assets
:
_VideoAssets
):
video_assets
:
_VideoAssets
):
...
@@ -581,12 +625,13 @@ def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
...
@@ -581,12 +625,13 @@ def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
)
)
@
pytest
.
mark
.
parametrize
(
"model_type,test_case"
,
@
pytest
.
mark
.
parametrize
(
get_parametrized_options
(
"model_type,test_case"
,
VLM_TEST_SETTINGS
,
get_parametrized_options
(
test_type
=
VLMTestType
.
CUSTOM_INPUTS
,
VLM_TEST_SETTINGS
,
fork_new_process_for_each_test
=
False
,
test_type
=
VLMTestType
.
CUSTOM_INPUTS
,
))
fork_new_process_for_each_test
=
False
,
))
def
test_custom_inputs_models
(
def
test_custom_inputs_models
(
model_type
:
str
,
model_type
:
str
,
test_case
:
ExpandableVLMTestArgs
,
test_case
:
ExpandableVLMTestArgs
,
...
@@ -603,12 +648,13 @@ def test_custom_inputs_models(
...
@@ -603,12 +648,13 @@ def test_custom_inputs_models(
#### Tests filtering for things running each test as a new process
#### Tests filtering for things running each test as a new process
@
pytest
.
mark
.
parametrize
(
"model_type,test_case"
,
@
pytest
.
mark
.
parametrize
(
get_parametrized_options
(
"model_type,test_case"
,
VLM_TEST_SETTINGS
,
get_parametrized_options
(
test_type
=
VLMTestType
.
IMAGE
,
VLM_TEST_SETTINGS
,
fork_new_process_for_each_test
=
True
,
test_type
=
VLMTestType
.
IMAGE
,
))
fork_new_process_for_each_test
=
True
,
))
@
fork_new_process_for_each_test
@
fork_new_process_for_each_test
def
test_single_image_models_heavy
(
tmp_path
:
PosixPath
,
model_type
:
str
,
def
test_single_image_models_heavy
(
tmp_path
:
PosixPath
,
model_type
:
str
,
test_case
:
ExpandableVLMTestArgs
,
test_case
:
ExpandableVLMTestArgs
,
...
@@ -626,12 +672,13 @@ def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
...
@@ -626,12 +672,13 @@ def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
)
)
@
pytest
.
mark
.
parametrize
(
"model_type,test_case"
,
@
pytest
.
mark
.
parametrize
(
get_parametrized_options
(
"model_type,test_case"
,
VLM_TEST_SETTINGS
,
get_parametrized_options
(
test_type
=
VLMTestType
.
MULTI_IMAGE
,
VLM_TEST_SETTINGS
,
fork_new_process_for_each_test
=
True
,
test_type
=
VLMTestType
.
MULTI_IMAGE
,
))
fork_new_process_for_each_test
=
True
,
))
@
fork_new_process_for_each_test
@
fork_new_process_for_each_test
def
test_multi_image_models_heavy
(
tmp_path
:
PosixPath
,
model_type
:
str
,
def
test_multi_image_models_heavy
(
tmp_path
:
PosixPath
,
model_type
:
str
,
test_case
:
ExpandableVLMTestArgs
,
test_case
:
ExpandableVLMTestArgs
,
...
@@ -649,12 +696,13 @@ def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
...
@@ -649,12 +696,13 @@ def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
)
)
@
pytest
.
mark
.
parametrize
(
"model_type,test_case"
,
@
pytest
.
mark
.
parametrize
(
get_parametrized_options
(
"model_type,test_case"
,
VLM_TEST_SETTINGS
,
get_parametrized_options
(
test_type
=
VLMTestType
.
EMBEDDING
,
VLM_TEST_SETTINGS
,
fork_new_process_for_each_test
=
True
,
test_type
=
VLMTestType
.
EMBEDDING
,
))
fork_new_process_for_each_test
=
True
,
))
@
fork_new_process_for_each_test
@
fork_new_process_for_each_test
def
test_image_embedding_models_heavy
(
model_type
:
str
,
def
test_image_embedding_models_heavy
(
model_type
:
str
,
test_case
:
ExpandableVLMTestArgs
,
test_case
:
ExpandableVLMTestArgs
,
...
@@ -671,12 +719,13 @@ def test_image_embedding_models_heavy(model_type: str,
...
@@ -671,12 +719,13 @@ def test_image_embedding_models_heavy(model_type: str,
)
)
@
pytest
.
mark
.
parametrize
(
"model_type,test_case"
,
@
pytest
.
mark
.
parametrize
(
get_parametrized_options
(
"model_type,test_case"
,
VLM_TEST_SETTINGS
,
get_parametrized_options
(
test_type
=
VLMTestType
.
VIDEO
,
VLM_TEST_SETTINGS
,
fork_new_process_for_each_test
=
True
,
test_type
=
VLMTestType
.
VIDEO
,
))
fork_new_process_for_each_test
=
True
,
))
def
test_video_models_heavy
(
model_type
:
str
,
test_case
:
ExpandableVLMTestArgs
,
def
test_video_models_heavy
(
model_type
:
str
,
test_case
:
ExpandableVLMTestArgs
,
hf_runner
:
Type
[
HfRunner
],
hf_runner
:
Type
[
HfRunner
],
vllm_runner
:
Type
[
VllmRunner
],
vllm_runner
:
Type
[
VllmRunner
],
...
@@ -691,12 +740,13 @@ def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
...
@@ -691,12 +740,13 @@ def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
)
)
@
pytest
.
mark
.
parametrize
(
"model_type,test_case"
,
@
pytest
.
mark
.
parametrize
(
get_parametrized_options
(
"model_type,test_case"
,
VLM_TEST_SETTINGS
,
get_parametrized_options
(
test_type
=
VLMTestType
.
CUSTOM_INPUTS
,
VLM_TEST_SETTINGS
,
fork_new_process_for_each_test
=
True
,
test_type
=
VLMTestType
.
CUSTOM_INPUTS
,
))
fork_new_process_for_each_test
=
True
,
))
@
fork_new_process_for_each_test
@
fork_new_process_for_each_test
def
test_custom_inputs_models_heavy
(
def
test_custom_inputs_models_heavy
(
model_type
:
str
,
model_type
:
str
,
...
...
tests/models/decoder_only/vision_language/test_pixtral.py
View file @
afd0da21
...
@@ -138,10 +138,10 @@ def _dump_outputs_w_logprobs(
...
@@ -138,10 +138,10 @@ def _dump_outputs_w_logprobs(
outputs
:
OutputsLogprobs
,
outputs
:
OutputsLogprobs
,
filename
:
"StrPath"
,
filename
:
"StrPath"
,
)
->
None
:
)
->
None
:
json_data
=
[(
tokens
,
text
,
json_data
=
[(
tokens
,
text
,
[{
[{
k
:
asdict
(
v
)
k
:
asdict
(
v
)
for
k
,
v
in
token_logprobs
.
items
()
}
for
k
,
v
in
token_logprobs
.
items
()
for
token_logprobs
in
(
logprobs
or
[])])
}
for
token_logprobs
in
(
logprobs
or
[])])
for
tokens
,
text
,
logprobs
in
outputs
]
for
tokens
,
text
,
logprobs
in
outputs
]
with
open
(
filename
,
"w"
)
as
f
:
with
open
(
filename
,
"w"
)
as
f
:
...
@@ -152,11 +152,10 @@ def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
...
@@ -152,11 +152,10 @@ def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
with
open
(
filename
,
"rb"
)
as
f
:
with
open
(
filename
,
"rb"
)
as
f
:
json_data
=
json
.
load
(
f
)
json_data
=
json
.
load
(
f
)
return
[(
tokens
,
text
,
return
[(
tokens
,
text
,
[{
[{
int
(
k
):
Logprob
(
**
v
)
int
(
k
):
Logprob
(
**
v
)
for
k
,
v
in
token_logprobs
.
items
()}
for
k
,
v
in
token_logprobs
.
items
()
for
token_logprobs
in
logprobs
])
}
for
token_logprobs
in
logprobs
])
for
tokens
,
text
,
logprobs
in
json_data
]
for
tokens
,
text
,
logprobs
in
json_data
]
@
large_gpu_test
(
min_gb
=
80
)
@
large_gpu_test
(
min_gb
=
80
)
...
...
tests/models/decoder_only/vision_language/test_qwen2_vl.py
View file @
afd0da21
...
@@ -6,7 +6,6 @@ import pytest
...
@@ -6,7 +6,6 @@ import pytest
import
torch
import
torch
from
PIL
import
Image
from
PIL
import
Image
from
vllm.entrypoints.llm
import
LLM
from
vllm.multimodal.image
import
rescale_image_size
from
vllm.multimodal.image
import
rescale_image_size
from
vllm.multimodal.video
import
rescale_video_size
,
sample_frames_from_video
from
vllm.multimodal.video
import
rescale_video_size
,
sample_frames_from_video
...
@@ -71,7 +70,7 @@ class Qwen2VLPromptVideoEmbeddingInput(TypedDict):
...
@@ -71,7 +70,7 @@ class Qwen2VLPromptVideoEmbeddingInput(TypedDict):
def
batch_make_image_embeddings
(
def
batch_make_image_embeddings
(
image_batches
:
List
[
Union
[
Image
.
Image
,
List
[
Image
.
Image
]]],
processor
,
image_batches
:
List
[
Union
[
Image
.
Image
,
List
[
Image
.
Image
]]],
processor
,
llm
:
LLM
)
->
List
[
Qwen2VLPromptImageEmbeddingInput
]:
llm
:
VllmRunner
)
->
List
[
Qwen2VLPromptImageEmbeddingInput
]:
"""batched image embeddings for Qwen2-VL
"""batched image embeddings for Qwen2-VL
This will infer all images' embeddings in a single batch,
This will infer all images' embeddings in a single batch,
...
@@ -107,17 +106,19 @@ def batch_make_image_embeddings(
...
@@ -107,17 +106,19 @@ def batch_make_image_embeddings(
pixel_values
=
preprocess_result
[
"pixel_values"
]
pixel_values
=
preprocess_result
[
"pixel_values"
]
image_grid_thw
=
preprocess_result
[
"image_grid_thw"
]
image_grid_thw
=
preprocess_result
[
"image_grid_thw"
]
# pixel values to embeddin
d
s & grid_thws
# pixel values to embeddin
g
s & grid_thws
with
torch
.
no_grad
(
):
def
get_image_embeds
(
model
):
visual
=
llm
.
llm_engine
.
model_executor
.
driver_worker
.
\
with
torch
.
no_grad
():
model_runner
.
model
.
visual
visual
=
model
.
visual
pixel_values_on_device
=
pixel_values
.
to
(
visual
.
device
,
pixel_values_on_device
=
pixel_values
.
to
(
visual
.
device
,
dtype
=
visual
.
dtype
)
dtype
=
visual
.
dtype
)
image_grid_thw_on_device
=
image_grid_thw
.
to
(
visual
.
device
,
image_grid_thw_on_device
=
image_grid_thw
.
to
(
visual
.
device
,
dtype
=
torch
.
int64
)
dtype
=
torch
.
int64
)
image_embeds
=
visual
(
pixel_values_on_device
,
return
visual
(
pixel_values_on_device
,
grid_thw
=
image_grid_thw_on_device
)
grid_thw
=
image_grid_thw_on_device
)
image_embeds
=
torch
.
concat
(
llm
.
apply_model
(
get_image_embeds
))
# split into original batches
# split into original batches
result
:
List
[
Qwen2VLPromptImageEmbeddingInput
]
=
[]
result
:
List
[
Qwen2VLPromptImageEmbeddingInput
]
=
[]
...
@@ -126,11 +127,10 @@ def batch_make_image_embeddings(
...
@@ -126,11 +127,10 @@ def batch_make_image_embeddings(
for
image_batch
in
image_batches_
:
for
image_batch
in
image_batches_
:
cur_batch_image_count
=
len
(
image_batch
)
cur_batch_image_count
=
len
(
image_batch
)
merge_size
=
image_processor
.
merge_size
merge_size
=
image_processor
.
merge_size
cur_batch_embed_len
=
sum
(
[
cur_batch_embed_len
=
sum
(
grid_thw
.
prod
()
//
merge_size
//
merge_size
grid_thw
.
prod
(
-
1
)
//
merge_size
//
merge_size
for
grid_thw
in
image_grid_thw
[
image_counter
:
image_counter
+
for
grid_thw
in
image_grid_thw
[
image_counter
:
image_counter
+
cur_batch_image_count
]
cur_batch_image_count
])
])
result
.
append
({
result
.
append
({
"image_embeds"
:
"image_embeds"
:
...
@@ -153,7 +153,7 @@ def batch_make_image_embeddings(
...
@@ -153,7 +153,7 @@ def batch_make_image_embeddings(
def
batch_make_video_embeddings
(
def
batch_make_video_embeddings
(
video_batches
:
PromptVideoInput
,
processor
,
video_batches
:
PromptVideoInput
,
processor
,
llm
:
LLM
)
->
List
[
Qwen2VLPromptVideoEmbeddingInput
]:
llm
:
VllmRunner
)
->
List
[
Qwen2VLPromptVideoEmbeddingInput
]:
"""batched video embeddings for Qwen2-VL
"""batched video embeddings for Qwen2-VL
A NDArray represents a single video's all frames.
A NDArray represents a single video's all frames.
...
@@ -189,17 +189,19 @@ def batch_make_video_embeddings(
...
@@ -189,17 +189,19 @@ def batch_make_video_embeddings(
pixel_values
=
preprocess_result
[
"pixel_values_videos"
]
pixel_values
=
preprocess_result
[
"pixel_values_videos"
]
video_grid_thw
=
preprocess_result
[
"video_grid_thw"
]
video_grid_thw
=
preprocess_result
[
"video_grid_thw"
]
# pixel values to embeddinds & grid_thws
# pixel values to embeddings & grid_thws
with
torch
.
no_grad
():
def
get_image_embeds
(
model
):
visual
=
llm
.
llm_engine
.
model_executor
.
driver_worker
.
\
with
torch
.
no_grad
():
model_runner
.
model
.
visual
visual
=
model
.
visual
pixel_values_on_device
=
pixel_values
.
to
(
visual
.
device
,
dtype
=
visual
.
dtype
)
video_grid_thw_on_device
=
video_grid_thw
.
to
(
visual
.
device
,
dtype
=
torch
.
int64
)
return
visual
(
pixel_values_on_device
,
grid_thw
=
video_grid_thw_on_device
)
pixel_values_on_device
=
pixel_values
.
to
(
visual
.
device
,
video_embeds
=
torch
.
concat
(
llm
.
apply_model
(
get_image_embeds
))
dtype
=
visual
.
dtype
)
video_grid_thw_on_device
=
video_grid_thw
.
to
(
visual
.
device
,
dtype
=
torch
.
int64
)
video_embeds
=
visual
(
pixel_values_on_device
,
grid_thw
=
video_grid_thw_on_device
)
# split into original batches
# split into original batches
result
:
List
[
Qwen2VLPromptVideoEmbeddingInput
]
=
[]
result
:
List
[
Qwen2VLPromptVideoEmbeddingInput
]
=
[]
...
@@ -208,11 +210,10 @@ def batch_make_video_embeddings(
...
@@ -208,11 +210,10 @@ def batch_make_video_embeddings(
for
video_batch
in
video_batches_
:
for
video_batch
in
video_batches_
:
cur_batch_video_count
=
len
(
video_batch
)
cur_batch_video_count
=
len
(
video_batch
)
merge_size
=
image_processor
.
merge_size
merge_size
=
image_processor
.
merge_size
cur_batch_embed_len
=
sum
(
[
cur_batch_embed_len
=
sum
(
grid_thw
.
prod
()
//
merge_size
//
merge_size
grid_thw
.
prod
(
-
1
)
//
merge_size
//
merge_size
for
grid_thw
in
video_grid_thw
[
video_counter
:
video_counter
+
for
grid_thw
in
video_grid_thw
[
video_counter
:
video_counter
+
cur_batch_video_count
]
cur_batch_video_count
])
])
result
.
append
({
result
.
append
({
"video_embeds"
:
"video_embeds"
:
...
@@ -282,9 +283,9 @@ def run_embedding_input_test(
...
@@ -282,9 +283,9 @@ def run_embedding_input_test(
max_tokens
,
max_tokens
,
num_logprobs
=
num_logprobs
,
num_logprobs
=
num_logprobs
,
images
=
batch_make_image_embeddings
(
images
=
batch_make_image_embeddings
(
images
,
processor
,
vllm_model
.
model
)
if
images
else
None
,
images
,
processor
,
vllm_model
)
if
images
else
None
,
videos
=
batch_make_video_embeddings
(
videos
=
batch_make_video_embeddings
(
videos
,
processor
,
vllm_model
.
model
)
if
videos
else
None
)
videos
,
processor
,
vllm_model
)
if
videos
else
None
)
for
prompts
,
images
,
videos
in
inputs
for
prompts
,
images
,
videos
in
inputs
]
]
...
@@ -429,130 +430,3 @@ def test_qwen2_vl_video_embeddings_input(vllm_runner, video_assets, model,
...
@@ -429,130 +430,3 @@ def test_qwen2_vl_video_embeddings_input(vllm_runner, video_assets, model,
mm_limit
=
1
,
mm_limit
=
1
,
tensor_parallel_size
=
1
,
tensor_parallel_size
=
1
,
)
)
def run_chunked_prefill_test(
    vllm_runner: Type[VllmRunner],
    inputs: List[Tuple[List[str], PromptImageInput, PromptVideoInput]],
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    mm_limit: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    """Check that greedy logprobs match with and without chunked prefill.

    The model is run twice over the same ``inputs``: first with the default
    scheduler settings, then with ``enable_chunked_prefill=True`` and a small
    ``max_num_batched_tokens``. The per-case outputs of both runs are then
    compared with ``check_logprobs_close``.

    Args:
        vllm_runner: Runner factory used as a context manager.
        inputs: Test cases as ``(prompts, images, videos)`` tuples; empty
            image/video entries are passed to the runner as ``None``.
        model: Model name or path handed to the runner.
        dtype: Dtype string handed to the runner.
        max_tokens: Number of tokens to generate per prompt.
        num_logprobs: Number of logprobs requested per generated token.
        mm_limit: Per-prompt limit applied to both images and videos.
        tensor_parallel_size: Tensor parallel degree handed to the runner.
        distributed_executor_backend: Optional executor backend name.
    """

    def _base_runner_kwargs():
        # A fresh dict (and fresh nested dict) per runner, so the two runs
        # never share mutable configuration objects.
        # NOTE:
        # max_model_len should be greater than image_feature_size
        return {
            "task": "generate",
            "max_model_len": 4000,
            "max_num_seqs": 4,
            "dtype": dtype,
            "limit_mm_per_prompt": {
                "image": mm_limit,
                "video": mm_limit
            },
            "tensor_parallel_size": tensor_parallel_size,
            "distributed_executor_backend": distributed_executor_backend,
        }

    def _generate_all_cases(runner):
        # One generate call per test case, in the order given by `inputs`.
        case_outputs = []
        for prompts, images, videos in inputs:
            case_outputs.append(
                runner.generate_greedy_logprobs(prompts,
                                                max_tokens,
                                                num_logprobs=num_logprobs,
                                                images=images or None,
                                                videos=videos or None))
        return case_outputs

    # Reference run: chunked prefill left at its default.
    with vllm_runner(model, **_base_runner_kwargs()) as reference_model:
        reference_outputs = _generate_all_cases(reference_model)

    # Second run: the reference runner has been closed before this one starts.
    with vllm_runner(
            model,
            **_base_runner_kwargs(),
            enable_chunked_prefill=True,
            # should be small enough to ensure prefilling is chunked
            max_num_batched_tokens=32,
            mm_processor_kwargs={
                "max_pixels": 16 * 28 * 28,
            }) as chunked_model:
        chunked_outputs = _generate_all_cases(chunked_model)

    for plain_case, chunked_case in zip(reference_outputs, chunked_outputs):
        check_logprobs_close(
            outputs_0_lst=plain_case,
            outputs_1_lst=chunked_case,
            name_0="non_chunked",
            name_1="chunked",
        )
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [1])
@pytest.mark.parametrize("num_logprobs", [10])
def test_qwen2_vl_mrope_chunked_prefill(vllm_runner, example_prompts,
                                        model: str, dtype: str,
                                        max_tokens: int,
                                        num_logprobs: int) -> None:
    """
    Exercise Qwen2-VL with M-RoPE while chunked prefill is enabled.

    Only the first example prompt is used; it is paired with a zero-length
    image embedding and handed to ``run_chunked_prefill_test``.
    """
    first_prompt_only = example_prompts[:1]
    chat_prompts = []
    for raw_prompt in first_prompt_only:
        chat_prompts.append(
            qwen2_vl_chat_template(IMAGE_PLACEHOLDER, raw_prompt))

    # 1. Qwen2-VL's M-RoPE works only when there are some multi-modal inputs,
    #    so an image is included in the inputs
    # 2. however, Qwen2-VL currently won't work properly
    #    when chunked prefill is enabled and there are some multi-modal inputs,
    #    here use a hacky way: provide a **zero-length** image to make it happy
    #
    # and finally we achieved:
    # (1) chunked_prefill enabled; (2) M-RoPE works; to continue our tests
    empty_image_embedding = {
        "image_embeds": torch.empty((0, MODEL_HIDDEN_SIZE)),
        "image_grid_thw": torch.tensor([[0, 0, 0]])
    }
    # Every prompt reuses the same placeholder embedding object.
    placeholder_images = [empty_image_embedding] * len(chat_prompts)

    # A single test case with images and no videos.
    cases: List[Tuple[List[str], PromptImageInput, PromptVideoInput]] = [
        (chat_prompts, placeholder_images, []),
    ]

    run_chunked_prefill_test(
        vllm_runner,
        cases,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=1,
        tensor_parallel_size=1,
    )
tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
View file @
afd0da21
...
@@ -5,17 +5,20 @@ typically specific to a small subset of models.
...
@@ -5,17 +5,20 @@ typically specific to a small subset of models.
import
re
import
re
import
types
import
types
from
pathlib
import
PosixPath
from
pathlib
import
PosixPath
from
typing
import
Callable
,
List
,
Optional
,
Tuple
,
Union
from
typing
import
Any
,
Callable
,
Dict
,
List
,
Optional
,
Tuple
,
Union
import
torch
import
torch
from
PIL.Image
import
Image
from
PIL.Image
import
Image
from
transformers
import
AutoConfig
,
AutoTokenizer
,
BatchEncoding
from
transformers
import
(
AutoConfig
,
AutoTokenizer
,
BatchEncoding
,
GenerationConfig
)
from
vllm.sequence
import
SampleLogprobs
from
vllm.sequence
import
SampleLogprobs
from
vllm.transformers_utils.tokenizer
import
patch_padding_side
from
vllm.transformers_utils.tokenizer
import
patch_padding_side
from
vllm.utils
import
STR_DTYPE_TO_TORCH_DTYPE
from
vllm.utils
import
STR_DTYPE_TO_TORCH_DTYPE
from
.....conftest
import
HfRunner
,
ImageAsset
,
_ImageAssets
from
.....conftest
import
(
HfRunner
,
ImageAsset
,
PromptAudioInput
,
PromptImageInput
,
PromptVideoInput
,
_ImageAssets
)
from
....utils
import
TokensTextLogprobs
from
.types
import
RunnerOutput
from
.types
import
RunnerOutput
...
@@ -180,6 +183,14 @@ def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput,
...
@@ -180,6 +183,14 @@ def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput,
####### Post-processors for HF outputs
####### Post-processors for HF outputs
def deepseekvl2_trunc_hf_output(hf_output: RunnerOutput,
                                model: str) -> RunnerOutput:
    """Strip the end-of-sentence marker from a DeepSeek-VL2 HF output.

    When the decoded text ends with ``<|end▁of▁sentence|>``, everything from
    the first occurrence of that marker onwards is dropped. Token ids and
    logprobs are returned untouched; ``model`` is not used.
    """
    eos_marker = "<|end▁of▁sentence|>"
    token_ids, text, logprobs = hf_output

    if text.endswith(eos_marker):
        # Keep only what precedes the first marker in the text.
        text, _, _ = text.partition(eos_marker)

    return token_ids, text, logprobs
def
minicpmv_trunc_hf_output
(
hf_output
:
RunnerOutput
,
def
minicpmv_trunc_hf_output
(
hf_output
:
RunnerOutput
,
model
:
str
)
->
RunnerOutput
:
model
:
str
)
->
RunnerOutput
:
output_ids
,
output_str
,
out_logprobs
=
hf_output
output_ids
,
output_str
,
out_logprobs
=
hf_output
...
@@ -222,6 +233,11 @@ def wrap_inputs_post_processor(hf_inputs: BatchEncoding, dtype: str):
...
@@ -222,6 +233,11 @@ def wrap_inputs_post_processor(hf_inputs: BatchEncoding, dtype: str):
return
{
"model_inputs"
:
hf_inputs
}
return
{
"model_inputs"
:
hf_inputs
}
def molmo_post_processor(hf_inputs: BatchEncoding, dtype: str):
    """Cast the ``images`` entry to ``dtype`` and add a leading dimension.

    The inputs are first passed through ``cast_dtype_post_processor("images")``;
    every value of the result is then ``unsqueeze(0)``-ed and returned in a
    plain dict.
    """
    cast_images = cast_dtype_post_processor("images")
    casted_inputs = cast_images(hf_inputs, dtype)

    batched_inputs = {}
    for name, value in casted_inputs.items():
        batched_inputs[name] = value.unsqueeze(0)
    return batched_inputs
####### Prompt path encoders for models that need models on disk
####### Prompt path encoders for models that need models on disk
def
qwen_prompt_path_encoder
(
def
qwen_prompt_path_encoder
(
tmp_path
:
PosixPath
,
prompt
:
str
,
assets
:
Union
[
List
[
ImageAsset
],
tmp_path
:
PosixPath
,
prompt
:
str
,
assets
:
Union
[
List
[
ImageAsset
],
...
@@ -253,6 +269,34 @@ def qwen_prompt_path_encoder(
...
@@ -253,6 +269,34 @@ def qwen_prompt_path_encoder(
####### Model-specific HuggingFace runner patchers
####### Model-specific HuggingFace runner patchers
def deepseekvl2_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    """Patches and returns an instance of the HfRunner to use for
    DeepSeek-VL2.

    Two things are patched on ``hf_model``:

    * ``processor`` is replaced with a wrapper that accepts the prompt as
      ``text=`` (forwarded to the original processor as ``prompt=``), wraps a
      single PIL image into a list, drops the ``seq_lens`` and ``sft_format``
      entries from the result and returns the rest as a ``BatchEncoding``.
    * ``model.get_output_embeddings`` is replaced with a callable returning
      ``model.language.model.embed_tokens``.

    Args:
        hf_model: The runner to patch in place.

    Returns:
        The same ``hf_model`` instance, patched.
    """
    hf_processor = hf_model.processor
    # Entries of the processor output that are not forwarded to the model.
    dropped_keys = ("seq_lens", "sft_format")

    def processor(*args, text="", images=None, **kwargs):
        # The original processor is called with a list of images.
        if isinstance(images, Image):
            images = [images]
        # inputs is a custom class instead of dict or BatchFeature
        inputs = hf_processor(
            *args,
            prompt=text,
            images=images,
            **kwargs,
        )
        inputs = {
            k: inputs[k]
            for k in inputs.keys()  # noqa
            if k not in dropped_keys
        }
        inputs = BatchEncoding(data=inputs, tensor_type="pt")
        return inputs

    hf_model.processor = processor
    hf_model.model.get_output_embeddings = lambda: \
        hf_model.model.language.model.embed_tokens
    return hf_model
def
glm_patch_hf_runner
(
hf_model
:
HfRunner
)
->
HfRunner
:
def
glm_patch_hf_runner
(
hf_model
:
HfRunner
)
->
HfRunner
:
"""Patches and returns an instance of the HfRunner to use for GLM4."""
"""Patches and returns an instance of the HfRunner to use for GLM4."""
hf_processor
=
hf_model
.
processor
hf_processor
=
hf_model
.
processor
...
@@ -451,3 +495,99 @@ def mantis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
...
@@ -451,3 +495,99 @@ def mantis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
hf_model
.
model
.
generate
=
types
.
MethodType
(
_generate
,
hf_model
.
model
)
hf_model
.
model
.
generate
=
types
.
MethodType
(
_generate
,
hf_model
.
model
)
return
hf_model
return
hf_model
def minicpmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    """Patch ``hf_model.model.generate`` to always pass ``decode_text=False``.

    The original bound ``generate`` is captured first; the replacement is
    bound to the model with ``types.MethodType`` and forwards all positional
    and keyword arguments to the captured original.
    """
    wrapped_model = hf_model.model
    original_generate = wrapped_model.generate

    def _generate_without_decoding(self, *args, **kwargs):
        # `self` is supplied by the MethodType binding and is not forwarded:
        # `original_generate` is already a bound method.
        return original_generate(*args, decode_text=False, **kwargs)

    wrapped_model.generate = types.MethodType(_generate_without_decoding,
                                              wrapped_model)
    return hf_model
def _generate_greedy_logprobs_limit(
    self,
    prompts: List[str],
    max_tokens: int,
    num_logprobs: int,
    images: Optional[PromptImageInput] = None,
    audios: Optional[PromptAudioInput] = None,
    videos: Optional[PromptVideoInput] = None,
    **kwargs: Any,
) -> List[TokensTextLogprobs]:
    """Greedy generation with logprobs through ``generate_from_batch``.

    All prompts are turned into model inputs with ``self.get_inputs``, the
    ``input_ids``, ``images``, ``image_input_idx`` and ``image_masks`` entries
    are concatenated along dim 0 into one batch, and the batch is generated
    in a single ``self.model.generate_from_batch`` call with sampling
    disabled and ``"<|endoftext|>"`` as stop string.

    Args:
        prompts: Text prompts.
        max_tokens: Passed as ``max_new_tokens``.
        num_logprobs: Number of logprobs requested from
            ``self._hidden_states_to_logprobs``.
        images: Optional image inputs forwarded to ``get_inputs``.
        audios: Optional audio inputs forwarded to ``get_inputs``.
        videos: Optional video inputs forwarded to ``get_inputs``.
        **kwargs: Accepted for signature compatibility; not used.

    Returns:
        One ``(output_ids, output_str, logprobs)`` tuple per input; an empty
        list when there are no inputs.
    """
    all_inputs = self.get_inputs(prompts,
                                 images=images,
                                 videos=videos,
                                 audios=audios)

    num_inputs = len(all_inputs)
    if not num_inputs:
        return []

    # Process in batches for inference.
    batch_keys = ("input_ids", "images", "image_input_idx", "image_masks")
    batch_inputs = {
        key: torch.cat([single[key] for single in all_inputs], dim=0)
        for key in batch_keys
    }

    generation = self.model.generate_from_batch(
        batch=self.wrap_device(batch_inputs, device=self.model.device.type),
        generation_config=GenerationConfig(
            max_new_tokens=max_tokens,
            stop_strings="<|endoftext|>",
            do_sample=False,
        ),
        tokenizer=self.tokenizer,
        output_hidden_states=True,
        return_dict_in_generate=True,
    )

    results: List[TokensTextLogprobs] = []
    for index in range(num_inputs):
        # NOTE(review): the full `hidden_states` of the batch is passed for
        # every index, so the logprobs do not depend on `index` -- confirm
        # this is intended for batches with more than one input.
        seq_logprobs, output_len = self._hidden_states_to_logprobs(
            generation.hidden_states, num_logprobs)
        # The generated tokens are the last `output_len` ids of the sequence.
        generated_ids = generation.sequences[index][-output_len:]
        results.append((
            generated_ids.tolist(),
            self.tokenizer.decode(generated_ids),
            seq_logprobs,
        ))
    return results
####### Molmo-specific HuggingFace runner patchers
def mlomo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    """Patches and returns an instance of the HfRunner to use for Molmo.

    ``hf_model.processor`` is replaced by a callable that forwards to the
    original processor's ``process`` method, and
    ``generate_greedy_logprobs_limit`` is replaced by
    ``_generate_greedy_logprobs_limit`` bound to ``hf_model``.
    """
    original_processor = hf_model.processor

    def _call_process(*args, **kwargs):
        # Forward calls on the processor object to its `.process(...)`.
        return original_processor.process(*args, **kwargs)

    hf_model.processor = _call_process

    bound_generate = types.MethodType(_generate_greedy_logprobs_limit,
                                      hf_model)
    setattr(  # noqa: B010
        hf_model,
        "generate_greedy_logprobs_limit",
        bound_generate,
    )

    return hf_model
tests/models/embedding/language/test_cls_models.py
View file @
afd0da21
...
@@ -26,10 +26,13 @@ def test_classification_models(
...
@@ -26,10 +26,13 @@ def test_classification_models(
)
->
None
:
)
->
None
:
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
classify
(
example_prompts
)
vllm_outputs
=
vllm_model
.
classify
(
example_prompts
)
# This test is for verifying whether the model's extra_repr
# This test is for verifying whether the model's extra_repr
# can be printed correctly.
# can be printed correctly.
print
(
vllm_model
.
model
.
llm_engine
.
model_executor
.
driver_worker
.
def
print_model
(
model
):
model_runner
.
model
)
print
(
model
)
vllm_model
.
apply_model
(
print_model
)
with
hf_runner
(
model
,
with
hf_runner
(
model
,
dtype
=
dtype
,
dtype
=
dtype
,
...
...
tests/models/embedding/language/test_embedding.py
View file @
afd0da21
...
@@ -18,15 +18,18 @@ from vllm.platforms import current_platform
...
@@ -18,15 +18,18 @@ from vllm.platforms import current_platform
# [Encoder-only]
# [Encoder-only]
pytest
.
param
(
os
.
path
.
join
(
models_path_prefix
,
"BAAI/bge-base-en-v1.5"
),
pytest
.
param
(
os
.
path
.
join
(
models_path_prefix
,
"BAAI/bge-base-en-v1.5"
),
marks
=
[
pytest
.
mark
.
core_model
,
pytest
.
mark
.
cpu_model
]),
marks
=
[
pytest
.
mark
.
core_model
,
pytest
.
mark
.
cpu_model
]),
pytest
.
param
(
os
.
path
.
join
(
models_path_prefix
,
"sentence-transformers/all-MiniLM-L12-v2"
)),
pytest
.
param
(
os
.
path
.
join
(
models_path_prefix
,
"intfloat/multilingual-e5-large"
)),
pytest
.
param
(
os
.
path
.
join
(
models_path_prefix
,
"intfloat/multilingual-e5-large"
)),
# [Encoder-decoder]
# [Decoder-only]
pytest
.
param
(
os
.
path
.
join
(
models_path_prefix
,
"intfloat/e5-mistral-7b-instruct"
),
marks
=
[
pytest
.
mark
.
core_model
,
pytest
.
mark
.
cpu_model
]),
pytest
.
param
(
os
.
path
.
join
(
models_path_prefix
,
"BAAI/bge-multilingual-gemma2"
),
pytest
.
param
(
os
.
path
.
join
(
models_path_prefix
,
"BAAI/bge-multilingual-gemma2"
),
marks
=
[
pytest
.
mark
.
core_model
]),
marks
=
[
pytest
.
mark
.
core_model
]),
pytest
.
param
(
os
.
path
.
join
(
models_path_prefix
,
"ssmits/Qwen2-7B-Instruct-embed-base"
)),
pytest
.
param
(
os
.
path
.
join
(
models_path_prefix
,
"intfloat/e5-mistral-7b-instruct"
),
marks
=
[
pytest
.
mark
.
core_model
,
pytest
.
mark
.
cpu_model
]),
pytest
.
param
(
os
.
path
.
join
(
models_path_prefix
,
"Alibaba-NLP/gte-Qwen2-1.5B-instruct"
)),
pytest
.
param
(
os
.
path
.
join
(
models_path_prefix
,
"Alibaba-NLP/gte-Qwen2-1.5B-instruct"
)),
pytest
.
param
(
os
.
path
.
join
(
models_path_prefix
,
"Alibaba-NLP/gte-Qwen2-7B-instruct"
)),
pytest
.
param
(
os
.
path
.
join
(
models_path_prefix
,
"Alibaba-NLP/gte-Qwen2-7B-instruct"
)),
pytest
.
param
(
os
.
path
.
join
(
models_path_prefix
,
"ssmits/Qwen2-7B-Instruct-embed-base"
)),
# [Encoder-decoder]
pytest
.
param
(
os
.
path
.
join
(
models_path_prefix
,
"sentence-transformers/stsb-roberta-base-v2"
)),
],
],
)
)
...
@@ -66,10 +69,13 @@ def test_models(
...
@@ -66,10 +69,13 @@ def test_models(
max_model_len
=
None
,
max_model_len
=
None
,
**
vllm_extra_kwargs
)
as
vllm_model
:
**
vllm_extra_kwargs
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
encode
(
example_prompts
)
vllm_outputs
=
vllm_model
.
encode
(
example_prompts
)
# This test is for verifying whether the model's extra_repr
# This test is for verifying whether the model's extra_repr
# can be printed correctly.
# can be printed correctly.
print
(
vllm_model
.
model
.
llm_engine
.
model_executor
.
driver_worker
.
def
print_model
(
model
):
model_runner
.
model
)
print
(
model
)
vllm_model
.
apply_model
(
print_model
)
check_embeddings_close
(
check_embeddings_close
(
embeddings_0_lst
=
hf_outputs
,
embeddings_0_lst
=
hf_outputs
,
...
...
tests/models/embedding/language/test_scoring.py
View file @
afd0da21
...
@@ -6,6 +6,8 @@ import math
...
@@ -6,6 +6,8 @@ import math
import
os
import
os
import
pytest
import
pytest
import
torch
import
torch.nn.functional
as
F
from
....utils
import
models_path_prefix
from
....utils
import
models_path_prefix
MODELS
=
[
MODELS
=
[
...
@@ -13,6 +15,10 @@ MODELS = [
...
@@ -13,6 +15,10 @@ MODELS = [
os
.
path
.
join
(
models_path_prefix
,
"BAAI/bge-reranker-v2-m3"
),
# Roberta
os
.
path
.
join
(
models_path_prefix
,
"BAAI/bge-reranker-v2-m3"
),
# Roberta
]
]
# Embedding models used by the `*_embedding` scoring tests. Resolved through
# `models_path_prefix` like the entries of MODELS, so both lists load their
# checkpoints from the same location.
EMBEDDING_MODELS = [
    os.path.join(models_path_prefix,
                 "sentence-transformers/all-MiniLM-L12-v2"),
]
TEXTS_1
=
[
TEXTS_1
=
[
"What is the capital of France?"
,
"What is the capital of France?"
,
"What is the capital of Germany?"
,
"What is the capital of Germany?"
,
...
@@ -89,3 +95,97 @@ def test_llm_N_to_N(vllm_runner, hf_runner, model_name, dtype: str):
...
@@ -89,3 +95,97 @@ def test_llm_N_to_N(vllm_runner, hf_runner, model_name, dtype: str):
assert
math
.
isclose
(
hf_outputs
[
0
],
vllm_outputs
[
0
],
rel_tol
=
0.01
)
assert
math
.
isclose
(
hf_outputs
[
0
],
vllm_outputs
[
0
],
rel_tol
=
0.01
)
assert
math
.
isclose
(
hf_outputs
[
1
],
vllm_outputs
[
1
],
rel_tol
=
0.01
)
assert
math
.
isclose
(
hf_outputs
[
1
],
vllm_outputs
[
1
],
rel_tol
=
0.01
)
@pytest.fixture(scope="module", params=EMBEDDING_MODELS)
def emb_model_name(request):
    """Module-scoped fixture yielding each entry of EMBEDDING_MODELS."""
    model_name = request.param
    yield model_name
@pytest.mark.parametrize("dtype", ["half"])
def test_llm_1_to_1_embedding(vllm_runner, hf_runner, emb_model_name,
                              dtype: str):
    """Score one text pair with vLLM and compare to the HF cosine similarity.

    The reference score is the cosine similarity of the two sentence
    embeddings produced by the HF runner; the vLLM score comes from
    ``vllm_model.score`` and must match within a 1% relative tolerance.
    """
    query, document = TEXTS_1[0], TEXTS_2[0]
    text_pair = [query, document]

    with hf_runner(emb_model_name, dtype=dtype,
                   is_sentence_transformer=True) as hf_model:
        hf_embeddings = hf_model.encode(text_pair)
        embedding_tensors = [torch.tensor(emb) for emb in hf_embeddings]
        hf_outputs = [F.cosine_similarity(*embedding_tensors, dim=0)]

    with vllm_runner(emb_model_name,
                     task="embed",
                     dtype=dtype,
                     max_model_len=None) as vllm_model:
        vllm_outputs = vllm_model.score(query, document)

    assert len(vllm_outputs) == 1
    assert len(hf_outputs) == 1

    assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
@pytest.mark.parametrize("dtype", ["half"])
def test_llm_1_to_N_embedding(vllm_runner, hf_runner, emb_model_name,
                              dtype: str):
    """Score one query against all of TEXTS_2 and compare to HF.

    The reference scores are the cosine similarities of the HF sentence
    embeddings for ``(TEXTS_1[0], TEXTS_2[0])`` and ``(TEXTS_1[0],
    TEXTS_2[1])``; vLLM is asked to score ``TEXTS_1[0]`` against ``TEXTS_2``.
    Each pair of scores must match within a 1% relative tolerance.
    """
    query = TEXTS_1[0]
    text_pairs = [
        [query, TEXTS_2[0]],
        [query, TEXTS_2[1]],
    ]

    with hf_runner(emb_model_name, dtype=dtype,
                   is_sentence_transformer=True) as hf_model:
        hf_embeddings = []
        for text_pair in text_pairs:
            hf_embeddings.append(hf_model.encode(text_pair))

        hf_outputs = []
        for embedding_pair in hf_embeddings:
            pair_tensors = [torch.tensor(emb) for emb in embedding_pair]
            hf_outputs.append(F.cosine_similarity(*pair_tensors, dim=0))

    with vllm_runner(emb_model_name,
                     task="embed",
                     dtype=dtype,
                     max_model_len=None) as vllm_model:
        vllm_outputs = vllm_model.score(query, TEXTS_2)

    assert len(vllm_outputs) == 2
    assert len(hf_outputs) == 2

    for idx in (0, 1):
        assert math.isclose(hf_outputs[idx], vllm_outputs[idx], rel_tol=0.01)
@pytest.mark.parametrize("dtype", ["half"])
def test_llm_N_to_N_embedding(vllm_runner, hf_runner, emb_model_name,
                              dtype: str):
    """Score TEXTS_1 against TEXTS_2 element-wise and compare to HF.

    The reference scores are the cosine similarities of the HF sentence
    embeddings for ``(TEXTS_1[0], TEXTS_2[0])`` and ``(TEXTS_1[1],
    TEXTS_2[1])``; vLLM is asked to score ``TEXTS_1`` against ``TEXTS_2``.
    Each pair of scores must match within a 1% relative tolerance.
    """
    text_pairs = [
        [TEXTS_1[0], TEXTS_2[0]],
        [TEXTS_1[1], TEXTS_2[1]],
    ]

    with hf_runner(emb_model_name, dtype=dtype,
                   is_sentence_transformer=True) as hf_model:
        hf_embeddings = []
        for text_pair in text_pairs:
            hf_embeddings.append(hf_model.encode(text_pair))

        hf_outputs = []
        for embedding_pair in hf_embeddings:
            pair_tensors = [torch.tensor(emb) for emb in embedding_pair]
            hf_outputs.append(F.cosine_similarity(*pair_tensors, dim=0))

    with vllm_runner(emb_model_name,
                     task="embed",
                     dtype=dtype,
                     max_model_len=None) as vllm_model:
        vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2)

    assert len(vllm_outputs) == 2
    assert len(hf_outputs) == 2

    for idx in (0, 1):
        assert math.isclose(hf_outputs[idx], vllm_outputs[idx], rel_tol=0.01)
tests/models/encoder_decoder/audio_language/__init__.py
0 → 100644
View file @
afd0da21
tests/models/encoder_decoder/audio_language/test_whisper.py
0 → 100644
View file @
afd0da21
"""Compare the outputs of HF and vLLM for Whisper models using greedy sampling.
Run `pytest tests/models/encoder_decoder/audio_language/test_whisper.py`.
"""
from
typing
import
Optional
import
pytest
from
vllm
import
LLM
,
SamplingParams
from
vllm.assets.audio
import
AudioAsset
from
....utils
import
fork_new_process_for_each_test
,
multi_gpu_test
# Prompts sent to the model by `run_test` (repeated 10 times there).
# The first entry uses the single-prompt form with the audio attached as
# multi-modal data; the second uses the explicit encoder/decoder form.
PROMPTS = [
    {
        "prompt":
        "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
        "multi_modal_data": {
            "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
        },
    },
    {  # Test explicit encoder/decoder prompt
        "encoder_prompt": {
            "prompt": "",
            "multi_modal_data": {
                "audio": AudioAsset("winning_call").audio_and_sample_rate,
            },
        },
        "decoder_prompt":
        "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
    }
]

# Expected generated text, keyed by model name. Each value holds one
# transcript per entry of PROMPTS, in the same order; `run_test` compares the
# generated text against these strings with `==`, so they must not be edited.
EXPECTED = {
    "openai/whisper-tiny": [
        " He has birth words I spoke in the original corner of that. And a"
        " little piece of black coat poetry. Mary had a little sandwich,"
        " sweet, with white and snow. And everyone had it very went the last"
        " would sure to go.",
        " >> And the old one, fit John the way to Edgar Martinez. >> One more"
        " to line down the field line for our base camp. Here comes joy. Here"
        " is June and the third base. They're going to wave him in. The throw"
        " to the plate will be late. The Mariners are going to play for the"
        " American League Championship. I don't believe it. It just continues"
        " by all five."
    ],
    "openai/whisper-small": [
        " The first words I spoke in the original pornograph. A little piece"
        " of practical poetry. Mary had a little lamb, its fleece was quite a"
        " slow, and everywhere that Mary went the lamb was sure to go.",
        " And the old one pitch on the way to Edgar Martinez one month. Here"
        " comes joy. Here is Junior to third base. They're gonna wave him"
        " in. The throw to the plate will be late. The Mariners are going to"
        " play for the American League Championship. I don't believe it. It"
        " just continues. My, oh my."
    ],
    "openai/whisper-medium": [
        " The first words I spoke in the original phonograph, a little piece"
        " of practical poetry. Mary had a little lamb, its fleece was quite as"
        " slow, and everywhere that Mary went the lamb was sure to go.",
        " And the 0-1 pitch on the way to Edgar Martinez swung on the line"
        " down the left field line for Obeyshev. Here comes Joy. Here is"
        " Jorgen at third base. They're going to wave him in. The throw to the"
        " plate will be late. The Mariners are going to play for the American"
        " League Championship. I don't believe it. It just continues. My, oh"
        " my."
    ],
    "openai/whisper-large-v3": [
        " The first words I spoke in the original phonograph, a little piece"
        " of practical poetry. Mary had a little lamb, its feet were quite as"
        " slow, and everywhere that Mary went, the lamb was sure to go.",
        " And the 0-1 pitch on the way to Edgar Martinez. Swung on the line."
        " Now the left field line for a base hit. Here comes Joy. Here is"
        " Junior to third base. They're going to wave him in. The throw to the"
        " plate will be late. The Mariners are going to play for the American"
        " League Championship. I don't believe it. It just continues. My, oh,"
        " my."
    ],
    "openai/whisper-large-v3-turbo": [
        " The first words I spoke in the original phonograph, a little piece"
        " of practical poetry. Mary had a little lamb, its streets were quite"
        " as slow, and everywhere that Mary went the lamb was sure to go.",
        " And the 0-1 pitch on the way to Edgar Martinez. Swung on the line"
        " down the left field line for a base hit. Here comes Joy. Here is"
        " Junior to third base. They're going to wave him in. The throw to the"
        " plate will be late. The Mariners are going to play for the American"
        " League Championship. I don't believe it. It just continues. My, oh,"
        " my."
    ]
}
def run_test(
    model: str,
    *,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
) -> None:
    """Generate transcripts for PROMPTS and compare them to EXPECTED.

    PROMPTS and ``EXPECTED[model]`` are both repeated 10 times, the prompts
    are generated with temperature 0 and at most 200 new tokens, and the text
    of the first completion of every output must equal the expected string.

    Args:
        model: Model name; must be a key of EXPECTED.
        tensor_parallel_size: Tensor parallel degree passed to ``LLM``.
        distributed_executor_backend: Optional executor backend name passed
            to ``LLM``.
    """
    repeats = 10
    prompt_list = PROMPTS * repeats
    expected_list = EXPECTED[model] * repeats

    engine = LLM(
        model=model,
        tensor_parallel_size=tensor_parallel_size,
        distributed_executor_backend=distributed_executor_backend,
    )

    greedy_params = SamplingParams(
        temperature=0,
        top_p=1.0,
        max_tokens=200,
    )

    request_outputs = engine.generate(prompt_list, greedy_params)

    for request_output, expected_text in zip(request_outputs, expected_list):
        generated_text = request_output.outputs[0].text
        print(generated_text)
        assert generated_text == expected_text
@fork_new_process_for_each_test
@pytest.mark.core_model
@pytest.mark.parametrize(
    "model", ["openai/whisper-small", "openai/whisper-large-v3-turbo"])
def test_models(model) -> None:
    """Run the transcript check with a tensor parallel size of 1."""
    single_device_tp = 1
    run_test(model, tensor_parallel_size=single_device_tp)
@multi_gpu_test(num_gpus=2)
@pytest.mark.core_model
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
def test_models_distributed(model, distributed_executor_backend) -> None:
    """Run the transcript check with tensor parallel size 2 per backend."""
    run_test(
        model,
        tensor_parallel_size=2,
        distributed_executor_backend=distributed_executor_backend,
    )
tests/models/encoder_decoder/vision_language/test_mllama.py
View file @
afd0da21
...
@@ -2,11 +2,15 @@ from typing import List, Optional, Tuple, Type, overload
...
@@ -2,11 +2,15 @@ from typing import List, Optional, Tuple, Type, overload
import
os
import
os
import
pytest
import
pytest
import
torch
from
transformers
import
(
AutoConfig
,
AutoModelForVision2Seq
,
AutoTokenizer
,
from
transformers
import
(
AutoConfig
,
AutoModelForVision2Seq
,
AutoTokenizer
,
BatchEncoding
)
BatchEncoding
)
from
vllm.attention.backends.flash_attn
import
FlashAttentionMetadata
from
vllm.attention.selector
import
(
_Backend
,
_cached_get_attn_backend
,
from
vllm.attention.selector
import
(
_Backend
,
_cached_get_attn_backend
,
global_force_attn_backend_context_manager
)
global_force_attn_backend_context_manager
)
from
vllm.model_executor.models.mllama
import
(
MLLAMA_IMAGE_TOKEN_ID
,
MllamaForConditionalGeneration
)
from
vllm.multimodal.image
import
rescale_image_size
from
vllm.multimodal.image
import
rescale_image_size
from
vllm.sequence
import
SampleLogprobs
from
vllm.sequence
import
SampleLogprobs
...
@@ -35,6 +39,29 @@ models = [
...
@@ -35,6 +39,29 @@ models = [
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-11B-Vision-Instruct"
),
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-11B-Vision-Instruct"
),
]
]
# Indices for inputs
# (string keys into `prompt_data`, also used as the parametrize ids of
# `test_get_cross_attention_mask`)
TEXT_ONLY = '0'
IMAGE_AT_BEG = '1'
IMAGE_AT_MIDDLE = '2'
TWO_IMAGES = '3'

# Input tokenized
# Each value is the token-id sequence for the prompt quoted in the comment
# above it; image positions are marked with MLLAMA_IMAGE_TOKEN_ID.
prompt_data = {
    # Tell me a story
    TEXT_ONLY: [41551, 757, 264, 3446],
    # <|image|> What's the content of this image
    IMAGE_AT_BEG:
    [MLLAMA_IMAGE_TOKEN_ID, 3639, 596, 279, 2262, 315, 420, 2217, 220],
    # Hello <|image|>What' the content of this image
    IMAGE_AT_MIDDLE:
    [9906, 220, MLLAMA_IMAGE_TOKEN_ID, 3923, 6, 279, 2262, 315, 420, 2217],
    #<|image|>Is there a duck in this image?<|image|>What's the animal in this image? # noqa: E501
    TWO_IMAGES: [
        MLLAMA_IMAGE_TOKEN_ID, 3957, 1070, 264, 37085, 304, 420, 2217, 30,
        MLLAMA_IMAGE_TOKEN_ID, 3923, 596, 279, 10065, 304, 420, 2217, 30
    ]
}
def
vllm_to_hf_output
(
vllm_output
:
Tuple
[
List
[
int
],
str
,
def
vllm_to_hf_output
(
vllm_output
:
Tuple
[
List
[
int
],
str
,
Optional
[
SampleLogprobs
]],
Optional
[
SampleLogprobs
]],
...
@@ -367,3 +394,184 @@ def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model,
...
@@ -367,3 +394,184 @@ def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model,
num_logprobs
=
num_logprobs
,
num_logprobs
=
num_logprobs
,
tensor_parallel_size
=
1
,
tensor_parallel_size
=
1
,
)
)
@large_gpu_test(min_gb=48)
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS)
def test_regression(vllm_runner, image_assets, model, dtype, max_tokens,
                    num_logprobs, attn_backend: _Backend) -> None:
    """Regression checks for mixed text/image batches on Mllama.

    With the requested attention backend forced, the test checks that a
    prompt with more image tags than images raises ``ValueError``, and that
    batches mixing a text-only request with an image request generate in
    both orders.
    """
    stop_sign = image_assets[0].pil_image

    with global_force_attn_backend_context_manager(attn_backend), vllm_runner(
            model,
            dtype=dtype,
            max_model_len=4096,
            max_num_seqs=2,
            tensor_parallel_size=1,
            enforce_eager=True,
            limit_mm_per_prompt={"image":
                                 _LIMIT_IMAGE_PER_PROMPT}) as vllm_model:

        # Regression tests for https://github.com/vllm-project/vllm/issues/10648

        # Number of image tags is greater than the number of images provided
        two_tag_prompt = "<|begin_of_text|><|image|><|image|> Compare the two images"  # noqa: E501
        with pytest.raises(ValueError):
            vllm_model.generate_greedy_logprobs([two_tag_prompt],
                                                max_tokens,
                                                num_logprobs,
                                                images=[stop_sign])

        mixed_batches = [
            # Batch of a text-only and image request that requires
            # cross-attention
            (
                [
                    "What is the capital of spain?",
                    "Text before the image...<|image|>What is in the image?",  # noqa: E501
                ],
                [
                    None,
                    [stop_sign],
                ],
            ),
            # Test the reverse order too for good measure
            (
                [
                    "<|begin_of_text|>Text before the image...<|image|>What is in the image?",  # noqa: E501
                    "<|begin_of_text|>Hello!",
                ],
                [
                    [stop_sign],
                    None,
                ],
            ),
        ]
        for batch_prompts, batch_images in mixed_batches:
            vllm_model.generate_greedy_logprobs(batch_prompts,
                                                max_tokens,
                                                num_logprobs,
                                                images=batch_images)
@pytest.mark.core_model
@pytest.mark.parametrize(
    "input_indices_and_output",
    # inputs, (cross_attention_mask, kv_range_for_decode)
    [([TEXT_ONLY], (None, None)), ([IMAGE_AT_BEG], (None, None)),
     ([TEXT_ONLY, IMAGE_AT_BEG], (None, None)),
     ([IMAGE_AT_MIDDLE], ((10, 12), [[0, 6]])),
     ([TEXT_ONLY, IMAGE_AT_MIDDLE], ((14, 12), [[0, 6]])),
     ([TEXT_ONLY, IMAGE_AT_BEG, IMAGE_AT_MIDDLE],
      ((23, 24), [[0, 6], [6, 12]])),
     ([IMAGE_AT_MIDDLE, TEXT_ONLY], ((14, 12), [[0, 6]])),
     ([TWO_IMAGES], ((18, 12), [[6, 12]])),
     ([TEXT_ONLY, TWO_IMAGES], ((22, 12), [[6, 12]]))])
def test_get_cross_attention_mask(input_indices_and_output) -> None:
    """Check the mask shape and decode KV ranges for batched prompts.

    Each case names the ``prompt_data`` sequences to concatenate and the
    expected ``(cross_attention_mask shape, kv_range_for_decode)`` pair, where
    an expected shape of ``None`` means no mask must be returned.
    """
    input_indices, expected_output = input_indices_and_output
    expected_mask_shape, expected_kv_range = expected_output

    sequences = [torch.tensor(prompt_data[key]) for key in input_indices]
    flat_token_ids = torch.cat(sequences)
    seq_lens = [len(seq) for seq in sequences]

    # One `[2, 2]` entry per prompt that is not text-only.
    num_tiles = [[2, 2] for key in input_indices if key != TEXT_ONLY]

    attn_data = FlashAttentionMetadata(
        seq_lens=seq_lens,
        # Dummy values
        enable_kv_scales_calculation=False,
        num_prefills=0,
        num_prefill_tokens=0,
        num_decode_tokens=0,
        slot_mapping=0,
        multi_modal_placeholder_index_maps=None,
        seq_lens_tensor=0,
        max_prefill_seq_len=0,
        max_decode_seq_len=0,
        context_lens_tensor=None,
        block_tables=None,
        use_cuda_graph=False,
    )

    # The method is called unbound, with this placeholder in place of `self`.
    dummy: dict[str, str] = {}

    mask_and_range = MllamaForConditionalGeneration.get_cross_attention_mask(
        dummy,
        flat_token_ids,
        attn_data,
        num_tiles=num_tiles,
        num_tokens_per_tile=3,
        dtype=torch.bfloat16)
    cross_attention_mask, kv_range_for_decode = mask_and_range

    assert kv_range_for_decode == expected_kv_range
    if expected_mask_shape is None:
        assert cross_attention_mask is None
    else:
        assert cross_attention_mask is not None
        assert cross_attention_mask.shape == expected_mask_shape
@pytest.mark.core_model
@pytest.mark.parametrize(
    "input_indices",
    [[TEXT_ONLY], [IMAGE_AT_BEG], [TEXT_ONLY, IMAGE_AT_BEG], [IMAGE_AT_MIDDLE],
     [TEXT_ONLY, IMAGE_AT_MIDDLE], [TEXT_ONLY, IMAGE_AT_BEG, IMAGE_AT_MIDDLE],
     [IMAGE_AT_MIDDLE, TEXT_ONLY], [TWO_IMAGES], [TEXT_ONLY, TWO_IMAGES]])
def test_get_full_text_row_masked_out_mask(input_indices) -> None:
    """Check the per-token mask from ``get_full_text_row_masked_out_mask``.

    Every token of a prompt must carry the same flag: the flag compares
    equal to ``True`` for prompts other than ``TEXT_ONLY`` and to ``False``
    for ``TEXT_ONLY`` prompts.
    """
    sequences = [torch.tensor(prompt_data[i]) for i in input_indices]
    seq_lens = [len(s) for s in sequences]
    num_prefill_tokens = sum(seq_lens)

    # TEXT_ONLY is zero, so it will be masked out,
    # other instances should not be.
    encoder_seq_lens = [int(i) for i in input_indices]

    attn_data = FlashAttentionMetadata(
        seq_lens=seq_lens,
        encoder_seq_lens=encoder_seq_lens,
        num_prefill_tokens=num_prefill_tokens,
        # Dummy values
        enable_kv_scales_calculation=False,
        num_prefills=0,
        num_decode_tokens=0,
        slot_mapping=0,
        multi_modal_placeholder_index_maps=None,
        seq_lens_tensor=0,
        max_prefill_seq_len=0,
        max_decode_seq_len=0,
        context_lens_tensor=None,
        block_tables=None,
        use_cuda_graph=False,
    )

    # The method is invoked unbound; an empty dict stands in for `self`.
    dummy: dict[str, str] = {}

    full_text_row_masked_out_mask = MllamaForConditionalGeneration\
        .get_full_text_row_masked_out_mask(dummy,
                                           attn_data,
                                           torch.get_default_device())
    full_text_row_masked_out_mask = \
        full_text_row_masked_out_mask.squeeze().tolist()

    assert len(full_text_row_masked_out_mask) == num_prefill_tokens

    # Expand the per-prompt expectation to one flag per token.
    expected_flags = [
        prompt_kind != TEXT_ONLY
        for prompt_kind, seq_len in zip(input_indices, seq_lens)
        for _ in range(seq_len)
    ]
    for idx, must_be_masked in enumerate(expected_flags):
        assert full_text_row_masked_out_mask[idx] == must_be_masked, \
            f"full_text_row_masked_out_mask[{idx}] must be " \
            f"'{must_be_masked}' "
tests/models/multimodal/__init__.py
0 → 100644
View file @
afd0da21
tests/models/multimodal/processing/__init__.py
0 → 100644
View file @
afd0da21
tests/models/multimodal/processing/test_common.py
0 → 100644
View file @
afd0da21
from
functools
import
partial
import
numpy
as
np
import
pytest
from
PIL
import
Image
from
vllm.config
import
ModelConfig
from
vllm.inputs
import
InputProcessingContext
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.processing
import
ProcessingCache
from
vllm.multimodal.utils
import
cached_get_tokenizer
from
....multimodal.utils
import
random_audio
,
random_image
,
random_video
from
...registry
import
HF_EXAMPLE_MODELS
def _test_processing_correctness(
    model_id: str,
    hit_rate: float,
    num_batches: int,
    simplify_rate: float,
):
    """Compare cached and uncached multimodal processing for one model.

    For each batch a random multimodal input is generated and processed
    four ways: uncached/cached processor, with text prompt/token-id prompt.
    The results must all be equal.

    Args:
        model_id: Model identifier looked up in ``HF_EXAMPLE_MODELS``.
        hit_rate: Probability that an item is the fixed input from
            ``input_to_hit`` instead of a freshly generated random one.
        num_batches: Number of random batches to process.
        simplify_rate: Probability that, for a batch, empty modalities are
            dropped and single-item lists are unwrapped to the bare item.
    """
    model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
    model_info.check_available_online(on_fail="skip")
    model_info.check_transformers_version(on_fail="skip")

    model_config = ModelConfig(
        model_id,
        task="auto",
        tokenizer=model_id,
        tokenizer_mode="auto",
        trust_remote_code=model_info.trust_remote_code,
        seed=0,
        dtype="float16",
        revision=None,
        hf_overrides=model_info.hf_overrides,
    )

    model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
    factories = MULTIMODAL_REGISTRY._processor_factories[model_cls]
    ctx = InputProcessingContext(
        model_config,
        tokenizer=cached_get_tokenizer(
            model_config.tokenizer,
            trust_remote_code=model_info.trust_remote_code,
        ),
    )
    # Ensure that it can fit all of the data
    cache = ProcessingCache(capacity=1 << 30)

    processing_info = factories.info(ctx)
    supported_mm_limits = processing_info.get_supported_mm_limits()
    # Modalities reported with no limit (None) are capped at 3 items.
    limit_mm_per_prompt = {
        modality: 3 if limit is None else limit
        for modality, limit in supported_mm_limits.items()
    }

    model_config.get_multimodal_config().limit_per_prompt = limit_mm_per_prompt

    baseline_processor = factories.build_processor(ctx, cache=None)
    cached_processor = factories.build_processor(ctx, cache=cache)
    dummy_inputs = baseline_processor.dummy_inputs
    tokenizer = baseline_processor.info.get_tokenizer()

    # Fixed seed keeps the generated batches reproducible.
    rng = np.random.RandomState(0)

    # Fixed inputs that are reused across batches.
    input_to_hit = {
        "image": Image.new("RGB", size=(128, 128)),
        "video": np.zeros((4, 128, 128, 3), dtype=np.uint8),
        "audio": (np.zeros((512, )), 16000),
    }
    # Generators for fresh random inputs.
    input_factory = {
        "image":
        partial(random_image, rng, min_wh=128, max_wh=256),
        "video":
        partial(random_video,
                rng,
                min_frames=2,
                max_frames=8,
                min_wh=128,
                max_wh=256),
        "audio":
        partial(random_audio, rng, min_len=512, max_len=1024, sr=16000),
    }

    for batch_idx in range(num_batches):
        # `randint(high)` excludes `high`, so `limit + 1` is required for
        # the item count to cover the whole range [0, limit]. Without it a
        # modality limited to one item would never receive any data, and
        # a limit of 0 would raise inside `randint`.
        mm_data = {
            k:
            [(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]())
             for _ in range(rng.randint(limit + 1))]
            for k, limit in limit_mm_per_prompt.items()
        }

        mm_counts = {k: len(vs) for k, vs in mm_data.items()}
        prompt = dummy_inputs.get_dummy_processor_inputs(
            model_config.max_model_len,
            mm_counts,
        ).prompt_text

        # Drop unnecessary keys and test single -> multi conversion
        if rng.rand() < simplify_rate:
            # Iterate over a snapshot of the keys: the dict is mutated.
            for k in list(mm_data):
                if not mm_data[k]:
                    del mm_data[k]
                elif len(mm_data[k]) == 1:
                    mm_data[k] = mm_data[k][0]

        baseline_result = baseline_processor.apply(
            prompt,
            mm_data=mm_data,
            hf_processor_mm_kwargs={},
        )
        cached_result = cached_processor.apply(
            prompt,
            mm_data=mm_data,
            hf_processor_mm_kwargs={},
        )

        assert baseline_result == cached_result, (
            f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")

        baseline_tokenized_result = baseline_processor.apply(
            tokenizer.encode(prompt),
            mm_data=mm_data,
            hf_processor_mm_kwargs={},
        )

        assert baseline_result == baseline_tokenized_result, (
            f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")

        cached_tokenized_result = cached_processor.apply(
            tokenizer.encode(prompt),
            mm_data=mm_data,
            hf_processor_mm_kwargs={},
        )

        assert cached_result == cached_tokenized_result, (
            f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")
# yapf: disable
# True if the model supports multiple data items of the modality per request
@pytest.mark.parametrize(
    "model_id",
    [
        "rhymes-ai/Aria",
        "Salesforce/blip2-opt-2.7b",
        "facebook/chameleon-7b",
        "deepseek-ai/deepseek-vl2-tiny",
        "adept/fuyu-8b",
        "llava-hf/llava-1.5-7b-hf",
        "llava-hf/llava-v1.6-mistral-7b-hf",
        "llava-hf/LLaVA-NeXT-Video-7B-hf",
        "llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
        "TIGER-Lab/Mantis-8B-siglip-llama3",
        "mistral-community/pixtral-12b",
        "openbmb/MiniCPM-o-2_6",
        "openbmb/MiniCPM-V-2_6",
        "Qwen/Qwen-VL-Chat",
        "Qwen/Qwen2-VL-2B-Instruct",
        "Qwen/Qwen2-Audio-7B-Instruct",
        "fixie-ai/ultravox-v0_3",
    ],
)
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
@pytest.mark.parametrize("num_batches", [32])
@pytest.mark.parametrize("simplify_rate", [1.0])
# yapf: enable
def test_processing_correctness(
    model_id: str,
    hit_rate: float,
    num_batches: int,
    simplify_rate: float,
):
    """Run the shared processing-correctness check for each listed model."""
    check_kwargs = {
        "hit_rate": hit_rate,
        "num_batches": num_batches,
        "simplify_rate": simplify_rate,
    }
    _test_processing_correctness(model_id, **check_kwargs)
# yapf: disable
@pytest.mark.parametrize(
    "model_id",
    ["microsoft/Phi-3-vision-128k-instruct"],
)
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
@pytest.mark.parametrize("num_batches", [32])
@pytest.mark.parametrize("simplify_rate", [1.0])
# yapf: enable
def test_processing_correctness_phi3v(
    model_id: str,
    hit_rate: float,
    num_batches: int,
    simplify_rate: float,
):
    """Run the shared processing-correctness check for Phi-3-vision.

    The image processor is loaded up front before the shared check runs;
    see the HACK note below for the reason.
    """
    # HACK - this is an attempted workaround for the following bug
    # https://github.com/huggingface/transformers/issues/34307
    from transformers import AutoImageProcessor  # noqa: F401
    from transformers import AutoProcessor  # noqa: F401

    AutoImageProcessor.from_pretrained(model_id, trust_remote_code=True)

    check_kwargs = {
        "hit_rate": hit_rate,
        "num_batches": num_batches,
        "simplify_rate": simplify_rate,
    }
    _test_processing_correctness(model_id, **check_kwargs)
tests/models/
decoder_only/vision_language/mm_processor_kwargs
/test_idefics3.py
→
tests/models/
multimodal/processing
/test_idefics3.py
View file @
afd0da21
...
@@ -9,9 +9,9 @@ from transformers import AutoImageProcessor, AutoTokenizer
...
@@ -9,9 +9,9 @@ from transformers import AutoImageProcessor, AutoTokenizer
from
vllm.inputs
import
InputContext
,
token_inputs
from
vllm.inputs
import
InputContext
,
token_inputs
from
vllm.multimodal
import
MultiModalRegistry
from
vllm.multimodal
import
MultiModalRegistry
from
....
.
conftest
import
_ImageAssets
from
....conftest
import
_ImageAssets
from
...
.
utils
import
build_model_context
from
...utils
import
build_model_context
from
....
.
utils
import
models_path_prefix
from
....utils
import
models_path_prefix
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"HuggingFaceM4/Idefics3-8B-Llama3"
)]
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"HuggingFaceM4/Idefics3-8B-Llama3"
)]
...
...
tests/models/
decoder_only/vision_language/mm_processor_kwargs
/test_internvl.py
→
tests/models/
multimodal/processing
/test_internvl.py
View file @
afd0da21
...
@@ -8,9 +8,9 @@ from transformers import AutoTokenizer
...
@@ -8,9 +8,9 @@ from transformers import AutoTokenizer
from
vllm.inputs
import
InputContext
,
token_inputs
from
vllm.inputs
import
InputContext
,
token_inputs
from
vllm.multimodal
import
MultiModalRegistry
from
vllm.multimodal
import
MultiModalRegistry
from
....
.
conftest
import
_ImageAssets
from
....conftest
import
_ImageAssets
from
...
.
utils
import
build_model_context
from
...utils
import
build_model_context
from
....
.
utils
import
models_path_prefix
from
....utils
import
models_path_prefix
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"OpenGVLab/InternVL2-2B"
)]
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"OpenGVLab/InternVL2-2B"
)]
...
...
tests/models/multimodal/processing/test_llava_next.py
0 → 100644
View file @
afd0da21
import
itertools
from
functools
import
partial
import
pytest
from
PIL
import
Image
from
pqdm.threads
import
pqdm
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.parse
import
ImageSize
from
vllm.multimodal.processing
import
BaseMultiModalProcessor
from
vllm.multimodal.utils
import
cached_get_tokenizer
from
...utils
import
build_model_context
def _validate_image_max_tokens_one(
    processor: BaseMultiModalProcessor,
    max_tokens: int,
    failed_size_excs: list[tuple[ImageSize, Exception]],
    image_size: ImageSize,
) -> None:
    """Record a failure if one image size needs more than ``max_tokens``.

    Args:
        processor: Processor whose ``info.get_num_image_tokens`` is queried.
        max_tokens: Upper bound the computed token count must not exceed.
        failed_size_excs: Output list; ``(image_size, exception)`` is
            appended when the check fails or the computation raises.
        image_size: Image size to validate (``width`` / ``height`` are read).
    """
    try:
        # Keep the computation inside the `try`: this function runs in a
        # worker whose return value the caller discards, so an exception
        # raised here would otherwise go unreported.
        feature_size = processor.info.get_num_image_tokens(
            image_width=image_size.width,
            image_height=image_size.height,
        )
        assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}"
    except Exception as exc:
        failed_size_excs.append((image_size, exc))
@pytest.mark.skip("This test takes around 5 minutes to run. "
                  "Comment this out to run it manually.")
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
def test_processor_max_tokens(model_id):
    """Check that no sampled image size exceeds the reported max tokens."""
    ctx = build_model_context(
        model_name=model_id,
        tokenizer_name=model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"image": 1},
    )
    tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer)
    processor = MULTIMODAL_REGISTRY.create_processor(
        ctx.model_config,
        tokenizer=tokenizer,
    )
    info = processor.info

    visited_ratios = set[float]()
    image_sizes = list[ImageSize]()

    # The aspect ratio of the grid layout is between 1 and 2
    # NOTE: Assumes that feature size calculation is the same if we
    # swap the width and height of the image
    for width, height in itertools.product(range(32, 4096), repeat=2):
        ratio = width / height
        # Keep only the first size seen for each in-range aspect ratio.
        if not (1 <= ratio <= 2) or ratio in visited_ratios:
            continue
        image_sizes.append(ImageSize(width, height))
        visited_ratios.add(ratio)

    failed_size_excs = list[tuple[ImageSize, Exception]]()

    validate_one = partial(
        _validate_image_max_tokens_one,
        processor,
        info.get_max_image_tokens(),  # type: ignore
        failed_size_excs,
    )
    pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")

    if not failed_size_excs:
        return

    separator = "\n========\n"
    failure_reports = [f"[{size}]\n{exc}" for size, exc in failed_size_excs]
    msg = "Found failing image sizes:" + separator.join(failure_reports)
    raise AssertionError(msg)
def _validate_image_prompt_replacements_one(
    processor: BaseMultiModalProcessor,
    num_imgs: int,
    failed_size_excs: list[tuple[ImageSize, Exception]],
    image_size: ImageSize,
) -> None:
    """Process ``num_imgs`` blank images of one size and check placeholders.

    Any exception raised by the processor or by the checks is appended to
    ``failed_size_excs`` together with ``image_size`` instead of propagating.
    """
    blank_image = Image.new("RGB", size=image_size)
    prompt = "<image>" * num_imgs
    mm_data = {"image": [blank_image] * num_imgs}

    try:
        # The processor will throw an error if there is a mismatch
        # in the prompt replacements
        processed_inputs = processor.apply(prompt, mm_data, {})

        image_placeholders = processed_inputs["mm_placeholders"]["image"]
        assert len(image_placeholders) == num_imgs

        first_placeholder = image_placeholders[0]

        # NOTE: There is a BOS token
        assert first_placeholder["offset"] == 1

        num_prompt_tokens = len(processed_inputs["prompt_token_ids"])
        assert first_placeholder["length"] == (num_prompt_tokens -
                                               1) // num_imgs
    except Exception as exc:
        failed_size_excs.append((image_size, exc))
def _test_image_prompt_replacements(
    processor,
    *,
    num_imgs: int,
    image_sizes: list[ImageSize],
) -> None:
    """
    Validate prompt replacement for every size in ``image_sizes``.

    Each size is checked with ``_validate_image_prompt_replacements_one``;
    if any size fails, a single ``AssertionError`` listing all failures
    is raised.
    """
    failed_size_excs = list[tuple[ImageSize, Exception]]()

    validate_one = partial(
        _validate_image_prompt_replacements_one,
        processor,
        num_imgs,
        failed_size_excs,
    )
    pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")

    if not failed_size_excs:
        return

    separator = "\n========\n"
    failure_reports = [f"[{size}]\n{exc}" for size, exc in failed_size_excs]
    msg = "Found failing image sizes:" + separator.join(failure_reports)
    raise AssertionError(msg)
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
@pytest.mark.parametrize("num_imgs", [1, 2])
def test_processor_prompt_replacements_regression(model_id, num_imgs):
    """Check prompt replacement on a fixed list of image sizes."""
    ctx = build_model_context(
        model_name=model_id,
        tokenizer_name=model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"image": num_imgs},
    )
    tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer)
    processor = MULTIMODAL_REGISTRY.create_processor(
        ctx.model_config,
        tokenizer=tokenizer,
    )

    image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296),
                    (369, 328), (488, 183), (2560, 1669)]

    # Test each pair in both orientations.
    image_sizes = []
    for w, h in image_ratios:
        image_sizes.extend([ImageSize(w, h), ImageSize(h, w)])

    _test_image_prompt_replacements(
        processor,
        num_imgs=num_imgs,
        image_sizes=image_sizes,
    )
@pytest.mark.skip("This test takes around 2 hours to run. "
                  "Comment this out to run it manually.")
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
@pytest.mark.parametrize("num_imgs", [1])
def test_processor_prompt_replacements_all(model_id, num_imgs):
    """Check prompt replacement over a sweep of distinct aspect ratios."""
    ctx = build_model_context(
        model_name=model_id,
        tokenizer_name=model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"image": num_imgs},
    )
    tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer)
    processor = MULTIMODAL_REGISTRY.create_processor(
        ctx.model_config,
        tokenizer=tokenizer,
    )

    visited_ratios = set[float]()
    image_sizes = list[ImageSize]()

    # The aspect ratio of the grid layout is between 1 and 2
    # NOTE: Assumes that feature size calculation is the same if we
    # swap the width and height of the image
    for width, height in itertools.product(range(64, 1024), repeat=2):
        ratio = width / height
        # Keep only the first size seen for each in-range aspect ratio.
        if not (1 <= ratio <= 2) or ratio in visited_ratios:
            continue
        image_sizes.append(ImageSize(width, height))
        visited_ratios.add(ratio)

    _test_image_prompt_replacements(
        processor,
        num_imgs=num_imgs,
        image_sizes=image_sizes,
    )
tests/models/multimodal/processing/test_llava_onevision.py
0 → 100644
View file @
afd0da21
import
itertools
from
functools
import
partial
import
pytest
from
PIL
import
Image
from
pqdm.threads
import
pqdm
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.parse
import
ImageSize
from
vllm.multimodal.processing
import
BaseMultiModalProcessor
from
vllm.multimodal.utils
import
cached_get_tokenizer
from
...utils
import
build_model_context
def _validate_image_max_tokens_one(
    processor: BaseMultiModalProcessor,
    max_tokens: int,
    failed_size_excs: list[tuple[ImageSize, Exception]],
    image_size: ImageSize,
) -> None:
    """Record a failure if one image size needs more than ``max_tokens``.

    Args:
        processor: Processor whose ``info.get_num_image_tokens`` is queried.
        max_tokens: Upper bound the computed token count must not exceed.
        failed_size_excs: Output list; ``(image_size, exception)`` is
            appended when the check fails or the computation raises.
        image_size: Image size to validate (``width`` / ``height`` are read).
    """
    try:
        # Keep the computation inside the `try`: this function runs in a
        # worker whose return value the caller discards, so an exception
        # raised here would otherwise go unreported.
        feature_size = processor.info.get_num_image_tokens(
            image_width=image_size.width,
            image_height=image_size.height,
        )
        assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}"
    except Exception as exc:
        failed_size_excs.append((image_size, exc))
@pytest.mark.skip("This test takes around 5 minutes to run. "
                  "Comment this out to run it manually.")
@pytest.mark.parametrize("model_id",
                         ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
def test_processor_max_tokens(model_id):
    """Check that no sampled image size exceeds the reported max tokens."""
    ctx = build_model_context(
        model_name=model_id,
        tokenizer_name=model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"image": 1},
    )
    tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer)
    processor = MULTIMODAL_REGISTRY.create_processor(
        ctx.model_config,
        tokenizer=tokenizer,
    )
    info = processor.info

    visited_ratios = set[float]()
    image_sizes = list[ImageSize]()

    # The aspect ratio of the grid layout is between 1 and 6
    # NOTE: Assumes that feature size calculation is the same if we
    # swap the width and height of the image
    for width, height in itertools.product(range(32, 4096), repeat=2):
        ratio = width / height
        # Keep only the first size seen for each in-range aspect ratio.
        if not (1 <= ratio <= 6) or ratio in visited_ratios:
            continue
        image_sizes.append(ImageSize(width, height))
        visited_ratios.add(ratio)

    failed_size_excs = list[tuple[ImageSize, Exception]]()

    validate_one = partial(
        _validate_image_max_tokens_one,
        processor,
        info.get_max_image_tokens(),  # type: ignore
        failed_size_excs,
    )
    pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")

    if not failed_size_excs:
        return

    separator = "\n========\n"
    failure_reports = [f"[{size}]\n{exc}" for size, exc in failed_size_excs]
    msg = "Found failing image sizes:" + separator.join(failure_reports)
    raise AssertionError(msg)
def _validate_image_prompt_replacements_one(
    processor: BaseMultiModalProcessor,
    num_imgs: int,
    failed_size_excs: list[tuple[ImageSize, Exception]],
    image_size: ImageSize,
) -> None:
    """Process ``num_imgs`` blank images of one size and check placeholders.

    Any exception raised by the processor or by the checks is appended to
    ``failed_size_excs`` together with ``image_size`` instead of propagating.
    """
    blank_image = Image.new("RGB", size=image_size)
    prompt = "<image>" * num_imgs
    mm_data = {"image": [blank_image] * num_imgs}

    try:
        # The processor will throw an error if there is a mismatch
        # in the prompt replacements
        processed_inputs = processor.apply(prompt, mm_data, {})

        image_placeholders = processed_inputs["mm_placeholders"]["image"]
        assert len(image_placeholders) == num_imgs

        first_placeholder = image_placeholders[0]

        assert first_placeholder["offset"] == 0

        num_prompt_tokens = len(processed_inputs["prompt_token_ids"])
        assert first_placeholder["length"] == num_prompt_tokens // num_imgs
    except Exception as exc:
        failed_size_excs.append((image_size, exc))
def _test_image_prompt_replacements(
    processor,
    *,
    num_imgs: int,
    image_sizes: list[ImageSize],
) -> None:
    """
    Validate prompt replacement for every size in ``image_sizes``.

    Each size is checked with ``_validate_image_prompt_replacements_one``;
    if any size fails, a single ``AssertionError`` listing all failures
    is raised.
    """
    failed_size_excs = list[tuple[ImageSize, Exception]]()

    validate_one = partial(
        _validate_image_prompt_replacements_one,
        processor,
        num_imgs,
        failed_size_excs,
    )
    pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")

    if not failed_size_excs:
        return

    separator = "\n========\n"
    failure_reports = [f"[{size}]\n{exc}" for size, exc in failed_size_excs]
    msg = "Found failing image sizes:" + separator.join(failure_reports)
    raise AssertionError(msg)
@pytest.mark.parametrize("model_id",
                         ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
@pytest.mark.parametrize("num_imgs", [1, 2])
def test_processor_prompt_replacements_regression(model_id, num_imgs):
    """Check prompt replacement on a fixed list of image sizes."""
    ctx = build_model_context(
        model_name=model_id,
        tokenizer_name=model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"image": num_imgs},
    )
    tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer)
    processor = MULTIMODAL_REGISTRY.create_processor(
        ctx.model_config,
        tokenizer=tokenizer,
    )

    image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296),
                    (369, 328), (488, 183), (2560, 1669)]

    # Test each pair in both orientations.
    image_sizes = []
    for w, h in image_ratios:
        image_sizes.extend([ImageSize(w, h), ImageSize(h, w)])

    _test_image_prompt_replacements(
        processor,
        num_imgs=num_imgs,
        image_sizes=image_sizes,
    )
@pytest.mark.skip("This test takes around 2 hours to run. "
                  "Comment this out to run it manually.")
@pytest.mark.parametrize("model_id",
                         ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
@pytest.mark.parametrize("num_imgs", [1])
def test_processor_prompt_replacements_all(model_id, num_imgs):
    """Check prompt replacement over a sweep of distinct aspect ratios."""
    ctx = build_model_context(
        model_name=model_id,
        tokenizer_name=model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"image": num_imgs},
    )
    tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer)
    processor = MULTIMODAL_REGISTRY.create_processor(
        ctx.model_config,
        tokenizer=tokenizer,
    )

    visited_ratios = set[float]()
    image_sizes = list[ImageSize]()

    # The aspect ratio of the grid layout is between 1 and 6
    # NOTE: Assumes that feature size calculation is the same if we
    # swap the width and height of the image
    for width, height in itertools.product(range(64, 1024), repeat=2):
        ratio = width / height
        # Keep only the first size seen for each in-range aspect ratio.
        if not (1 <= ratio <= 6) or ratio in visited_ratios:
            continue
        image_sizes.append(ImageSize(width, height))
        visited_ratios.add(ratio)

    _test_image_prompt_replacements(
        processor,
        num_imgs=num_imgs,
        image_sizes=image_sizes,
    )
Prev
1
…
19
20
21
22
23
24
25
26
27
…
30
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment