Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
cc98f1e0
Unverified
Commit
cc98f1e0
authored
Oct 30, 2024
by
Alex Brooks
Committed by
GitHub
Oct 30, 2024
Browse files
[CI/Build] VLM Test Consolidation (#9372)
Signed-off-by:
Alex-Brooks
<
Alex.Brooks@ibm.com
>
parent
211fe91a
Changes
37
Hide whitespace changes
Inline
Side-by-side
Showing
17 changed files
with
1912 additions
and
833 deletions
+1912
-833
tests/models/decoder_only/vision_language/test_llava_onevision.py
...dels/decoder_only/vision_language/test_llava_onevision.py
+0
-272
tests/models/decoder_only/vision_language/test_minicpmv.py
tests/models/decoder_only/vision_language/test_minicpmv.py
+0
-199
tests/models/decoder_only/vision_language/test_models.py
tests/models/decoder_only/vision_language/test_models.py
+594
-0
tests/models/decoder_only/vision_language/test_paligemma.py
tests/models/decoder_only/vision_language/test_paligemma.py
+0
-174
tests/models/decoder_only/vision_language/test_phi3v.py
tests/models/decoder_only/vision_language/test_phi3v.py
+7
-178
tests/models/decoder_only/vision_language/vlm_utils/__init__.py
...models/decoder_only/vision_language/vlm_utils/__init__.py
+0
-0
tests/models/decoder_only/vision_language/vlm_utils/builders.py
...models/decoder_only/vision_language/vlm_utils/builders.py
+235
-0
tests/models/decoder_only/vision_language/vlm_utils/case_filtering.py
.../decoder_only/vision_language/vlm_utils/case_filtering.py
+157
-0
tests/models/decoder_only/vision_language/vlm_utils/core.py
tests/models/decoder_only/vision_language/vlm_utils/core.py
+141
-0
tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py
...s/decoder_only/vision_language/vlm_utils/custom_inputs.py
+102
-0
tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
...els/decoder_only/vision_language/vlm_utils/model_utils.py
+338
-0
tests/models/decoder_only/vision_language/vlm_utils/runners.py
.../models/decoder_only/vision_language/vlm_utils/runners.py
+130
-0
tests/models/decoder_only/vision_language/vlm_utils/types.py
tests/models/decoder_only/vision_language/vlm_utils/types.py
+187
-0
tests/models/embedding/vision_language/test_llava_next.py
tests/models/embedding/vision_language/test_llava_next.py
+2
-0
tests/models/encoder_decoder/vision_language/test_mllama.py
tests/models/encoder_decoder/vision_language/test_mllama.py
+1
-1
tests/utils.py
tests/utils.py
+16
-8
vllm/utils.py
vllm/utils.py
+2
-1
No files found.
tests/models/decoder_only/vision_language/test_llava_onevision.py
deleted
100644 → 0
View file @
211fe91a
from
typing
import
List
,
Optional
,
Tuple
,
Type
import
pytest
from
transformers
import
(
AutoConfig
,
AutoModelForVision2Seq
,
AutoTokenizer
,
BatchEncoding
)
from
vllm.multimodal.utils
import
(
rescale_image_size
,
rescale_video_size
,
resize_video
,
sample_frames_from_video
)
from
vllm.sequence
import
SampleLogprobs
from
vllm.utils
import
STR_DTYPE_TO_TORCH_DTYPE
from
....conftest
import
(
VIDEO_ASSETS
,
HfRunner
,
PromptImageInput
,
PromptVideoInput
,
VllmRunner
)
from
...utils
import
check_logprobs_close
# Video test
# Prompt text registered for the video asset named "sample_demo_1".
HF_VIDEO_PROMPTS = VIDEO_ASSETS.prompts({
    "sample_demo_1":
    "<|im_start|>user\n<video>\nwhy is this video funny?<|im_end|>\n<|im_start|>assistant\n"  # noqa: E501
})

# Checkpoints the tests in this file are parametrized over.
models = ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"]
def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
                                         Optional[SampleLogprobs]],
                      model: str):
    """Sanitize vllm output to be comparable with hf output.

    Runs of consecutive video placeholder tokens are collapsed to a single
    token, and the decoded EOS token is appended to the text when the
    sanitized ids end with EOS.

    Args:
        vllm_output: ``(token_ids, text, logprobs)`` triple to sanitize.
        model: Name or path passed to ``AutoConfig.from_pretrained`` and
            ``AutoTokenizer.from_pretrained``.

    Returns:
        ``(token_ids, text, logprobs)``; the logprobs are passed through
        unchanged.
    """
    output_ids, output_str, out_logprobs = vllm_output

    config = AutoConfig.from_pretrained(model)
    video_token_id = config.video_token_index

    tokenizer = AutoTokenizer.from_pretrained(model)
    eos_token_id = tokenizer.eos_token_id

    # Keep a video token only when it does not directly follow another video
    # token. The previous id is tracked explicitly: indexing ``idx - 1`` at
    # ``idx == 0`` would wrap around to the *last* token and could wrongly
    # drop a leading video token.
    hf_output_ids: List[int] = []
    previous_id = None
    for token_id in output_ids:
        if token_id != video_token_id or previous_id != video_token_id:
            hf_output_ids.append(token_id)
        previous_id = token_id

    hf_output_str = output_str
    # Guard against an empty generation before looking at the last id.
    if hf_output_ids and hf_output_ids[-1] == eos_token_id:
        hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)

    return hf_output_ids, hf_output_str, out_logprobs
# Video test
_LIMIT_VIDEO_PER_PROMPT = 4


def run_video_test(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
    inputs: List[Tuple[List[str], PromptVideoInput]],
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    num_frames: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    """Compare greedy logprobs from vLLM and HF on video prompts.

    vLLM generates first, then HF; each pair of result batches is handed to
    ``check_logprobs_close`` after the vLLM side has been passed through
    ``vllm_to_hf_output``.

    Args:
        hf_runner: Factory for the HF runner context manager.
        vllm_runner: Factory for the vLLM runner context manager.
        inputs: ``(prompts, videos)`` pairs, one per batch.
        model: Model name or path.
        dtype: Key into ``STR_DTYPE_TO_TORCH_DTYPE``; also forwarded to both
            runners.
        max_tokens: Generation length limit.
        num_logprobs: Number of logprobs requested per token.
        num_frames: Accepted for interface compatibility; not read here.
        tensor_parallel_size: Forwarded to the vLLM runner.
        distributed_executor_backend: Forwarded to the vLLM runner.
    """
    torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]

    vllm_options = dict(
        dtype=dtype,
        max_model_len=16384,
        tensor_parallel_size=tensor_parallel_size,
        distributed_executor_backend=distributed_executor_backend,
        enforce_eager=True,
        limit_mm_per_prompt={"video": _LIMIT_VIDEO_PER_PROMPT},
    )
    vllm_outputs_per_input = []
    with vllm_runner(model, **vllm_options) as vllm_model:
        for prompt_batch, video_batch in inputs:
            vllm_outputs_per_input.append(
                vllm_model.generate_greedy_logprobs(
                    prompt_batch,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    videos=video_batch))

    def process(hf_inputs: BatchEncoding):
        # Cast the video pixel tensor to the dtype the HF model runs in.
        video_pixels = hf_inputs["pixel_values_videos"]
        hf_inputs["pixel_values_videos"] = video_pixels.to(
            torch_dtype)  # type: ignore
        return hf_inputs

    hf_outputs_per_input = []
    with hf_runner(model,
                   dtype=dtype,
                   postprocess_inputs=process,
                   auto_cls=AutoModelForVision2Seq) as hf_model:
        for prompt_batch, video_batch in inputs:
            hf_outputs_per_input.append(
                hf_model.generate_greedy_logprobs_limit(
                    prompt_batch,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    videos=video_batch))

    for hf_batch, vllm_batch in zip(hf_outputs_per_input,
                                    vllm_outputs_per_input):
        # TODO: Check whether using original CLIPVisionModel can improve
        # consistency against HF
        sanitized_vllm = []
        for single_output in vllm_batch:
            sanitized_vllm.append(vllm_to_hf_output(single_output, model))
        check_logprobs_close(
            outputs_0_lst=hf_batch,
            outputs_1_lst=sanitized_vllm,
            name_0="hf",
            name_1="vllm",
        )
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("num_frames", [16])
def test_models_multiple_video_inputs(hf_runner, vllm_runner, video_assets,
                                      model, dtype, max_tokens, num_logprobs,
                                      num_frames) -> None:
    """Run the HF/vLLM comparison on prompts carrying 1, 2 and 4 videos."""
    video = sample_frames_from_video(video_assets[0].np_ndarrays, num_frames)

    # NOTE(review): the original literals used a backslash line continuation
    # inside the string; confirm no whitespace is expected before <|im_end|>.
    chat_tail = "<|im_end|><|im_start|>assistant\n"
    two_videos_prompt = ("<|im_start|>user <video><video>\n"
                         "Describe 2 videos." + chat_tail)
    four_videos_prompt = ("<|im_start|>user <video><video><video><video>\n"
                          "Describe 4 videos." + chat_tail)
    one_video_prompt = ("<|im_start|>user <video>\n"
                        "why is this video funny?" + chat_tail)

    prompts = [
        two_videos_prompt,
        two_videos_prompt,
        four_videos_prompt,
        one_video_prompt,
    ]
    videos = [
        [video, video],
        # Videos with different sizes and aspect-ratios
        [rescale_video_size(video, 0.1), video],
        [
            video,
            rescale_video_size(video, 0.25),
            resize_video(video, (183, 488)),
            resize_video(video, (488, 183)),
        ],
        video,
    ]
    inputs = [(prompts, videos)]

    run_video_test(
        hf_runner,
        vllm_runner,
        inputs,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=1,
        num_frames=num_frames,
    )
# Image test
_LIMIT_IMAGE_PER_PROMPT = 4


def run_image_test(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
    inputs: List[Tuple[List[str], PromptImageInput]],
    model: str,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    """Compare greedy logprobs from vLLM and HF on image prompts.

    vLLM generates first, then HF; each pair of result batches is handed to
    ``check_logprobs_close`` after the vLLM side has been passed through
    ``vllm_to_hf_output``.

    Args:
        hf_runner: Factory for the HF runner context manager.
        vllm_runner: Factory for the vLLM runner context manager.
        inputs: ``(prompts, images)`` pairs, one per batch.
        model: Model name or path.
        dtype: Key into ``STR_DTYPE_TO_TORCH_DTYPE``; also forwarded to both
            runners.
        max_tokens: Generation length limit.
        num_logprobs: Number of logprobs requested per token.
        tensor_parallel_size: Forwarded to the vLLM runner.
        distributed_executor_backend: Forwarded to the vLLM runner.
    """
    torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]

    # max_model_len should be greater than image_feature_size
    vllm_options = dict(
        dtype=dtype,
        max_model_len=16384,
        max_num_seqs=2,
        tensor_parallel_size=tensor_parallel_size,
        distributed_executor_backend=distributed_executor_backend,
        enforce_eager=True,
        limit_mm_per_prompt={"image": _LIMIT_IMAGE_PER_PROMPT},
    )
    vllm_outputs_per_image = []
    with vllm_runner(model, **vllm_options) as vllm_model:
        for prompt_batch, image_batch in inputs:
            vllm_outputs_per_image.append(
                vllm_model.generate_greedy_logprobs(
                    prompt_batch,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=image_batch))

    def process(hf_inputs: BatchEncoding):
        # Cast the image pixel tensor to the dtype the HF model runs in.
        image_pixels = hf_inputs["pixel_values"]
        hf_inputs["pixel_values"] = image_pixels.to(
            torch_dtype)  # type: ignore
        return hf_inputs

    hf_outputs_per_image = []
    with hf_runner(model,
                   dtype=dtype,
                   postprocess_inputs=process,
                   auto_cls=AutoModelForVision2Seq) as hf_model:
        for prompt_batch, image_batch in inputs:
            hf_outputs_per_image.append(
                hf_model.generate_greedy_logprobs_limit(
                    prompt_batch,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=image_batch))

    for hf_batch, vllm_batch in zip(hf_outputs_per_image,
                                    vllm_outputs_per_image):
        # TODO: Check whether using original CLIPVisionModel can improve
        # consistency against HF
        sanitized_vllm = []
        for single_output in vllm_batch:
            sanitized_vllm.append(vllm_to_hf_output(single_output, model))
        check_logprobs_close(
            outputs_0_lst=hf_batch,
            outputs_1_lst=sanitized_vllm,
            name_0="hf",
            name_1="vllm",
        )
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models_multiple_image_inputs(hf_runner, vllm_runner, image_assets,
                                      model, dtype, max_tokens,
                                      num_logprobs) -> None:
    """Run the HF/vLLM comparison on prompts carrying 1, 2 and 4 images."""
    stop_sign = image_assets[0].pil_image
    cherry_blossom = image_assets[1].pil_image

    two_images_prompt = ("<|im_start|>user\n<image><image>\n"
                         "Describe 2 images.<|im_end|>\n"
                         "<|im_start|>assistant\n")
    four_images_prompt = ("<|im_start|>user\n<image><image><image><image>\n"
                          "Describe 4 images.<|im_end|>\n"
                          "<|im_start|>assistant\n")
    one_image_prompt = ("<|im_start|>user\n<image>\n"
                        "What is the season?<|im_end|>\n"
                        "<|im_start|>assistant\n")

    prompts = [
        two_images_prompt,
        two_images_prompt,
        four_images_prompt,
        one_image_prompt,
    ]
    images = [
        [stop_sign, cherry_blossom],
        # Images with different sizes and aspect-ratios
        [rescale_image_size(stop_sign, 0.1), stop_sign],
        [
            stop_sign,
            rescale_image_size(stop_sign, 0.25),
            cherry_blossom.resize((183, 488)),
            cherry_blossom.resize((488, 183)),
        ],
        cherry_blossom,
    ]
    inputs = [(prompts, images)]

    run_image_test(
        hf_runner,
        vllm_runner,
        inputs,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=1,
    )
tests/models/decoder_only/vision_language/test_minicpmv.py
deleted
100644 → 0
View file @
211fe91a
from
typing
import
List
,
Optional
,
Tuple
,
Type
,
Union
import
pytest
import
torch
import
torch.types
from
PIL
import
Image
from
transformers
import
BatchEncoding
from
vllm.multimodal.utils
import
rescale_image_size
from
vllm.sequence
import
SampleLogprobs
from
....conftest
import
IMAGE_ASSETS
,
HfRunner
,
VllmRunner
from
...utils
import
check_logprobs_close
# The image token is placed before "user" on purpose so that the test can pass
# One prompt per image asset, keyed by asset name.
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign":
    "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" \
        "(<image>./</image>)\nWhat's the content of the image?<|eot_id|>" \
        "<|start_header_id|>assistant<|end_header_id|>\n\n",  # noqa: E501
    "cherry_blossom":
    "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" \
        "(<image>./</image>)\nWhat is the season?<|eot_id|>" \
        "<|start_header_id|>assistant<|end_header_id|>\n\n",
})

# Single prompt that carries two image placeholders.
HF_MULTIIMAGE_IMAGE_PROMPT = \
    "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" \
    "(<image>./</image>)\n(<image>./</image>)\n" \
    "Describe these images.<|eot_id|>" \
    "<|start_header_id|>assistant<|end_header_id|>\n\n"

# Checkpoints the tests in this file are parametrized over.
models = ["openbmb/MiniCPM-Llama3-V-2_5"]
def _wrap_inputs(hf_inputs: BatchEncoding):
    """Return a dict holding ``hf_inputs`` under the ``model_inputs`` key."""
    return dict(model_inputs=hf_inputs)
def trunc_hf_output(hf_output: Tuple[List[int], str,
                                     Optional[SampleLogprobs]]):
    """Cut the HF text at ``<|eot_id|>`` when the text ends with that marker.

    When the text ends with the marker, everything from its first occurrence
    onward is removed; otherwise the text is returned as-is. Token ids and
    logprobs are passed through unchanged.
    """
    eot_marker = "<|eot_id|>"
    token_ids, text, logprobs = hf_output

    if not text.endswith(eot_marker):
        return token_ids, text, logprobs

    text_before_marker, _, _ = text.partition(eot_marker)
    return token_ids, text_before_marker, logprobs
# dtype value the tests below are parametrized with.
target_dtype = "half"
def run_test(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
    inputs: List[Tuple[List[str], Union[List[Image.Image],
                                        List[List[Image.Image]]]]],
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    mm_limit: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    """Check that HF and vLLM produce matching inference results.

    Every image fixture used by the test comes from IMAGE_ASSETS.
    The HF runner receives PIL images as input, while the vLLM runner
    receives MultiModalDataDict objects together with the corresponding
    MultiModalConfig.
    The text input is adjusted to follow the vLLM contract, and the text
    output is sanitized so that it can be compared with the HF output.
    """
    # NOTE: the order matters: vLLM runs first, HF second.
    # vLLM needs a fresh process in which CUDA has not been initialized.
    # Running HF first would initialize CUDA, which hurts the
    # multiprocessing backend when it uses fork (the default method).

    # max_model_len should be greater than image_feature_size
    vllm_options = dict(
        max_model_len=4096,
        max_num_seqs=2,
        dtype=dtype,
        limit_mm_per_prompt={"image": mm_limit},
        tensor_parallel_size=tensor_parallel_size,
        distributed_executor_backend=distributed_executor_backend,
        enforce_eager=True,
    )
    vllm_outputs_per_image = []
    with vllm_runner(model, **vllm_options) as vllm_model:
        tokenizer = vllm_model.model.get_tokenizer()
        stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id]
        for prompt_batch, image_batch in inputs:
            vllm_outputs_per_image.append(
                vllm_model.generate_greedy_logprobs(
                    prompt_batch,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=image_batch,
                    stop_token_ids=stop_token_ids))

    hf_model = hf_runner(model, dtype=dtype, postprocess_inputs=_wrap_inputs)
    hf_outputs_per_image = []
    with hf_model:
        with torch.no_grad():
            for prompt_batch, image_batch in inputs:
                hf_outputs_per_image.append(
                    hf_model.generate_greedy_logprobs_limit(
                        prompt_batch,
                        max_tokens,
                        num_logprobs=num_logprobs,
                        images=image_batch,
                        tokenizer=tokenizer))

    for hf_batch, vllm_batch in zip(hf_outputs_per_image,
                                    vllm_outputs_per_image):
        truncated_hf = []
        for single_output in hf_batch:
            truncated_hf.append(trunc_hf_output(single_output))
        check_logprobs_close(
            outputs_0_lst=truncated_hf,
            outputs_1_lst=vllm_batch,
            name_0="hf",
            name_1="vllm",
        )
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
                dtype: str, max_tokens: int, num_logprobs: int) -> None:
    """Run the single-image comparison for each asset at each size factor."""
    inputs_per_image = []
    pil_images = (asset.pil_image for asset in image_assets)
    for image, prompt in zip(pil_images, HF_IMAGE_PROMPTS):
        # One copy of the prompt and one rescaled image per size factor.
        repeated_prompts = [prompt] * len(size_factors)
        rescaled_images = [
            rescale_image_size(image, factor) for factor in size_factors
        ]
        inputs_per_image.append((repeated_prompts, rescaled_images))

    run_test(
        hf_runner,
        vllm_runner,
        inputs_per_image,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=1,
        tensor_parallel_size=1,
    )
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
                             size_factors, dtype: str, max_tokens: int,
                             num_logprobs: int) -> None:
    """Run the multi-image comparison with all assets at each size factor."""
    images = [asset.pil_image for asset in image_assets]

    # One copy of the multi-image prompt per size factor, each paired with
    # every asset rescaled by that factor.
    repeated_prompts = [HF_MULTIIMAGE_IMAGE_PROMPT] * len(size_factors)
    images_per_factor = []
    for factor in size_factors:
        images_per_factor.append(
            [rescale_image_size(image, factor) for image in images])
    inputs_per_case = [(repeated_prompts, images_per_factor)]

    run_test(
        hf_runner,
        vllm_runner,
        inputs_per_case,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=2,
        tensor_parallel_size=1,
    )
tests/models/decoder_only/vision_language/test_models.py
0 → 100644
View file @
cc98f1e0
"""Common tests for testing .generate() functionality for single / multiple
image, embedding, and video support for different VLMs in vLLM.
"""
import
os
from
pathlib
import
PosixPath
from
typing
import
Type
import
pytest
import
transformers
from
transformers
import
AutoModelForVision2Seq
from
vllm.platforms
import
current_platform
from
vllm.utils
import
cuda_device_count_stateless
,
identity
from
....conftest
import
(
IMAGE_ASSETS
,
HfRunner
,
VllmRunner
,
_ImageAssets
,
_VideoAssets
)
from
....utils
import
fork_new_process_for_each_test
,
large_gpu_mark
from
...utils
import
check_outputs_equal
from
.vlm_utils
import
custom_inputs
,
model_utils
,
runners
from
.vlm_utils.case_filtering
import
get_parametrized_options
from
.vlm_utils.types
import
(
CustomTestOptions
,
ExpandableVLMTestArgs
,
VLMTestInfo
,
VLMTestType
)
# This hack is needed for phi3v & paligemma models
# ROCm Triton FA can run into shared memory issues with these models,
# use other backends in the meantime
# FIXME (mattwong, gshtrasb, hongxiayan)
if current_platform.is_rocm():
    # Module-level statement: the variable is set when this file is imported.
    os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
# yapf: disable
# Keyword arguments shared by the "broadcast-*" entries of VLM_TEST_SETTINGS,
# which unpack this dict with ** into their VLMTestInfo constructor call.
COMMON_BROADCAST_SETTINGS = {
    "test_type": VLMTestType.IMAGE,
    "dtype": "half",
    "max_tokens": 5,
    "tensor_parallel_size": 2,
    "image_size_factors": [(.25, 0.5, 1.0)],
    "distributed_executor_backend": (
        "ray",
        "mp",
    )
}
### Test configuration for specific models
# NOTE: The convention of the test settings below is to lead each test key
# with the name of the model arch used in the test, using underscores in place
# of hyphens; this makes it more convenient to filter tests for a specific kind
# of model. For example....
#
# To run all test types for a specific key:
# use the k flag to substring match with a leading square bracket; if the
# model arch happens to be a substring of another one, you can add a
# trailing hyphen. E.g.,
# - pytest $TEST_FILE -k "[llava-"
# prevents matching on "[llava_next-" & will match just the enabled cases
# for llava, i.e., single image, image embedding, and custom input tests.
#
# To run a test for a Test Info for just one of multiple models:
# use the k flag to substring match the model name, e.g.,
# - pytest $TEST_FILE -k OpenGVLab/InternVL2-1B
# prevents matching on OpenGVLab/InternVL2-2B.
#
# You can also combine substrings to match more granularly.
# ex 1:
# pytest $TEST_FILE -k "test_single_image and OpenGVLab/InternVL2-1B"
# will run only test_single_image* for OpenGVLab/InternVL2-1B; this would
# match both wrappers for single image tests, since it also matches
# test_single_image_heavy (which forks if we have a distributed backend)
# ex 2:
# pytest $TEST_FILE -k "[llava- or [intern_vl-"
# will run all of the tests for only llava & internvl.
#
# NOTE you can add --collect-only to any of the above commands to see
# which cases would be selected and deselected by pytest. In general,
# this is a good idea for checking your command first, since tests are slow.
VLM_TEST_SETTINGS
=
{
"blip2"
:
VLMTestInfo
(
models
=
[
"Salesforce/blip2-opt-2.7b"
],
test_type
=
VLMTestType
.
IMAGE
,
prompt_formatter
=
lambda
img_prompt
:
f
"Question:
{
img_prompt
}
Answer:"
,
img_idx_to_prompt
=
lambda
idx
:
""
,
auto_cls
=
AutoModelForVision2Seq
,
vllm_output_post_proc
=
model_utils
.
blip2_vllm_to_hf_output
,
),
"chameleon"
:
VLMTestInfo
(
models
=
[
"facebook/chameleon-7b"
],
test_type
=
VLMTestType
.
IMAGE
,
prompt_formatter
=
lambda
img_prompt
:
f
"USER:
{
img_prompt
}
\n
ASSISTANT:"
,
max_model_len
=
4096
,
auto_cls
=
AutoModelForVision2Seq
,
postprocess_inputs
=
model_utils
.
get_key_type_post_processor
(
"pixel_values"
),
# For chameleon, we only compare the sequences
vllm_output_post_proc
=
lambda
vllm_output
,
model
:
vllm_output
[:
2
],
hf_output_post_proc
=
lambda
hf_output
,
model
:
hf_output
[:
2
],
comparator
=
check_outputs_equal
,
max_tokens
=
8
,
dtype
=
"bfloat16"
,
marks
=
[
pytest
.
mark
.
skipif
(
transformers
.
__version__
.
startswith
(
"4.46"
),
reason
=
"Model broken in HF, see huggingface/transformers#34379"
)
]
),
"fuyu"
:
VLMTestInfo
(
models
=
[
"adept/fuyu-8b"
],
test_type
=
VLMTestType
.
IMAGE
,
prompt_formatter
=
lambda
img_prompt
:
f
"
{
img_prompt
}
\n
"
,
img_idx_to_prompt
=
lambda
idx
:
""
,
max_model_len
=
2048
,
max_num_seqs
=
2
,
use_tokenizer_eos
=
True
,
vllm_output_post_proc
=
model_utils
.
fuyu_vllm_to_hf_output
,
num_logprobs
=
10
,
dtype
=
"bfloat16"
if
current_platform
.
is_cpu
()
else
"half"
,
image_size_factors
=
[(),
(
0.25
,),
(
0.25
,
0.25
,
0.25
),
(
0.25
,
0.2
,
0.15
)],
),
"glm4"
:
VLMTestInfo
(
models
=
[
"THUDM/glm-4v-9b"
],
test_type
=
VLMTestType
.
IMAGE
,
prompt_formatter
=
identity
,
img_idx_to_prompt
=
lambda
idx
:
""
,
max_model_len
=
2048
,
max_num_seqs
=
2
,
dtype
=
"bfloat16"
,
get_stop_token_ids
=
lambda
tok
:
[
151329
,
151336
,
151338
],
marks
=
[
large_gpu_mark
(
min_gb
=
48
)],
patch_hf_runner
=
model_utils
.
glm_patch_hf_runner
,
),
"intern_vl"
:
VLMTestInfo
(
models
=
[
"OpenGVLab/InternVL2-1B"
,
"OpenGVLab/InternVL2-2B"
,
"OpenGVLab/Mono-InternVL-2B"
,
],
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
MULTI_IMAGE
),
prompt_formatter
=
lambda
img_prompt
:
f
"<|im_start|>User
\n
{
img_prompt
}
<|im_end|>
\n
<|im_start|>Assistant
\n
"
,
# noqa: E501
single_image_prompts
=
IMAGE_ASSETS
.
prompts
({
"stop_sign"
:
"<image>
\n
What's the content in the center of the image?"
,
# noqa: E501
"cherry_blossom"
:
"<image>
\n
What is the season?"
,
}),
multi_image_prompt
=
"Image-1: <image>
\n
Image-2: <image>
\n
Describe the two images in short."
,
# noqa: E501
max_model_len
=
4096
,
# NOTE: Mono-InternVL-2B doesn't work with fp16,
# it will result in NaN during inference.
# See: https://huggingface.co/OpenGVLab/Mono-InternVL-2B/discussions/9
dtype
=
"bfloat16"
,
use_tokenizer_eos
=
True
,
patch_hf_runner
=
model_utils
.
internvl_patch_hf_runner
,
),
"llava"
:
VLMTestInfo
(
models
=
[
"llava-hf/llava-1.5-7b-hf"
],
test_type
=
(
VLMTestType
.
EMBEDDING
,
VLMTestType
.
IMAGE
,
VLMTestType
.
CUSTOM_INPUTS
),
prompt_formatter
=
lambda
img_prompt
:
f
"USER:
{
img_prompt
}
\n
ASSISTANT:"
,
convert_assets_to_embeddings
=
model_utils
.
get_llava_embeddings
,
max_model_len
=
4096
,
auto_cls
=
AutoModelForVision2Seq
,
vllm_output_post_proc
=
model_utils
.
llava_image_vllm_to_hf_output
,
custom_test_opts
=
[
CustomTestOptions
(
inputs
=
custom_inputs
.
multi_image_multi_aspect_ratio_inputs
(
formatter
=
lambda
img_prompt
:
f
"USER:
{
img_prompt
}
\n
ASSISTANT:"
),
limit_mm_per_prompt
=
{
"image"
:
4
},
)],
),
"llava_next"
:
VLMTestInfo
(
models
=
[
"llava-hf/llava-v1.6-mistral-7b-hf"
],
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
CUSTOM_INPUTS
),
prompt_formatter
=
lambda
img_prompt
:
f
"[INST]
{
img_prompt
}
[/INST]"
,
max_model_len
=
10240
,
auto_cls
=
AutoModelForVision2Seq
,
vllm_output_post_proc
=
model_utils
.
llava_image_vllm_to_hf_output
,
custom_test_opts
=
[
CustomTestOptions
(
inputs
=
custom_inputs
.
multi_image_multi_aspect_ratio_inputs
(
formatter
=
lambda
img_prompt
:
f
"[INST]
{
img_prompt
}
[/INST]"
),
limit_mm_per_prompt
=
{
"image"
:
4
},
)],
# Llava-next tests fixed sizes & the default size factors
image_sizes
=
[((
1669
,
2560
),
(
2560
,
1669
),
(
183
,
488
),
(
488
,
183
))],
),
"llava_one_vision"
:
VLMTestInfo
(
models
=
[
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
],
test_type
=
VLMTestType
.
CUSTOM_INPUTS
,
prompt_formatter
=
lambda
vid_prompt
:
f
"<|im_start|>user
\n
{
vid_prompt
}
<|im_end|>
\n
<|im_start|>assistant
\n
"
,
# noqa: E501
dtype
=
"half"
,
num_video_frames
=
16
,
max_model_len
=
16384
,
postprocess_inputs
=
model_utils
.
get_key_type_post_processor
(
"pixel_values_videos"
),
auto_cls
=
AutoModelForVision2Seq
,
vllm_output_post_proc
=
model_utils
.
llava_onevision_vllm_to_hf_output
,
# Llava-one-vision tests fixed sizes & the default size factors
image_sizes
=
[((
1669
,
2560
),
(
2560
,
1669
),
(
183
,
488
),
(
488
,
183
))],
runner_mm_key
=
"videos"
,
custom_test_opts
=
[
CustomTestOptions
(
inputs
=
custom_inputs
.
multi_video_multi_aspect_ratio_inputs
(
formatter
=
lambda
vid_prompt
:
f
"<|im_start|>user
\n
{
vid_prompt
}
<|im_end|>
\n
<|im_start|>assistant
\n
"
,
# noqa: E501
),
limit_mm_per_prompt
=
{
"video"
:
4
},
)],
),
# FIXME
"llava_next_video"
:
VLMTestInfo
(
models
=
[
"llava-hf/LLaVA-NeXT-Video-7B-hf"
],
test_type
=
VLMTestType
.
VIDEO
,
prompt_formatter
=
lambda
vid_prompt
:
f
"USER:
{
vid_prompt
}
ASSISTANT:"
,
num_video_frames
=
16
,
max_model_len
=
4096
,
auto_cls
=
AutoModelForVision2Seq
,
vllm_output_post_proc
=
model_utils
.
llava_video_vllm_to_hf_output
,
image_sizes
=
[((
1669
,
2560
),
(
2560
,
1669
),
(
183
,
488
),
(
488
,
183
))],
runner_mm_key
=
"videos"
,
marks
=
[
pytest
.
mark
.
skip
(
reason
=
"LLava next video tests currently fail."
)
],
),
"minicpmv"
:
VLMTestInfo
(
models
=
[
"openbmb/MiniCPM-Llama3-V-2_5"
],
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
MULTI_IMAGE
),
prompt_formatter
=
lambda
img_prompt
:
f
"<|begin_of_text|><|start_header_id|>user<|end_header_id|>
\n\n
{
img_prompt
}
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
\n\n
"
,
# noqa: E501
img_idx_to_prompt
=
lambda
idx
:
"(<image>./</image>)
\n
"
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
get_stop_token_ids
=
lambda
tok
:
[
tok
.
eos_id
,
tok
.
eot_id
],
postprocess_inputs
=
model_utils
.
wrap_inputs_post_processor
,
hf_output_post_proc
=
model_utils
.
minicmpv_trunc_hf_output
,
),
"paligemma"
:
VLMTestInfo
(
models
=
[
"google/paligemma-3b-mix-224"
],
test_type
=
VLMTestType
.
IMAGE
,
prompt_formatter
=
identity
,
img_idx_to_prompt
=
lambda
idx
:
""
,
# Paligemma uses its own sample prompts because the default one fails
single_image_prompts
=
IMAGE_ASSETS
.
prompts
({
"stop_sign"
:
"caption es"
,
"cherry_blossom"
:
"What is in the picture?"
,
}),
auto_cls
=
AutoModelForVision2Seq
,
postprocess_inputs
=
model_utils
.
get_key_type_post_processor
(
"pixel_values"
),
vllm_output_post_proc
=
model_utils
.
paligemma_vllm_to_hf_output
,
dtype
=
"half"
if
current_platform
.
is_rocm
()
else
(
"half"
,
"float"
),
),
# Tests for phi3v currently live in another file because of a bug in
# transformers. Once this issue is fixed, we can enable them here instead.
# https://github.com/huggingface/transformers/issues/34307
# "phi3v": VLMTestInfo(
# models=["microsoft/Phi-3.5-vision-instruct"],
# test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
# prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|end|>\n<|assistant|>\n", # noqa: E501
# img_idx_to_prompt=lambda idx: f"<|image_{idx}|>\n",
# max_model_len=4096,
# max_num_seqs=2,
# task="generate",
# # use eager mode for hf runner since phi3v didn't work with flash_attn
# model_kwargs={"_attn_implementation": "eager"},
# use_tokenizer_eos=True,
# vllm_output_post_proc=model_utils.phi3v_vllm_to_hf_output,
# num_logprobs=10,
# ),
"qwen"
:
VLMTestInfo
(
models
=
[
"Qwen/Qwen-VL"
],
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
MULTI_IMAGE
),
prompt_formatter
=
identity
,
img_idx_to_prompt
=
lambda
idx
:
f
"Picture
{
idx
}
: <img></img>
\n
"
,
max_model_len
=
1024
,
max_num_seqs
=
2
,
vllm_output_post_proc
=
model_utils
.
qwen_vllm_to_hf_output
,
prompt_path_encoder
=
model_utils
.
qwen_prompt_path_encoder
,
),
### Tensor parallel / multi-gpu broadcast tests
"broadcast-chameleon"
:
VLMTestInfo
(
models
=
[
"facebook/chameleon-7b"
],
prompt_formatter
=
lambda
img_prompt
:
f
"USER:
{
img_prompt
}
\n
ASSISTANT:"
,
max_model_len
=
4096
,
auto_cls
=
AutoModelForVision2Seq
,
postprocess_inputs
=
model_utils
.
get_key_type_post_processor
(
"pixel_values"
),
vllm_output_post_proc
=
lambda
vllm_output
,
model
:
vllm_output
[:
2
],
hf_output_post_proc
=
lambda
hf_output
,
model
:
hf_output
[:
2
],
comparator
=
check_outputs_equal
,
marks
=
[
pytest
.
mark
.
distributed_2_gpus
,
pytest
.
mark
.
skipif
(
cuda_device_count_stateless
()
<
2
,
reason
=
"Need at least 2 GPUs to run the test."
,
),
pytest
.
mark
.
skipif
(
transformers
.
__version__
.
startswith
(
"4.46"
),
reason
=
"Model broken in HF, see huggingface/transformers#34379"
)
],
**
COMMON_BROADCAST_SETTINGS
# type: ignore
),
"broadcast-llava"
:
VLMTestInfo
(
models
=
[
"llava-hf/llava-1.5-7b-hf"
],
prompt_formatter
=
lambda
img_prompt
:
f
"USER:
{
img_prompt
}
\n
ASSISTANT:"
,
max_model_len
=
4096
,
auto_cls
=
AutoModelForVision2Seq
,
vllm_output_post_proc
=
model_utils
.
llava_image_vllm_to_hf_output
,
marks
=
[
pytest
.
mark
.
distributed_2_gpus
,
pytest
.
mark
.
skipif
(
cuda_device_count_stateless
()
<
2
,
reason
=
"Need at least 2 GPUs to run the test."
,
)
],
**
COMMON_BROADCAST_SETTINGS
# type: ignore
),
"broadcast-llava_next"
:
VLMTestInfo
(
models
=
[
"llava-hf/llava-v1.6-mistral-7b-hf"
],
prompt_formatter
=
lambda
img_prompt
:
f
"[INST]
{
img_prompt
}
[/INST]"
,
max_model_len
=
10240
,
auto_cls
=
AutoModelForVision2Seq
,
vllm_output_post_proc
=
model_utils
.
llava_image_vllm_to_hf_output
,
marks
=
[
pytest
.
mark
.
distributed_2_gpus
,
pytest
.
mark
.
skipif
(
cuda_device_count_stateless
()
<
2
,
reason
=
"Need at least 2 GPUs to run the test."
,
)
],
**
COMMON_BROADCAST_SETTINGS
# type: ignore
),
### Custom input edge-cases for specific models
"intern_vl-diff-patches"
:
VLMTestInfo
(
models
=
[
"OpenGVLab/InternVL2-2B"
],
prompt_formatter
=
lambda
img_prompt
:
f
"<|im_start|>User
\n
{
img_prompt
}
<|im_end|>
\n
<|im_start|>Assistant
\n
"
,
# noqa: E501
test_type
=
VLMTestType
.
CUSTOM_INPUTS
,
max_model_len
=
4096
,
dtype
=
"bfloat16"
if
current_platform
.
is_cpu
()
else
"half"
,
use_tokenizer_eos
=
True
,
patch_hf_runner
=
model_utils
.
internvl_patch_hf_runner
,
custom_test_opts
=
[
CustomTestOptions
(
inputs
=
inp
,
limit_mm_per_prompt
=
{
"image"
:
2
},
)
for
inp
in
custom_inputs
.
different_patch_input_cases_internvl
()
],
),
"llava_one_vision-multiple-images"
:
VLMTestInfo
(
models
=
[
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
],
test_type
=
VLMTestType
.
CUSTOM_INPUTS
,
max_model_len
=
16384
,
max_num_seqs
=
2
,
dtype
=
"half"
,
postprocess_inputs
=
model_utils
.
get_key_type_post_processor
(
"pixel_values"
),
auto_cls
=
AutoModelForVision2Seq
,
vllm_output_post_proc
=
model_utils
.
llava_onevision_vllm_to_hf_output
,
custom_test_opts
=
[
CustomTestOptions
(
inputs
=
custom_inputs
.
multi_image_multi_aspect_ratio_inputs
(
formatter
=
lambda
vid_prompt
:
f
"<|im_start|>user
\n
{
vid_prompt
}
<|im_end|>
\n
<|im_start|>assistant
\n
"
,
# noqa: E501
),
limit_mm_per_prompt
=
{
"image"
:
4
},
)],
),
}
# yapf: enable
### Test wrappers
# Wrappers around the core test running func for:
# - single image
# - multi-image
# - image embeddings
# - video
# - custom inputs
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.IMAGE,
        fork_new_process_for_each_test=False,
    ))
def test_single_image_models(tmp_path: PosixPath, model_type: str,
                             test_case: ExpandableVLMTestArgs,
                             hf_runner: Type[HfRunner],
                             vllm_runner: Type[VllmRunner],
                             image_assets: _ImageAssets):
    """Run one parametrized single-image case.

    Looks up the ``VLM_TEST_SETTINGS`` entry for ``model_type`` and delegates
    to ``runners.run_single_image_test``.
    """
    runners.run_single_image_test(
        tmp_path=tmp_path,
        model_test_info=VLM_TEST_SETTINGS[model_type],
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.MULTI_IMAGE,
        fork_new_process_for_each_test=False,
    ))
def test_multi_image_models(tmp_path: PosixPath, model_type: str,
                            test_case: ExpandableVLMTestArgs,
                            hf_runner: Type[HfRunner],
                            vllm_runner: Type[VllmRunner],
                            image_assets: _ImageAssets):
    """Run one parametrized multi-image case.

    Looks up the ``VLM_TEST_SETTINGS`` entry for ``model_type`` and delegates
    to ``runners.run_multi_image_test``.
    """
    runners.run_multi_image_test(
        tmp_path=tmp_path,
        model_test_info=VLM_TEST_SETTINGS[model_type],
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.EMBEDDING,
        fork_new_process_for_each_test=False,
    ))
def test_image_embedding_models(model_type: str,
                                test_case: ExpandableVLMTestArgs,
                                hf_runner: Type[HfRunner],
                                vllm_runner: Type[VllmRunner],
                                image_assets: _ImageAssets):
    """Run one expanded image-embedding case for ``model_type``.

    Parametrized over the EMBEDDING settings selected with
    ``fork_new_process_for_each_test=False``; the model's entry in
    VLM_TEST_SETTINGS is forwarded to the shared embedding runner.
    """
    runners.run_embedding_test(
        model_test_info=VLM_TEST_SETTINGS[model_type],
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.VIDEO,
        fork_new_process_for_each_test=False,
    ))
def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
                      hf_runner: Type[HfRunner],
                      vllm_runner: Type[VllmRunner],
                      video_assets: _VideoAssets):
    """Run one expanded video case for ``model_type``.

    Parametrized over the VIDEO settings selected with
    ``fork_new_process_for_each_test=False``; the model's entry in
    VLM_TEST_SETTINGS is forwarded to the shared video runner.
    """
    runners.run_video_test(
        model_test_info=VLM_TEST_SETTINGS[model_type],
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        video_assets=video_assets,
    )
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.CUSTOM_INPUTS,
        fork_new_process_for_each_test=False,
    ))
def test_custom_inputs_models(
    model_type: str,
    test_case: ExpandableVLMTestArgs,
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
):
    """Run one expanded custom-inputs case for ``model_type``.

    Parametrized over the CUSTOM_INPUTS settings selected with
    ``fork_new_process_for_each_test=False``; the model's entry in
    VLM_TEST_SETTINGS is forwarded to the shared custom-inputs runner.
    """
    runners.run_custom_inputs_test(
        model_test_info=VLM_TEST_SETTINGS[model_type],
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
    )
#### Test wrappers for the cases that must each run in a newly forked process
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.IMAGE,
        fork_new_process_for_each_test=True,
    ))
@fork_new_process_for_each_test
def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
                                   test_case: ExpandableVLMTestArgs,
                                   hf_runner: Type[HfRunner],
                                   vllm_runner: Type[VllmRunner],
                                   image_assets: _ImageAssets):
    """Run one expanded single-image case under the fork decorator.

    Parametrized over the IMAGE settings selected with
    ``fork_new_process_for_each_test=True``; the model's entry in
    VLM_TEST_SETTINGS is forwarded to the shared single-image runner.
    """
    runners.run_single_image_test(
        tmp_path=tmp_path,
        model_test_info=VLM_TEST_SETTINGS[model_type],
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.MULTI_IMAGE,
        fork_new_process_for_each_test=True,
    ))
@fork_new_process_for_each_test
def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
                                  test_case: ExpandableVLMTestArgs,
                                  hf_runner: Type[HfRunner],
                                  vllm_runner: Type[VllmRunner],
                                  image_assets: _ImageAssets):
    """Run one expanded multi-image case under the fork decorator.

    Parametrized over the MULTI_IMAGE settings selected with
    ``fork_new_process_for_each_test=True``; the model's entry in
    VLM_TEST_SETTINGS is forwarded to the shared multi-image runner.
    """
    runners.run_multi_image_test(
        tmp_path=tmp_path,
        model_test_info=VLM_TEST_SETTINGS[model_type],
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.EMBEDDING,
        fork_new_process_for_each_test=True,
    ))
@fork_new_process_for_each_test
def test_image_embedding_models_heavy(model_type: str,
                                      test_case: ExpandableVLMTestArgs,
                                      hf_runner: Type[HfRunner],
                                      vllm_runner: Type[VllmRunner],
                                      image_assets: _ImageAssets):
    """Run one expanded image-embedding case under the fork decorator.

    Parametrized over the EMBEDDING settings selected with
    ``fork_new_process_for_each_test=True``; the model's entry in
    VLM_TEST_SETTINGS is forwarded to the shared embedding runner.
    """
    runners.run_embedding_test(
        model_test_info=VLM_TEST_SETTINGS[model_type],
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.VIDEO,
        fork_new_process_for_each_test=True,
    ))
# The cases selected above are the ones filtered with
# fork_new_process_for_each_test=True, so this wrapper needs the fork
# decorator just like every other *_heavy wrapper in this file.
@fork_new_process_for_each_test
def test_video_models_heavy(model_type: str,
                            test_case: ExpandableVLMTestArgs,
                            hf_runner: Type[HfRunner],
                            vllm_runner: Type[VllmRunner],
                            video_assets: _VideoAssets):
    """Run one expanded video case under the fork decorator.

    Parametrized over the VIDEO settings selected with
    ``fork_new_process_for_each_test=True``; the model's entry in
    VLM_TEST_SETTINGS is forwarded to the shared video runner.
    """
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_video_test(
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        video_assets=video_assets,
    )
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.CUSTOM_INPUTS,
        fork_new_process_for_each_test=True,
    ))
@fork_new_process_for_each_test
def test_custom_inputs_models_heavy(
    model_type: str,
    test_case: ExpandableVLMTestArgs,
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
):
    """Run one expanded custom-inputs case under the fork decorator.

    Parametrized over the CUSTOM_INPUTS settings selected with
    ``fork_new_process_for_each_test=True``; the model's entry in
    VLM_TEST_SETTINGS is forwarded to the shared custom-inputs runner.
    """
    runners.run_custom_inputs_test(
        model_test_info=VLM_TEST_SETTINGS[model_type],
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
    )
tests/models/decoder_only/vision_language/test_paligemma.py
deleted
100644 → 0
View file @
211fe91a
import
os
from
typing
import
List
,
Optional
,
Tuple
,
Type
import
pytest
from
transformers
import
(
AutoConfig
,
AutoModelForVision2Seq
,
AutoTokenizer
,
BatchEncoding
)
from
vllm.multimodal.utils
import
rescale_image_size
from
vllm.platforms
import
current_platform
from
vllm.sequence
import
SampleLogprobs
from
vllm.utils
import
STR_DTYPE_TO_TORCH_DTYPE
from
....conftest
import
IMAGE_ASSETS
,
HfRunner
,
VllmRunner
,
_ImageAssets
from
...utils
import
check_logprobs_close
# Prompt for each image asset, keyed by asset name.
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign": "caption es",
    "cherry_blossom": "What is in the picture?",
})

# Model identifiers the tests in this module are parametrized over.
models = ["google/paligemma-3b-mix-224"]

# ROCm Triton FA can run into compilation issues with these models due to
# excessive use of shared memory. Use other backends in the meantime.
# FIXME (mattwong, gshtrasb, hongxiayan)
if current_platform.is_rocm():
    os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
                                         Optional[SampleLogprobs]],
                      model: str):
    """Sanitize vllm output to be comparable with hf output.

    Runs of consecutive image placeholder tokens in the output ids are
    collapsed to a single token, and the decoded EOS token is appended to
    the output text when the sanitized ids end with EOS.

    Args:
        vllm_output: ``(output_ids, output_str, out_logprobs)`` from vLLM.
        model: Model name/path used to load the HF config and tokenizer.

    Returns:
        ``(hf_output_ids, hf_output_str, out_logprobs)``; the logprobs are
        passed through unchanged.
    """
    output_ids, output_str, out_logprobs = vllm_output

    config = AutoConfig.from_pretrained(model)
    image_token_id = config.image_token_index

    tokenizer = AutoTokenizer.from_pretrained(model)
    eos_token_id = tokenizer.eos_token_id

    # Drop an image token only when the token directly before it is also an
    # image token. The first token has no predecessor and is always kept;
    # indexing ``output_ids[idx - 1]`` at idx == 0 would wrap around and
    # compare against the *last* token instead.
    hf_output_ids = [
        token_id for idx, token_id in enumerate(output_ids)
        if token_id != image_token_id or idx == 0
        or output_ids[idx - 1] != image_token_id
    ]

    hf_output_str = output_str
    # Guard against an empty generation before peeking at the last token.
    if hf_output_ids and hf_output_ids[-1] == eos_token_id:
        hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)

    return hf_output_ids, hf_output_str, out_logprobs
def run_test(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
    image_assets: _ImageAssets,
    model: str,
    *,
    size_factors: List[float],
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    """Check that HF and vLLM produce close greedy logprobs for ``model``.

    Every image asset is paired with its prompt from HF_IMAGE_PROMPTS and
    rescaled by each entry of ``size_factors``. vLLM generates first, then
    HF, and the vLLM outputs are sanitized with ``vllm_to_hf_output`` before
    being compared through ``check_logprobs_close``.
    """
    pixel_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
    pil_images = [asset.pil_image for asset in image_assets]

    # One (prompts, images) pair per asset; both lists have one entry per
    # size factor.
    inputs_per_image = []
    for pil_image, prompt in zip(pil_images, HF_IMAGE_PROMPTS):
        repeated_prompts = [prompt for _ in size_factors]
        rescaled_images = [
            rescale_image_size(pil_image, factor) for factor in size_factors
        ]
        inputs_per_image.append((repeated_prompts, rescaled_images))

    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).

    # max_model_len should be greater than image_feature_size
    vllm_outputs_per_image = []
    with vllm_runner(
            model,
            dtype=dtype,
            tensor_parallel_size=tensor_parallel_size,
            distributed_executor_backend=distributed_executor_backend,
            enforce_eager=True,
    ) as vllm_model:
        for batch_prompts, batch_images in inputs_per_image:
            vllm_outputs_per_image.append(
                vllm_model.generate_greedy_logprobs(
                    batch_prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=batch_images,
                ))

    def cast_pixel_values(hf_inputs: BatchEncoding):
        # Move the HF pixel values to the dtype under test.
        pixel_values = hf_inputs["pixel_values"]
        hf_inputs["pixel_values"] = pixel_values.to(
            pixel_dtype)  # type: ignore
        return hf_inputs

    hf_outputs_per_image = []
    with hf_runner(
            model,
            dtype=dtype,
            postprocess_inputs=cast_pixel_values,
            auto_cls=AutoModelForVision2Seq,
    ) as hf_model:
        for batch_prompts, batch_images in inputs_per_image:
            hf_outputs_per_image.append(
                hf_model.generate_greedy_logprobs_limit(
                    batch_prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=batch_images,
                ))

    paired_outputs = zip(hf_outputs_per_image, vllm_outputs_per_image)
    for hf_outputs, vllm_outputs in paired_outputs:
        sanitized_vllm_outputs = [
            vllm_to_hf_output(vllm_output, model)
            for vllm_output in vllm_outputs
        ]
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=sanitized_vllm_outputs,
            name_0="hf",
            name_1="vllm",
        )
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", [
    pytest.param(
        "float",
        marks=pytest.mark.skipif(
            current_platform.is_rocm(),
            reason=
            "ROCm FA does not yet fully support 32-bit precision on PaliGemma")
    ), "half"
])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
                dtype: str, max_tokens: int, num_logprobs: int) -> None:
    """Compare HF and vLLM outputs for each parametrized combination.

    All parameters are forwarded to ``run_test`` with a tensor parallel
    size of 1.
    """
    run_test(
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
        model=model,
        size_factors=size_factors,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=1,
    )
tests/models/decoder_only/vision_language/test_phi3v.py
View file @
cc98f1e0
...
...
@@ -3,19 +3,14 @@ import re
from
typing
import
List
,
Optional
,
Tuple
,
Type
import
pytest
import
torch
from
transformers
import
AutoImageProcessor
,
AutoTokenizer
from
transformers
import
AutoTokenizer
from
vllm.inputs
import
InputContext
,
token_inputs
from
vllm.model_executor.models.phi3v
import
_IMAGE_TOKEN_ID
from
vllm.multimodal
import
MultiModalRegistry
from
vllm.multimodal.utils
import
rescale_image_size
from
vllm.platforms
import
current_platform
from
vllm.sequence
import
SampleLogprobs
from
....conftest
import
(
IMAGE_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
,
_ImageAssets
)
from
...utils
import
build_model_context
,
check_logprobs_close
from
....conftest
import
IMAGE_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
from
...utils
import
check_logprobs_close
HF_IMAGE_PROMPTS
=
IMAGE_ASSETS
.
prompts
({
"stop_sign"
:
...
...
@@ -81,12 +76,15 @@ def run_test(
Note, the text input is also adjusted to abide by vllm contract.
The text output is sanitized to be able to compare with hf.
"""
# HACK - this is an attempted workaround for the following bug
# https://github.com/huggingface/transformers/issues/34307
from
transformers
import
AutoImageProcessor
# noqa: F401
from
transformers
import
AutoProcessor
# noqa: F401
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
# max_model_len should be greater than image_feature_size
with
vllm_runner
(
model
,
task
=
"generate"
,
...
...
@@ -236,172 +234,3 @@ def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
mm_limit
=
2
,
tensor_parallel_size
=
1
,
)
### Fast tests for correctness in processor_kwarg override handling
# Wrap lazy imports to avoid initializing CUDA during test collection
@pytest.fixture()
def input_processor_for_phi3v():
    """Provide vLLM's ``input_processor_for_phi3v`` callable.

    The phi3v module is imported inside the fixture body rather than at
    module level, so the import is deferred until the fixture is requested.
    """
    from vllm.model_executor.models import phi3v
    return phi3v.input_processor_for_phi3v
@pytest.fixture()
def dummy_data_for_phi3v():
    """Provide vLLM's ``dummy_data_for_phi3v`` callable.

    The phi3v module is imported inside the fixture body rather than at
    module level, so the import is deferred until the fixture is requested.
    """
    from vllm.model_executor.models import phi3v
    return phi3v.dummy_data_for_phi3v
@pytest.fixture()
def get_max_phi3v_image_tokens():
    """Provide vLLM's ``get_max_phi3v_image_tokens`` callable.

    The phi3v module is imported inside the fixture body rather than at
    module level, so the import is deferred until the fixture is requested.
    """
    from vllm.model_executor.models import phi3v
    return phi3v.get_max_phi3v_image_tokens
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("num_crops", [4, 16, None])
def test_input_mapper_override(model: str, image_assets: _ImageAssets,
                               num_crops: Optional[int]):
    """Ensure that the [default] input mapper handles num_crops properly.

    The first image asset is preprocessed by the HF image processor and
    mapped through a fresh ``MultiModalRegistry``; both results must match.
    """
    # We pass the processor kwargs here since for this model, we fall back to
    # the default mapper; this will fall back to the HF mapper and forward
    # mm_processor_kwargs to it.
    if num_crops is None:
        mm_processor_kwargs = {}
    else:
        mm_processor_kwargs = {"num_crops": num_crops}

    ctx = build_model_context(
        model_name=model,
        tokenizer_name=model,
        trust_remote_code=True,
        mm_processor_kwargs=mm_processor_kwargs,
    )
    model_config = ctx.model_config

    hf_processor = AutoImageProcessor.from_pretrained(
        model,
        trust_remote_code=True,
        **mm_processor_kwargs,
    )

    mm_registry = MultiModalRegistry()
    mm_registry.init_mm_limits_per_prompt(model_config)

    image = image_assets[0].pil_image
    hf_result = hf_processor.preprocess(image, return_tensors="pt")
    vllm_result = mm_registry.map_input(model_config, {"image": image})

    # Every tensor produced by the HF processor must match the mapped one.
    for key in ("image_sizes", "num_img_tokens", "pixel_values"):
        assert torch.all(hf_result[key] == vllm_result[key])

    # For pixel values, the second axis should be the num_crops + 1
    # for the rescaled original image. The default value in VLLM falls
    # back to the HF config, which is why we compare to the processor num_crops
    expected_num_patches = hf_processor.num_crops + 1
    assert vllm_result["pixel_values"].shape[1] == expected_num_patches
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("num_crops,expected_max_tokens", [
    (4, 781),
    (16, 2653),
])
def test_max_tokens_override(get_max_phi3v_image_tokens, model: str,
                             num_crops: int, expected_max_tokens: int):
    """Ensure get_max_phi3v_image_tokens handles num_crops properly."""
    # NOTE: the model context is deliberately built with
    # mm_processor_kwargs=None. In practice, the processor kwargs are wrapped
    # in a closure when calling the max tokens func; here num_crops is passed
    # directly as a keyword argument so that the result cannot depend on
    # kwargs stored on the context.
    ctx = build_model_context(
        model_name=model,
        tokenizer_name=model,
        trust_remote_code=True,
        mm_processor_kwargs=None,
    )
    input_ctx = InputContext(ctx.model_config)

    actual_max_tokens = get_max_phi3v_image_tokens(
        input_ctx,
        num_crops=num_crops,
    )

    assert expected_max_tokens == actual_max_tokens
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("num_crops,toks_per_img,num_imgs", [
    (4, 781, 1),
    (4, 781, 2),
    (16, 2653, 1),
    (16, 2653, 2),
])
def test_dummy_data_override(dummy_data_for_phi3v, model: str, num_crops: int,
                             toks_per_img: int, num_imgs: int):
    """Ensure dummy_data_for_phi3v handles num_crops properly."""
    # Same as the previous test - don't initialize mm_processor_kwargs
    # in this test and assume that the kwargs will be correctly expanded by
    # the partial when calling the dummy data func.
    ctx = build_model_context(
        model_name=model,
        tokenizer_name=model,
        trust_remote_code=True,
        mm_processor_kwargs=None,
    )

    dummy_data = dummy_data_for_phi3v(
        ctx=ctx,
        seq_len=8192,  # Should be bigger than num_imgs * toks_per_img
        mm_counts={"image": num_imgs},
        num_crops=num_crops,
    )
    sequence_data, _ = dummy_data

    # Ensure we have the right number of placeholders per num_crops size
    token_ids = sequence_data.get_token_ids()
    img_tok_count = token_ids.count(_IMAGE_TOKEN_ID)
    assert img_tok_count == toks_per_img * num_imgs
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("num_crops,expected_toks_per_img,num_imgs", [
    (4, 757, 1),
    (4, 757, 2),
    (16, 1921, 1),
    (16, 1921, 2),
])
def test_input_processor_override(input_processor_for_phi3v,
                                  image_assets: _ImageAssets, model: str,
                                  num_crops: int, expected_toks_per_img: int,
                                  num_imgs: int):
    """Ensure input_processor_for_phi3v handles num_crops properly."""
    # Same as the previous test - don't initialize mm_processor_kwargs
    # in this test and assume that the kwargs will be correctly expanded by
    # the partial when calling the custom input processor.
    ctx = build_model_context(
        model_name=model,
        tokenizer_name=model,
        trust_remote_code=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(model)

    # Build the image str / prompt based on the number of images we pass
    img_tags = (f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1))
    img_str = "".join(img_tags)
    prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n"
    # The same first asset is reused for every image slot in the prompt.
    images = [image_assets[0].pil_image] * num_imgs

    prompt_token_ids = tokenizer.encode(prompt)
    inputs = token_inputs(
        prompt_token_ids=prompt_token_ids,
        prompt=prompt,
        multi_modal_data={"image": images},
    )

    processed_inputs = input_processor_for_phi3v(ctx,
                                                 inputs,
                                                 num_crops=num_crops)

    # Ensure we have the right number of placeholders per num_crops size
    processed_ids = processed_inputs["prompt_token_ids"]
    img_tok_count = processed_ids.count(_IMAGE_TOKEN_ID)
    assert img_tok_count == expected_toks_per_img * num_imgs
tests/models/decoder_only/vision_language/vlm_utils/__init__.py
0 → 100644
View file @
cc98f1e0
tests/models/decoder_only/vision_language/vlm_utils/builders.py
0 → 100644
View file @
cc98f1e0
"""Helpers for building inputs that can be leveraged for different test types.
"""
from
pathlib
import
PosixPath
from
typing
import
Callable
,
Iterable
,
List
,
Optional
,
Tuple
,
Union
import
torch
from
vllm.multimodal.utils
import
(
rescale_image_size
,
rescale_video_size
,
resize_video
,
sample_frames_from_video
)
from
.....conftest
import
_ImageAssets
,
_VideoAssets
from
.types
import
(
SINGLE_IMAGE_BASE_PROMPTS
,
TEST_IMG_PLACEHOLDER
,
TEST_VIDEO_PLACEHOLDER
,
VIDEO_BASE_PROMPT
,
ImageSizeWrapper
,
SizeType
,
VLMTestInfo
)
def replace_test_placeholder(prompt: str, img_idx_to_prompt: Callable[[int],
                                                                      str],
                             test_placeholder: str) -> str:
    """Swap every test placeholder in a prompt for its model-specific tag.

    Args:
        prompt: Prompt that may contain ``test_placeholder`` any number of
            times.
        img_idx_to_prompt: Called with the 1-based index of each placeholder
            occurrence; returns the text that replaces that occurrence.
        test_placeholder: The placeholder substring to replace.

    Returns:
        The prompt with each occurrence replaced, in order.
    """
    leading_text, *following_segments = prompt.split(test_placeholder)
    pieces = [leading_text]
    # Each segment after the first was preceded by one placeholder.
    for occurrence, segment in enumerate(following_segments, start=1):
        pieces.append(img_idx_to_prompt(occurrence))
        pieces.append(segment)
    return "".join(pieces)
def get_model_prompts(base_prompts: Iterable[str],
                      img_idx_to_prompt: Optional[Callable[[int], str]],
                      video_idx_to_prompt: Optional[Callable[[int], str]],
                      prompt_formatter: Callable[[str], str]) -> List[str]:
    """Turn model-agnostic base prompts into test prompts for one model.

    For each base prompt, the image and video test placeholders are replaced
    using the given index-to-tag callables (each step is skipped when its
    callable is falsy), and the result is passed through
    ``prompt_formatter``.

    Example for phi3v, given the base prompt "<image>What is the season?":
        1. Replacing the image placeholder yields the tag "<|image_1|>",
           a newline, and then "What is the season?".
        2. The prompt formatter wraps that text in the model's chat
           template (user turn, end marker, assistant turn).

    Args:
        base_prompts: Base prompts; must be a list or tuple.
        img_idx_to_prompt: Maps a 1-based image index to the model's tag.
        video_idx_to_prompt: Maps a 1-based video index to the model's tag.
        prompt_formatter: Wraps the substituted prompt for the model.

    Returns:
        One formatted prompt per base prompt, in order.
    """
    assert isinstance(base_prompts, (list, tuple))

    def to_model_prompt(text: str) -> str:
        # Replace the multimodal placeholders in the base prompt with
        # the correct ones for the model that we are testing
        if img_idx_to_prompt:
            text = replace_test_placeholder(text, img_idx_to_prompt,
                                            TEST_IMG_PLACEHOLDER)
        if video_idx_to_prompt:
            text = replace_test_placeholder(text, video_idx_to_prompt,
                                            TEST_VIDEO_PLACEHOLDER)
        # Apply the prompt formatter to wrap the substituted prompt and
        # get the final model test prompt
        return prompt_formatter(text)

    return [to_model_prompt(base_prompt) for base_prompt in base_prompts]
def build_single_image_inputs_from_test_info(
        test_info: VLMTestInfo,
        image_assets: _ImageAssets,
        size_wrapper: ImageSizeWrapper,
        tmp_path: Optional[PosixPath] = None):
    """Build single-image test inputs from a model's test configuration.

    Args:
        test_info: Test configuration; must have a prompt formatter.
        image_assets: Assets providing one PIL image per single-image prompt.
        size_wrapper: Sizes / size factors applied to every image.
        tmp_path: Passed to the prompt path encoder; required when
            ``test_info.prompt_path_encoder`` is set.

    Returns:
        The result of ``build_single_image_inputs`` for the assets' images
        and the model-specific prompts.

    Raises:
        ValueError: If the prompt formatter is missing, or a prompt path
            encoder is configured without ``tmp_path``.
    """
    formatter = test_info.prompt_formatter
    if formatter is None:
        raise ValueError(
            "Prompt formatter must be set to build single image inputs")

    model_prompts = get_model_prompts(
        test_info.single_image_prompts,
        test_info.img_idx_to_prompt,
        test_info.video_idx_to_prompt,
        formatter,
    )

    # For models that require a local path / URL encoded in the image; export
    # assets and encode into tmp_path for this test. This should be avoided
    # where possible (currently needed for Qwen-VL).
    path_encoder = test_info.prompt_path_encoder
    if path_encoder is not None:
        if tmp_path is None:
            raise ValueError("Prompt path encoder requires setting local path")
        encoded_prompts = []
        for prompt, asset in zip(model_prompts, image_assets):
            encoded_prompts.append(path_encoder(tmp_path, prompt, [asset]))
        model_prompts = encoded_prompts

    images = [asset.pil_image for asset in image_assets]
    assert len(images) == len(model_prompts)
    return build_single_image_inputs(images, model_prompts, size_wrapper)
def build_single_image_inputs(images, model_prompts,
                              size_wrapper: ImageSizeWrapper):
    """Pair each image / prompt with one scaled copy per configured size.

    Returns:
        A list with one ``(prompts, scaled_images)`` tuple per image / prompt
        pair. Both inner lists have one entry per item of
        ``size_wrapper.data``: the prompt repeated, and the image scaled by
        the corresponding size.
    """
    # NOTE: rescaling preserves the image aspect ratio.
    inputs = []
    for image, prompt in zip(images, model_prompts):
        repeated_prompts = [prompt for _ in size_wrapper.data]
        scaled_images = [
            apply_image_size_scaling(image, size, size_wrapper.type)
            for size in size_wrapper.data
        ]
        inputs.append((repeated_prompts, scaled_images))
    return inputs
def build_multi_image_inputs_from_test_info(
        test_info: VLMTestInfo,
        image_assets: _ImageAssets,
        size_wrapper: ImageSizeWrapper,
        tmp_path: Optional[PosixPath] = None):
    """Build multi-image test inputs from a model's test configuration.

    Args:
        test_info: Test configuration; must have a prompt formatter.
        image_assets: Assets whose images together form the one image list.
        size_wrapper: Sizes / size factors applied to every image.
        tmp_path: Passed to the prompt path encoder; required when
            ``test_info.prompt_path_encoder`` is set.

    Returns:
        The result of ``build_multi_image_inputs`` for the single image list
        and the single multi-image prompt.

    Raises:
        ValueError: If the prompt formatter is missing, or a prompt path
            encoder is configured without ``tmp_path``.
    """
    formatter = test_info.prompt_formatter
    if formatter is None:
        raise ValueError(
            "Prompt formatter must be set to build multi image inputs")

    model_prompts = get_model_prompts(
        [test_info.multi_image_prompt],
        test_info.img_idx_to_prompt,
        test_info.video_idx_to_prompt,
        formatter,
    )

    path_encoder = test_info.prompt_path_encoder
    if path_encoder is not None:
        if tmp_path is None:
            raise ValueError("Prompt path encoder requires setting local path")
        encoded_prompts = []
        for model_prompt in model_prompts:
            encoded_prompts.append(
                path_encoder(tmp_path, model_prompt, image_assets))
        model_prompts = encoded_prompts

    images = [asset.pil_image for asset in image_assets]

    # Currently, we only have one multi-image list & one multi-image prompt
    return build_multi_image_inputs(
        image_lists=[images],
        model_prompts=model_prompts,
        size_wrapper=size_wrapper,
    )
def build_multi_image_inputs(image_lists, model_prompts,
                             size_wrapper: ImageSizeWrapper):
    """Pair each image list / prompt with scaled copies per configured size.

    Returns:
        A list with one ``(prompts, scaled_image_lists)`` tuple per image
        list / prompt pair. Both inner lists have one entry per item of
        ``size_wrapper.data``: the prompt repeated, and the whole image list
        with every image scaled by the corresponding size.
    """
    inputs = []
    for images, prompt in zip(image_lists, model_prompts):
        repeated_prompts = [prompt for _ in size_wrapper.data]
        scaled_image_lists = []
        for size in size_wrapper.data:
            scaled_image_lists.append([
                apply_image_size_scaling(image, size, size_wrapper.type)
                for image in images
            ])
        inputs.append((repeated_prompts, scaled_image_lists))
    return inputs
def build_embedding_inputs_from_test_info(
    test_info: VLMTestInfo,
    image_assets: _ImageAssets,
    size_wrapper: ImageSizeWrapper,
):
    """Build matching image and embedding inputs for embedding tests.

    Returns:
        ``(inputs, vllm_embeddings)``: the single-image inputs built from
        the assets' PIL images, and the same structure built from the
        embeddings returned by ``test_info.convert_assets_to_embeddings``.

    Raises:
        ValueError: If the prompt formatter or the conversion func is
            missing, or the sizes are not constant 1.0 size factors.
    """
    # These conditions will always be true if invoked through filtering,
    # but we still check them in case this is ever called directly
    if test_info.prompt_formatter is None:
        raise ValueError(
            "Prompt formatter must be set to build image embedding inputs")

    if size_wrapper.type != SizeType.SIZE_FACTOR:
        has_constant_scale = False
    else:
        has_constant_scale = all(factor == 1.0
                                 for factor in size_wrapper.data)
    if not has_constant_scale:
        raise ValueError("Embedding tests require constant (1.0) size factors")

    if test_info.convert_assets_to_embeddings is None:
        raise ValueError("No conversion func for getting embeddings found")

    model_prompts = get_model_prompts(
        SINGLE_IMAGE_BASE_PROMPTS,
        test_info.img_idx_to_prompt,
        test_info.video_idx_to_prompt,
        test_info.prompt_formatter,
    )

    images = [asset.pil_image for asset in image_assets]
    embeds = test_info.convert_assets_to_embeddings(image_assets)
    assert len(images) == len(model_prompts)

    inputs = build_single_image_inputs(images, model_prompts, size_wrapper)
    vllm_embeddings = build_single_image_inputs(embeds, model_prompts,
                                                size_wrapper)
    return inputs, vllm_embeddings
def build_video_inputs_from_test_info(
    test_info: VLMTestInfo,
    video_assets: _VideoAssets,
    size_wrapper: ImageSizeWrapper,
    num_frames: int,
):
    """Build video test inputs from a model's test configuration.

    ``num_frames`` frames are sampled from every video asset. Each sampled
    video is then resized (for fixed sizes) or rescaled (otherwise) once per
    entry of ``size_wrapper.data``.

    Returns:
        A list of ``(prompts, scaled_videos)`` tuples, one per sampled
        video / model prompt pair.

    Raises:
        ValueError: If ``test_info.prompt_formatter`` is None.
    """
    formatter = test_info.prompt_formatter
    if formatter is None:
        raise ValueError("Prompt formatter must be set to build video inputs")

    model_prompts = get_model_prompts(
        [VIDEO_BASE_PROMPT],
        test_info.img_idx_to_prompt,
        test_info.video_idx_to_prompt,
        formatter,
    )

    sampled_vids = [
        sample_frames_from_video(asset.np_ndarrays, num_frames)
        for asset in video_assets
    ]

    # Fixed sizes resize the video outright; anything else is treated as a
    # size factor.
    if size_wrapper.type == SizeType.FIXED_SIZE:
        video_scaler = resize_video
    else:
        video_scaler = rescale_video_size

    inputs = []
    for video, prompt in zip(sampled_vids, model_prompts):
        repeated_prompts = [prompt for _ in size_wrapper.data]
        scaled_videos = [video_scaler(video, size) for size in size_wrapper.data]
        inputs.append((repeated_prompts, scaled_videos))
    return inputs
def apply_image_size_scaling(image, size: Union[float, Tuple[int, int]],
                             size_type: SizeType):
    """Apply one size setting to one image.

    Args:
        image: The image to scale, or a ``torch.Tensor`` (embeddings case).
        size: A size factor passed to ``rescale_image_size``, or a fixed
            size passed to the image's ``resize`` method.
        size_type: Selects how ``size`` is interpreted.

    Returns:
        The scaled image. A tensor input is returned as-is.

    Raises:
        ValueError: If ``size_type`` is neither SIZE_FACTOR nor FIXED_SIZE.
    """
    # Special case for embeddings; if it's a tensor, it's only valid if we
    # are considering size factors at constant scale, so the very same
    # tensor object is handed back untouched
    if isinstance(image, torch.Tensor):
        assert size_type == SizeType.SIZE_FACTOR and size == 1
        return image

    # We have a list of image size factors
    if size_type == SizeType.SIZE_FACTOR:
        return rescale_image_size(image, size)

    # We have a list of fixed sizes
    if size_type == SizeType.FIXED_SIZE:
        return image.resize(size)

    raise ValueError("ImageSizeWrapper type must be FIXED_SIZE or SIZE_FACTOR")
tests/models/decoder_only/vision_language/vlm_utils/case_filtering.py
0 → 100644
View file @
cc98f1e0
"""Utils for determining which subset of model tests belong to a specific
modality, getting all combinations (similar to pytest's parametrization),
handling multimodal placeholder substitution, and so on.
"""
import
itertools
from
collections
import
OrderedDict
from
typing
import
Dict
,
Iterable
,
Tuple
import
pytest
from
.types
import
(
EMBEDDING_SIZE_FACTORS
,
ExpandableVLMTestArgs
,
ImageSizeWrapper
,
SizeType
,
VLMTestInfo
,
VLMTestType
)
def get_filtered_test_settings(test_settings: Dict[str, VLMTestInfo],
                               test_type: VLMTestType,
                               fork_per_test: bool) -> Dict[str, VLMTestInfo]:
    """Select the test settings that apply to one test type.

    A setting is kept when it declares ``test_type`` (directly or inside an
    iterable of types) and when "has a distributed executor backend" equals
    ``fork_per_test``. Settings that declare the type are also sanity
    checked with assertions before the fork filter is applied.

    Returns:
        A new dict holding the matching entries of ``test_settings``.
    """

    def declares_type(info: VLMTestInfo, wanted: VLMTestType):
        # test_type may be a single type or an iterable of types.
        declared = info.test_type
        if declared == wanted:
            return True
        return isinstance(declared, Iterable) and wanted in declared

    selected = {}
    for name, info in test_settings.items():
        if not declares_type(info, test_type):
            continue

        # Embedding tests need to have a conversion func in their test info
        if declares_type(info, VLMTestType.EMBEDDING):
            assert info.convert_assets_to_embeddings is not None

        # Custom test inputs need to explicitly define the mm limit/inputs
        if declares_type(info, VLMTestType.CUSTOM_INPUTS):
            assert (info.custom_test_opts is not None
                    and isinstance(info.custom_test_opts, Iterable))
        # For all types besides custom inputs, we need a prompt formatter
        else:
            assert info.prompt_formatter is not None

        # Everything looks okay; keep it if it has the requested process
        # handling
        needs_fork = info.distributed_executor_backend is not None
        if needs_fork == fork_per_test:
            selected[name] = info

    return selected
def get_parametrized_options(test_settings: Dict[str, VLMTestInfo],
                             test_type: VLMTestType,
                             fork_new_process_for_each_test: bool):
    """Expand all matching VLMTestInfo entries into pytest parameters.

    This is similar to nesting pytest parametrize calls, but done directly
    through an itertools product so that each model can set things like
    size factors itself, while every combination still runs as its own
    test case.

    Returns:
        A flat list of ``pytest.param(model_type, ExpandableVLMTestArgs)``
        entries carrying each model's marks.

    Raises:
        ValueError: If sizes (or custom test options for CUSTOM_INPUTS)
            are missing for a matching model.
    """
    matching_tests = get_filtered_test_settings(
        test_settings, test_type, fork_new_process_for_each_test)

    def ensure_wrapped(value):
        # Wrap anything that is not already a list / tuple in a 1-tuple
        if isinstance(value, (list, tuple)):
            return value
        return (value, )

    def get_model_type_cases(model_type: str, test_info: VLMTestInfo):
        # This is essentially the same as nesting a bunch of mark.parametrize
        # decorators, but we do it programmatically to allow overrides on
        # a per-model basis, while still being able to execute each of these
        # as individual test cases in pytest.
        iter_kwargs = OrderedDict([
            ("model", ensure_wrapped(test_info.models)),
            ("max_tokens", ensure_wrapped(test_info.max_tokens)),
            ("num_logprobs", ensure_wrapped(test_info.num_logprobs)),
            ("dtype", ensure_wrapped(test_info.dtype)),
            ("distributed_executor_backend",
             ensure_wrapped(test_info.distributed_executor_backend)),
        ])

        # num_frames is video only
        if test_type == VLMTestType.VIDEO:
            iter_kwargs["num_video_frames"] = ensure_wrapped(
                test_info.num_video_frames)

        if test_type == VLMTestType.CUSTOM_INPUTS:
            # No sizes passed for custom inputs, since inputs are directly
            # provided; expand the custom test options instead
            if test_info.custom_test_opts is None:
                raise ValueError("Test has type CUSTOM_INPUTS, but none given")
            iter_kwargs["custom_test_opts"] = test_info.custom_test_opts
        else:
            wrapped_sizes = get_wrapped_test_sizes(test_info, test_type)
            if wrapped_sizes is None:
                raise ValueError(
                    f"Sizes must be set for test type {test_type}")
            iter_kwargs["size_wrapper"] = wrapped_sizes

        # Wrap all model cases in a pytest parameter & pass marks through
        marks = test_info.marks if test_info.marks is not None else []
        cases = []
        for case in itertools.product(*iter_kwargs.values()):
            test_args = ExpandableVLMTestArgs(**dict(zip(iter_kwargs, case)))
            cases.append(pytest.param(model_type, test_args, marks=marks))
        return cases

    # Get a list per model type, where each entry contains all of that model
    # type's cases, then flatten them into the top level so that we can
    # consume them in one mark.parametrize call.
    cases_by_model_type = [
        get_model_type_cases(model_type, test_info)
        for model_type, test_info in matching_tests.items()
    ]
    return list(itertools.chain.from_iterable(cases_by_model_type))
def get_wrapped_test_sizes(
        test_info: VLMTestInfo,
        test_type: VLMTestType) -> Tuple[ImageSizeWrapper, ...]:
    """Wrap the size factors / fixed sizes of a test info for expansion.

    Args:
        test_info: Test configuration to be expanded.
        test_type: The type of test being filtered for.

    Returns:
        A tuple of ImageSizeWrapper objects, one per parametrized case.
        Embedding tests always use EMBEDDING_SIZE_FACTORS, custom input
        tests get an empty tuple, and every other test type gets the
        configured size factors followed by the configured fixed sizes.
    """
    # Embedding tests ignore the sizes configured on the test info
    if test_type == VLMTestType.EMBEDDING:
        return tuple(
            ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
            for factor in EMBEDDING_SIZE_FACTORS)

    # Custom inputs are already preprocessed, so there is nothing to wrap
    if test_type == VLMTestType.CUSTOM_INPUTS:
        return tuple()

    # Unset / empty values contribute no cases
    wrapped = [
        ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
        for factor in (test_info.image_size_factors or [])
    ]
    wrapped.extend(
        ImageSizeWrapper(type=SizeType.FIXED_SIZE, data=size)
        for size in (test_info.image_sizes or []))
    return tuple(wrapped)
tests/models/decoder_only/vision_language/vlm_utils/core.py
0 → 100644
View file @
cc98f1e0
"""Core test implementation to be shared across modalities."""
from
typing
import
Any
,
Callable
,
Dict
,
List
,
Optional
,
Tuple
,
Type
,
Union
import
torch
from
PIL.Image
import
Image
from
transformers
import
AutoTokenizer
,
BatchEncoding
from
transformers.models.auto.auto_factory
import
_BaseAutoModelClass
from
.....conftest
import
HfRunner
,
VllmRunner
from
.types
import
RunnerOutput
def run_test(
    *,
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
    inputs: List[Tuple[List[str], List[Union[List[Image], Image]]]],
    model: str,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    enforce_eager: bool,
    max_model_len: int,
    max_num_seqs: int,
    hf_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]],
    vllm_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]],
    auto_cls: Type[_BaseAutoModelClass],
    use_tokenizer_eos: bool,
    postprocess_inputs: Callable[[BatchEncoding, str], BatchEncoding],
    comparator: Callable[..., None],
    get_stop_token_ids: Optional[Callable[[AutoTokenizer], List[int]]],
    limit_mm_per_prompt: Dict[str, int],
    model_kwargs: Optional[Dict[str, Any]],
    patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]],
    task: str = "auto",
    runner_mm_key: str = "images",
    distributed_executor_backend: Optional[str] = None,
    tensor_parallel_size: int = 1,
    vllm_embeddings: Optional[torch.Tensor] = None,
):
    """Modality agnostic test executor for comparing HF/vLLM outputs.

    Generates greedy outputs with logprobs for every (prompts, media) entry
    with the vLLM runner first and the HF runner second, optionally post
    processes both sets of outputs, and hands each pair to ``comparator``.

    Args:
        hf_runner: HF runner class used to build the reference model.
        vllm_runner: vLLM runner class used to build the model under test.
        inputs: (prompts, media) pairs; always used for the HF runner, and
            for the vLLM runner unless ``vllm_embeddings`` is given.
        model: Model name; also used to load the tokenizer.
        dtype: Data type passed to both runners.
        max_tokens: Max number of tokens to generate.
        num_logprobs: Number of logprobs requested from both runners.
        enforce_eager: Passed to the vLLM runner.
        max_model_len: Passed to the vLLM runner.
        max_num_seqs: Passed to the vLLM runner.
        hf_output_post_proc: Optional post processor for the HF outputs.
        vllm_output_post_proc: Optional post processor for the vLLM outputs.
        auto_cls: Auto model class passed to the HF runner.
        use_tokenizer_eos: If set, the tokenizer's eos_token_id is passed
            explicitly to the HF generate call.
        postprocess_inputs: Passed to the HF runner.
        comparator: Called with keyword args outputs_0_lst, outputs_1_lst,
            name_0, and name_1 for every pair of outputs.
        get_stop_token_ids: Optional callable producing the stop token IDs
            given to the vLLM generate call.
        limit_mm_per_prompt: Passed to the vLLM runner.
        model_kwargs: Passed to the HF runner.
        patch_hf_runner: Optional callable applied to the HF runner instance
            before it is used.
        task: Passed to the vLLM runner.
        runner_mm_key: Keyword under which the media of each input is passed
            to both generate calls.
        distributed_executor_backend: Passed to the vLLM runner.
        tensor_parallel_size: Passed to the vLLM runner.
        vllm_embeddings: If given, iterated instead of ``inputs`` for the
            vLLM runner.
    """
    # In the case of embeddings, vLLM takes separate input tensors
    vllm_inputs = vllm_embeddings if vllm_embeddings is not None else inputs
    tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)

    vllm_outputs_per_mm = []
    hf_outputs_per_mm = []

    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).
    vllm_kwargs = {}
    if get_stop_token_ids is not None:
        vllm_kwargs["stop_token_ids"] = get_stop_token_ids(tokenizer)

    with vllm_runner(model,
                     max_model_len=max_model_len,
                     max_num_seqs=max_num_seqs,
                     dtype=dtype,
                     limit_mm_per_prompt=limit_mm_per_prompt,
                     tensor_parallel_size=tensor_parallel_size,
                     distributed_executor_backend=distributed_executor_backend,
                     enforce_eager=enforce_eager,
                     task=task) as vllm_model:
        for prompts, media in vllm_inputs:
            # The same kwargs dict is reused; only the media entry changes
            vllm_kwargs[runner_mm_key] = media
            vllm_output = vllm_model.generate_greedy_logprobs(
                prompts, max_tokens, num_logprobs=num_logprobs, **vllm_kwargs)
            vllm_outputs_per_mm.append(vllm_output)

    hf_model = hf_runner(model,
                         dtype=dtype,
                         auto_cls=auto_cls,
                         postprocess_inputs=postprocess_inputs,
                         model_kwargs=model_kwargs)

    # Some models need to patch things like the model processor, e.g., internvl
    if patch_hf_runner is not None:
        hf_model = patch_hf_runner(hf_model)

    # Some models need to explicitly pass the eos_token_id off the tokenizer or
    # processor for a good comparison; currently assume processor/tokenizer
    # agree on the EOS, and pull it off the tokenizer if requested.
    hf_kwargs = {}
    if use_tokenizer_eos:
        hf_kwargs["eos_token_id"] = tokenizer.eos_token_id

    with hf_model, torch.no_grad():
        for prompts, media in inputs:
            hf_kwargs[runner_mm_key] = media
            hf_output = hf_model.generate_greedy_logprobs_limit(
                prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                tokenizer=tokenizer,
                **hf_kwargs)
            hf_outputs_per_mm.append(hf_output)

    # Apply output processing / sanitation to the vLLM and HF runner results
    hf_outputs_per_mm, vllm_outputs_per_mm = process_runner_outputs(
        model,
        first_runner_outputs=hf_outputs_per_mm,
        second_runner_outputs=vllm_outputs_per_mm,
        first_runner_processor=hf_output_post_proc,
        second_runner_processor=vllm_output_post_proc,
    )

    for hf_outputs, vllm_outputs in zip(hf_outputs_per_mm,
                                        vllm_outputs_per_mm):
        # This is usually check_logprobs_close, but it's passed through to
        # allow things like check_outputs_equal where needed
        comparator(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_outputs,
            name_0="hf",
            name_1="vllm",
        )
def process_runner_outputs(
    model,
    first_runner_outputs,
    second_runner_outputs,
    first_runner_processor=None,
    second_runner_processor=None,
):
    """Post process the outputs of two runners with their own processors.

    A runner's outputs are returned untouched when its processor is None.

    Returns:
        A (first_runner_outputs, second_runner_outputs) tuple.
    """
    results = []
    for runner_outputs, runner_processor in (
        (first_runner_outputs, first_runner_processor),
        (second_runner_outputs, second_runner_processor),
    ):
        if runner_processor is not None:
            runner_outputs = process_outputs(runner_processor, model,
                                             runner_outputs)
        results.append(runner_outputs)
    return tuple(results)
def process_outputs(output_processor, model, outputs_per_image):
    """Run a model specific post processor over nested runner outputs.

    ``output_processor`` is called as ``output_processor(result, model)``
    for every result; the nesting of ``outputs_per_image`` is preserved.
    """
    processed = []
    for outputs in outputs_per_image:
        processed.append([output_processor(res, model) for res in outputs])
    return processed
tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py
0 → 100644
View file @
cc98f1e0
"""Custom input builders for edge-cases in different models."""
from
typing
import
Callable
from
vllm.multimodal.utils
import
(
rescale_image_size
,
rescale_video_size
,
resize_video
,
sample_frames_from_video
)
from
.....conftest
import
IMAGE_ASSETS
,
VIDEO_ASSETS
from
.builders
import
build_multi_image_inputs
,
build_single_image_inputs
from
.types
import
ImageSizeWrapper
,
SizeType
def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
    """Builds inputs for multi-image (varied sizes/aspect ratio) testing.

    Args:
        formatter: model-specific prompt formatter.

    Returns:
        A single-entry list holding a (formatted prompts, images) tuple,
        with one images entry per prompt.
    """
    stop_sign = IMAGE_ASSETS[0].pil_image
    cherry_blossom = IMAGE_ASSETS[1].pil_image

    # Apply the selected formatter to the base prompts
    formatted_prompts = [
        formatter(base_prompt) for base_prompt in (
            "<image><image>\nDescribe 2 images.",
            "<image><image>\nDescribe 2 images.",
            "<image><image><image><image>\nDescribe 4 images.",
            "<image>\nWhat is the season?",
        )
    ]

    same_size_pair = [stop_sign, cherry_blossom]
    # Images with different sizes and aspect-ratios
    rescaled_pair = [rescale_image_size(stop_sign, 0.1), stop_sign]
    mixed_quad = [
        stop_sign,
        rescale_image_size(stop_sign, 0.25),
        cherry_blossom.resize((183, 488)),
        cherry_blossom.resize((488, 183)),
    ]
    images_per_prompt = [
        same_size_pair,
        rescaled_pair,
        mixed_quad,
        cherry_blossom,
    ]
    return [(formatted_prompts, images_per_prompt)]
def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str],
                                          num_frames: int = 16):
    """Builds inputs for multi-video (varied sizes/aspect ratio) testing.

    Args:
        formatter: model-specific prompt formatter.
        num_frames: number of frames passed to sample_frames_from_video for
            the first video asset.

    Returns:
        A single-entry list holding a (formatted prompts, videos) tuple,
        with one videos entry per prompt.
    """
    video = sample_frames_from_video(VIDEO_ASSETS[0].np_ndarrays, num_frames)

    # Apply the selected formatter to the base prompts
    formatted_prompts = [
        formatter(base_prompt) for base_prompt in (
            "<video><video>\nDescribe 2 videos.",
            "<video><video>\nDescribe 2 videos.",
            "<video><video><video><video>\nDescribe 4 videos.",
            "<video>\nWhy is this video funny?",
        )
    ]

    same_size_pair = [video, video]
    # Videos with different sizes and aspect-ratios
    rescaled_pair = [rescale_video_size(video, 0.1), video]
    mixed_quad = [
        video,
        rescale_video_size(video, 0.25),
        resize_video(video, (183, 488)),
        resize_video(video, (488, 183)),
    ]
    videos_per_prompt = [
        same_size_pair,
        rescaled_pair,
        mixed_quad,
        video,
    ]
    return [(formatted_prompts, videos_per_prompt)]
def different_patch_input_cases_internvl():
    """Builds InternVL custom inputs from image assets resized to 896x896.

    Returns:
        A two-entry list: the single-image inputs followed by the
        multi-image inputs, both built with size factors 0.5 and 1.0.
    """
    images = [asset.pil_image.resize((896, 896)) for asset in IMAGE_ASSETS]

    def formatter(img_prompt):
        return f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n"  # noqa: E501

    formatted_sprompts = [
        formatter(prompt) for prompt in (
            "<image>\nWhat's the content in the center of the image?",
            "<image>\nWhat is the season?",
        )
    ]
    formatted_mprompts = [
        formatter(prompt) for prompt in (
            "Image-1: <image>\nImage-2: <image>\nDescribe the two images in detail.\n",  # noqa: E501
        )
    ]

    wrapped_sf = ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=[0.5, 1.0])
    single_image_inputs = build_single_image_inputs(images,
                                                    formatted_sprompts,
                                                    wrapped_sf)
    multi_image_inputs = build_multi_image_inputs([images],
                                                  formatted_mprompts,
                                                  wrapped_sf)
    return [single_image_inputs, multi_image_inputs]
tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
0 → 100644
View file @
cc98f1e0
"""Common utility functions relating to different models that are useful
for manipulating the input / output of HF & vLLM test runners, which are
typically specific to a small subset of models.
"""
import
re
import
types
from
pathlib
import
PosixPath
from
typing
import
Callable
,
List
,
Optional
,
Tuple
,
Union
import
torch
from
PIL.Image
import
Image
from
transformers
import
AutoConfig
,
AutoTokenizer
,
BatchEncoding
from
vllm.sequence
import
SampleLogprobs
from
vllm.transformers_utils.tokenizer
import
patch_padding_side
from
vllm.utils
import
STR_DTYPE_TO_TORCH_DTYPE
from
.....conftest
import
HfRunner
,
ImageAsset
,
_ImageAssets
from
.types
import
RunnerOutput
####### vLLM output processors functions
def blip2_vllm_to_hf_output(vllm_output: RunnerOutput,
                            model: str) -> RunnerOutput:
    """Sanitize vllm output [blip2 models] to be comparable with hf output.

    A newline is appended to the vLLM text, and the token IDs are rebuilt by
    re-encoding that text with the model tokenizer and dropping the leading
    BOS token. The logprobs are passed through unchanged.
    """
    _, generated_text, logprobs = vllm_output
    hf_text = generated_text + "\n"

    tokenizer = AutoTokenizer.from_pretrained(model)
    encoded_ids = tokenizer.encode(hf_text)
    # The encoded text is expected to start with BOS, which is dropped
    assert encoded_ids[0] == tokenizer.bos_token_id

    return encoded_ids[1:], hf_text, logprobs
def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput,
                           model: str) -> RunnerOutput:
    """Sanitize vllm output [fuyu models] to be comparable with hf output.

    Leading whitespace is stripped from the text and the "|ENDOFTEXT|"
    marker is appended; token IDs and logprobs are passed through.
    """
    token_ids, text, logprobs = vllm_output
    stripped_text = text.lstrip()
    return token_ids, stripped_text + "|ENDOFTEXT|", logprobs
def qwen_vllm_to_hf_output(
        vllm_output: RunnerOutput,
        model: str) -> Tuple[List[int], str, Optional[SampleLogprobs]]:
    """Sanitize vllm output [qwen models] to be comparable with hf output.

    Only the text changes: the "<|endoftext|>" marker is appended to it.
    """
    token_ids, text, logprobs = vllm_output
    return token_ids, text + "<|endoftext|>", logprobs
def llava_image_vllm_to_hf_output(vllm_output: RunnerOutput,
                                  model: str) -> RunnerOutput:
    """Sanitize vllm output [llava, image inputs] to compare with hf output.

    Reads ``image_token_index`` from the model config and uses it as the
    multimodal token ID for the shared llava sanitizer.
    """
    image_token_id = AutoConfig.from_pretrained(model).image_token_index
    return _llava_vllm_to_hf_output(vllm_output, model, image_token_id)
def llava_video_vllm_to_hf_output(
        vllm_output: RunnerOutput,
        model: str) -> Tuple[List[int], str, Optional[SampleLogprobs]]:
    """Sanitize vllm output [llava, video inputs] to compare with hf output.

    Reads ``video_token_index`` from the model config and uses it as the
    multimodal token ID for the shared llava sanitizer.
    """
    video_token_id = AutoConfig.from_pretrained(model).video_token_index
    return _llava_vllm_to_hf_output(vllm_output, model, video_token_id)
def _llava_vllm_to_hf_output(vllm_output: RunnerOutput, model: str,
                             mm_token_id: int) -> RunnerOutput:
    """Sanitize vllm output [Llava models] to be comparable with hf output.

    Args:
        vllm_output: (token IDs, text, logprobs) produced by the vLLM runner.
        model: Model name used to load the tokenizer.
        mm_token_id: ID of the multimodal placeholder token; consecutive
            runs of it are collapsed into a single token.

    Returns:
        The collapsed token IDs, the text without its leading space (plus
        the decoded EOS token if the IDs end with EOS), and the unchanged
        logprobs.
    """
    output_ids, output_str, out_logprobs = vllm_output

    tokenizer = AutoTokenizer.from_pretrained(model)
    eos_token_id = tokenizer.eos_token_id

    # Keep a token unless it repeats a multimodal token right before it.
    # The first token has no predecessor; checking idx explicitly keeps
    # output_ids[idx - 1] from wrapping around to the last token.
    hf_output_ids = [
        token_id for idx, token_id in enumerate(output_ids)
        if token_id != mm_token_id or idx == 0
        or output_ids[idx - 1] != mm_token_id
    ]

    # The vLLM text is expected to start with a space that hf does not emit
    assert output_str[0] == " "
    hf_output_str = output_str[1:]
    # Guard against an empty ID list before looking at the last token
    if hf_output_ids and hf_output_ids[-1] == eos_token_id:
        hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)

    return hf_output_ids, hf_output_str, out_logprobs
def llava_onevision_vllm_to_hf_output(vllm_output: RunnerOutput,
                                      model: str) -> RunnerOutput:
    """Sanitize vllm output [llava-onevision] to compare with hf output.

    Args:
        vllm_output: (token IDs, text, logprobs) produced by the vLLM runner.
        model: Model name used to load the config and the tokenizer.

    Returns:
        The token IDs with consecutive video tokens collapsed into one, the
        text (plus the decoded EOS token if the IDs end with EOS), and the
        unchanged logprobs.
    """
    output_ids, output_str, out_logprobs = vllm_output

    config = AutoConfig.from_pretrained(model)
    video_token_id = config.video_token_index

    tokenizer = AutoTokenizer.from_pretrained(model)
    eos_token_id = tokenizer.eos_token_id

    # Keep a token unless it repeats a video token right before it. The
    # first token has no predecessor; checking idx explicitly keeps
    # output_ids[idx - 1] from wrapping around to the last token.
    hf_output_ids = [
        token_id for idx, token_id in enumerate(output_ids)
        if token_id != video_token_id or idx == 0
        or output_ids[idx - 1] != video_token_id
    ]

    hf_output_str = output_str
    # Guard against an empty ID list before looking at the last token
    if hf_output_ids and hf_output_ids[-1] == eos_token_id:
        hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)

    return hf_output_ids, hf_output_str, out_logprobs
def phi3v_vllm_to_hf_output(vllm_output: RunnerOutput,
                            model: str) -> RunnerOutput:
    """Sanitize vllm output [phi3v] to be comparable with hf output.

    Image placeholder tokens and the leading space are removed from the
    text. The returned text gets the "<|end|><|endoftext|>" suffix, while
    the returned token IDs come from re-encoding the text without that
    suffix and dropping the first ID.
    """
    _, generated_text, logprobs = vllm_output

    text_without_image = re.sub(r"(<\|image_\d+\|>)+", "", generated_text)
    # The remaining text is expected to start with a single space
    assert text_without_image[0] == " "
    text_without_image = text_without_image[1:]

    tokenizer = AutoTokenizer.from_pretrained(model)
    encoded_ids = tokenizer.encode(text_without_image)
    # The first encoded ID is expected to be 1 and is dropped
    assert encoded_ids[0] == 1

    hf_text = text_without_image + "<|end|><|endoftext|>"
    return encoded_ids[1:], hf_text, logprobs
def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput,
                                model: str) -> RunnerOutput:
    """Sanitize vllm output [paligemma] to be comparable with hf output.

    Args:
        vllm_output: (token IDs, text, logprobs) produced by the vLLM runner.
        model: Model name used to load the config and the tokenizer.

    Returns:
        The token IDs with consecutive image tokens collapsed into one, the
        text (plus the decoded EOS token if the IDs end with EOS), and the
        unchanged logprobs.
    """
    output_ids, output_str, out_logprobs = vllm_output

    config = AutoConfig.from_pretrained(model)
    image_token_id = config.image_token_index

    tokenizer = AutoTokenizer.from_pretrained(model)
    eos_token_id = tokenizer.eos_token_id

    # Keep a token unless it repeats an image token right before it. The
    # first token has no predecessor; checking idx explicitly keeps
    # output_ids[idx - 1] from wrapping around to the last token.
    hf_output_ids = [
        token_id for idx, token_id in enumerate(output_ids)
        if token_id != image_token_id or idx == 0
        or output_ids[idx - 1] != image_token_id
    ]

    hf_output_str = output_str
    # Guard against an empty ID list before looking at the last token
    if hf_output_ids and hf_output_ids[-1] == eos_token_id:
        hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)

    return hf_output_ids, hf_output_str, out_logprobs
####### Post-processors for HF outputs
def minicmpv_trunc_hf_output(hf_output: RunnerOutput,
                             model: str) -> RunnerOutput:
    """Truncate HF text that ends with the "<|eot_id|>" marker.

    When the text ends with the marker, everything from the first occurrence
    of the marker onwards is dropped. Token IDs and logprobs are unchanged.
    """
    token_ids, text, logprobs = hf_output
    eot_marker = "<|eot_id|>"
    if text.endswith(eot_marker):
        # Cut at the first marker, not only at the trailing one
        text = text.partition(eot_marker)[0]
    return token_ids, text, logprobs
####### Functions for converting image assets to embeddings
def get_llava_embeddings(image_assets: _ImageAssets):
    """Collect the ``image_embeds`` attribute of every asset into a list."""
    embeddings = []
    for asset in image_assets:
        embeddings.append(asset.image_embeds)
    return embeddings
####### postprocessors to run on HF BatchEncoding
def get_key_type_post_processor(
        hf_inp_key: str) -> Callable[[BatchEncoding, str], BatchEncoding]:
    """Build a post processor that casts one input key to a target dtype.

    Args:
        hf_inp_key: Key of the HF inputs whose value should be converted.

    Returns:
        A callable taking the HF inputs and a dtype string; it replaces the
        value under ``hf_inp_key`` with its ``.to(<torch dtype>)`` result
        and returns the same inputs object.
    """

    def process(hf_inputs: BatchEncoding, dtype: str):
        # Map the dtype string to the torch dtype, then convert in place
        target_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
        converted = hf_inputs[hf_inp_key].to(target_dtype)
        hf_inputs[hf_inp_key] = converted
        return hf_inputs

    return process
def wrap_inputs_post_processor(hf_inputs: BatchEncoding, dtype: str):
    """Wrap the HF inputs in a dict under the "model_inputs" key.

    ``dtype`` is accepted to match the post processor signature but unused.
    """
    wrapped = {}
    wrapped["model_inputs"] = hf_inputs
    return wrapped
####### Prompt path encoders for models that need models on disk
def qwen_prompt_path_encoder(
        tmp_path: PosixPath, prompt: str, assets: Union[List[ImageAsset],
                                                        _ImageAssets]) -> str:
    """Export image assets to a temp dir and put their paths in the prompt.

    Every asset is saved as ``<asset name>.jpg`` under ``tmp_path``, and the
    empty ``<img></img>`` placeholders of the prompt are filled, in order,
    with the paths of the saved files so that the HF version of Qwen-VL can
    resolve the paths and load the images in its forward() call.

    Args:
        tmp_path: Tempdir for test under consideration.
        prompt: Prompt with image placeholders.
        assets: List of image assets whose len equals the num placeholders.
    """
    placeholder = "<img></img>"

    # A mismatch between placeholders and assets most likely means that the
    # test itself is written incorrectly.
    assert prompt.count(placeholder) == len(assets)

    encoded_prompt = prompt
    for asset in assets:
        image_tmp_path = tmp_path / f"{asset.name}.jpg"
        asset.pil_image.save(image_tmp_path)
        # Only fill the first placeholder that is still empty
        encoded_prompt = encoded_prompt.replace(
            placeholder,
            f"<img>{image_tmp_path}</img>",
            1,
        )
    return encoded_prompt
####### Model-specific HuggingFace runner patchers
def glm_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    """Patches and returns an instance of the HfRunner to use for GLM4.

    The runner's processor is replaced by a wrapper that goes through
    ``apply_chat_template`` whenever images are passed, and the model's
    ``get_output_embeddings`` is pointed at ``transformer.output_layer``.
    """
    original_processor = hf_model.processor
    patch_padding_side(original_processor)

    def processor(*args, text="", images=None, **kwargs):
        if images is not None:
            conversation = [{
                "role": "user",
                "image": images,
                "content": text
            }]
            return original_processor.apply_chat_template(
                conversation,
                add_generation_prompt=True,
                tokenize=True,
                return_dict=True,
                **kwargs,
            )
        # NOTE(review): `text` is not forwarded on this path - confirm that
        # this is intended for calls without images.
        return original_processor(*args, **kwargs)

    def get_output_embeddings():
        return hf_model.model.transformer.output_layer

    hf_model.processor = processor
    hf_model.model.get_output_embeddings = get_output_embeddings
    return hf_model
def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    """Patches and returns an instance of the HfRunner to use for InternVL.

    The patch sets the image context token ID on the model, installs a
    minimal processor, routes get_output_embeddings to the language model,
    and binds _internvl_generate as the model's generate method.
    """

    class InternVLProcessor:
        """A simple processor for InternVL2 which misses a processor."""

        def __init__(self, hf_runner: HfRunner):
            self.num_image_token = hf_runner.model.num_image_token
            self.tokenizer = hf_runner.tokenizer
            self.dtype = hf_runner.model.dtype

            # Image preprocessing settings are read off of the model config
            self.config = AutoConfig.from_pretrained(hf_runner.model_name,
                                                     trust_remote_code=True)
            self.vision_config = self.config.vision_config
            self.use_thumbnail = self.config.use_thumbnail
            self.min_num = self.config.min_dynamic_patch
            self.max_num = self.config.max_dynamic_patch
            self.image_size = self.vision_config.image_size

        def __call__(self, text: str, images: Union[Image, List[Image]],
                     **kwargs):
            """Expand the image tags of `text` and tokenize the result.

            Returns the tokenizer output with an added "pixel_values" entry.
            Extra keyword arguments are accepted but not used.
            """
            from vllm.model_executor.models.internvl import (
                IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values)
            # Accept a single image as well as a list of images
            images = [images] if isinstance(images, Image) else images
            pixel_values = [
                image_to_pixel_values(image, self.image_size, self.min_num,
                                      self.max_num,
                                      self.use_thumbnail).to(self.dtype)
                for image in images
            ]
            # The first dim of each entry is used as its number of patches
            num_patches_list = [
                pixel_value.shape[0] for pixel_value in pixel_values
            ]
            pixel_values = torch.cat(pixel_values, dim=0)
            # Replace one '<image>' tag per image, in order, with the
            # start / repeated context / end tokens for that image.
            for num_patches in num_patches_list:
                context_tokens = IMG_CONTEXT * self.num_image_token \
                    * num_patches
                image_tokens = IMG_START + context_tokens + IMG_END
                text = text.replace('<image>', image_tokens, 1)
            prompt = self.tokenizer(text, return_tensors="pt")
            prompt.update({"pixel_values": pixel_values})
            return prompt

    # _internvl_generate asserts that this ID has been set on the model
    img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
        "<IMG_CONTEXT>")
    hf_model.model.img_context_token_id = img_context_token_id
    hf_model.processor = InternVLProcessor(hf_model)
    hf_model.model.get_output_embeddings = lambda: \
        hf_model.model.language_model.get_output_embeddings()
    # Bind the custom generate implementation as a method of the model
    hf_model.model.generate = types.MethodType(_internvl_generate,
                                               hf_model.model)
    return hf_model
def _internvl_generate(
    self,
    pixel_values: torch.FloatTensor,
    input_ids: torch.LongTensor,
    attention_mask: Optional[torch.LongTensor] = None,
    **generate_kwargs,
) -> torch.LongTensor:
    """Generate method for InternVL2 model without fixed use_cache.

    The embeddings at the image context token positions are replaced by the
    extracted visual features, and generation is then delegated to the
    language model with the merged embeddings.

    Args:
        pixel_values: Image input passed to self.extract_feature.
        input_ids: Token IDs; must contain at least one image context token.
        attention_mask: Forwarded to the language model's generate call.
        **generate_kwargs: Forwarded to the language model's generate call.

    Returns:
        The output of the language model's generate call.
    """
    assert self.img_context_token_id is not None
    vit_embeds = self.extract_feature(pixel_values)
    input_embeds = self.language_model.get_input_embeddings()(input_ids)
    # Flatten the first two dims so a flat mask over tokens can index rows
    B, N, C = input_embeds.shape
    input_embeds = input_embeds.reshape(B * N, C)

    input_ids = input_ids.reshape(B * N)
    # Boolean mask of the image context token positions
    selected = (input_ids == self.img_context_token_id)
    assert selected.sum() != 0
    # Overwrite the selected rows with the visual features
    input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device)

    input_embeds = input_embeds.reshape(B, N, C)

    forward_kwargs = dict(
        inputs_embeds=input_embeds,
        attention_mask=attention_mask,
    )
    # Only passed along if the model sets use_visual_token_mask
    if getattr(self, "use_visual_token_mask", False):
        visual_token_mask = selected.reshape(B, N, 1).to(input_embeds.dtype)
        forward_kwargs["visual_token_mask"] = visual_token_mask
    outputs = self.language_model.generate(
        **forward_kwargs,
        **generate_kwargs,
    )

    return outputs
tests/models/decoder_only/vision_language/vlm_utils/runners.py
0 → 100644
View file @
cc98f1e0
"""Entrypoints for wrapping the core run_test implementation for specific test
types / modalities.
"""
from
pathlib
import
PosixPath
from
typing
import
Type
from
.....conftest
import
HfRunner
,
VllmRunner
,
_ImageAssets
,
_VideoAssets
from
.
import
builders
,
core
from
.types
import
ExpandableVLMTestArgs
,
VLMTestInfo
####### Entrypoints for running different test types
def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
                          test_case: ExpandableVLMTestArgs,
                          hf_runner: Type[HfRunner],
                          vllm_runner: Type[VllmRunner],
                          image_assets: _ImageAssets):
    """Build single-image inputs for a test case and run the core test.

    At most one image is allowed per prompt.
    """
    size_wrapper = test_case.size_wrapper
    assert size_wrapper is not None

    single_image_inputs = builders.build_single_image_inputs_from_test_info(
        model_test_info, image_assets, size_wrapper, tmp_path)

    # Settings shared by all of this model's cases, i.e., not parametrized
    shared_kwargs = model_test_info.get_non_parametrized_runner_kwargs()
    core.run_test(
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        inputs=single_image_inputs,
        model=test_case.model,
        dtype=test_case.dtype,
        max_tokens=test_case.max_tokens,
        num_logprobs=test_case.num_logprobs,
        limit_mm_per_prompt={"image": 1},
        distributed_executor_backend=test_case.distributed_executor_backend,
        **shared_kwargs)
def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
                         test_case: ExpandableVLMTestArgs,
                         hf_runner: Type[HfRunner],
                         vllm_runner: Type[VllmRunner],
                         image_assets: _ImageAssets):
    """Build multi-image inputs for a test case and run the core test.

    The per-prompt image limit is the number of image assets.
    """
    size_wrapper = test_case.size_wrapper
    assert size_wrapper is not None

    multi_image_inputs = builders.build_multi_image_inputs_from_test_info(
        model_test_info, image_assets, size_wrapper, tmp_path)

    # Settings shared by all of this model's cases, i.e., not parametrized
    shared_kwargs = model_test_info.get_non_parametrized_runner_kwargs()
    core.run_test(
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        inputs=multi_image_inputs,
        model=test_case.model,
        dtype=test_case.dtype,
        max_tokens=test_case.max_tokens,
        num_logprobs=test_case.num_logprobs,
        limit_mm_per_prompt={"image": len(image_assets)},
        distributed_executor_backend=test_case.distributed_executor_backend,
        **shared_kwargs)
def run_embedding_test(*, model_test_info: VLMTestInfo,
                       test_case: ExpandableVLMTestArgs,
                       hf_runner: Type[HfRunner],
                       vllm_runner: Type[VllmRunner],
                       image_assets: _ImageAssets):
    """Build embedding inputs for a test case and run the core test.

    The builder returns both the regular inputs and the vLLM embeddings;
    both are handed to the core test. At most one image is allowed per
    prompt.
    """
    size_wrapper = test_case.size_wrapper
    assert size_wrapper is not None

    built_inputs = builders.build_embedding_inputs_from_test_info(
        model_test_info, image_assets, size_wrapper)
    inputs, vllm_embeddings = built_inputs

    # Settings shared by all of this model's cases, i.e., not parametrized
    shared_kwargs = model_test_info.get_non_parametrized_runner_kwargs()
    core.run_test(
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        inputs=inputs,
        model=test_case.model,
        dtype=test_case.dtype,
        max_tokens=test_case.max_tokens,
        num_logprobs=test_case.num_logprobs,
        limit_mm_per_prompt={"image": 1},
        vllm_embeddings=vllm_embeddings,
        distributed_executor_backend=test_case.distributed_executor_backend,
        **shared_kwargs)
def run_video_test(
    *,
    model_test_info: VLMTestInfo,
    test_case: ExpandableVLMTestArgs,
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
    video_assets: _VideoAssets,
):
    """Build video inputs for a test case and run the core test.

    The per-prompt video limit is the number of video assets.
    """
    size_wrapper = test_case.size_wrapper
    num_video_frames = test_case.num_video_frames
    assert size_wrapper is not None
    assert num_video_frames is not None

    video_inputs = builders.build_video_inputs_from_test_info(
        model_test_info, video_assets, size_wrapper, num_video_frames)

    # Settings shared by all of this model's cases, i.e., not parametrized
    shared_kwargs = model_test_info.get_non_parametrized_runner_kwargs()
    core.run_test(
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        inputs=video_inputs,
        model=test_case.model,
        dtype=test_case.dtype,
        max_tokens=test_case.max_tokens,
        num_logprobs=test_case.num_logprobs,
        limit_mm_per_prompt={"video": len(video_assets)},
        distributed_executor_backend=test_case.distributed_executor_backend,
        **shared_kwargs)
def run_custom_inputs_test(*, model_test_info: VLMTestInfo,
                           test_case: ExpandableVLMTestArgs,
                           hf_runner: Type[HfRunner],
                           vllm_runner: Type[VllmRunner]):
    """Run the core test with inputs provided directly by the test case."""
    # Custom test cases can provide inputs directly, but they need to
    # explicitly provide custom test options, which wrap the inputs and
    # the limit_mm_per_prompt
    custom_opts = test_case.custom_test_opts
    assert custom_opts is not None

    inputs = custom_opts.inputs
    limit_mm_per_prompt = custom_opts.limit_mm_per_prompt
    assert inputs is not None
    assert limit_mm_per_prompt is not None

    # Settings shared by all of this model's cases, i.e., not parametrized
    shared_kwargs = model_test_info.get_non_parametrized_runner_kwargs()
    core.run_test(
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        inputs=inputs,
        model=test_case.model,
        dtype=test_case.dtype,
        max_tokens=test_case.max_tokens,
        num_logprobs=test_case.num_logprobs,
        limit_mm_per_prompt=limit_mm_per_prompt,
        distributed_executor_backend=test_case.distributed_executor_backend,
        **shared_kwargs)
tests/models/decoder_only/vision_language/vlm_utils/types.py
0 → 100644
View file @
cc98f1e0
"""Types for writing multimodal model tests."""
from
enum
import
Enum
from
pathlib
import
PosixPath
from
typing
import
(
Any
,
Callable
,
Dict
,
Iterable
,
List
,
NamedTuple
,
Optional
,
Tuple
,
Type
,
Union
)
import
torch
from
PIL.Image
import
Image
from
pytest
import
MarkDecorator
from
transformers
import
AutoModelForCausalLM
,
AutoTokenizer
,
BatchEncoding
from
transformers.models.auto.auto_factory
import
_BaseAutoModelClass
from
vllm.sequence
import
SampleLogprobs
from
vllm.utils
import
identity
from
.....conftest
import
IMAGE_ASSETS
,
HfRunner
,
ImageAsset
,
_ImageAssets
from
....utils
import
check_logprobs_close
# Meta image / video tags used in the base prompts below; these will be
# replaced by the appropriate tag for the model
TEST_IMG_PLACEHOLDER = "<vlm_image>"
TEST_VIDEO_PLACEHOLDER = "<vlm_video>"

# yapf: disable
# Base single-image prompts, keyed by the name of the image asset
SINGLE_IMAGE_BASE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign": f"{TEST_IMG_PLACEHOLDER}What's the content of the image?",
    "cherry_blossom": f"{TEST_IMG_PLACEHOLDER}What is the season?",
})

# Base prompts for multi-image and video tests
MULTI_IMAGE_BASE_PROMPT = f"Image-1: {TEST_IMG_PLACEHOLDER}Image-2: {TEST_IMG_PLACEHOLDER}Describe the two images in detail.\n"  # noqa: E501
VIDEO_BASE_PROMPT = f"{TEST_VIDEO_PLACEHOLDER}Why is this video funny?"


# Default groups of size factors; each tuple is one parametrized case
IMAGE_SIZE_FACTORS = [(), (1.0, ), (1.0, 1.0, 1.0), (0.25, 0.5, 1.0)]
EMBEDDING_SIZE_FACTORS = [(), (1.0, ), (1.0, 1.0, 1.0)]
# Output of a runner for one prompt: (token IDs, text, optional logprobs)
RunnerOutput = Tuple[List[int], str, Optional[SampleLogprobs]]
# yapf: enable
class VLMTestType(Enum):
    """The kinds of tests that a VLMTestInfo can be expanded into."""
    IMAGE = 1
    MULTI_IMAGE = 2
    EMBEDDING = 3
    VIDEO = 4
    CUSTOM_INPUTS = 5
class SizeType(Enum):
    """How the data of an ImageSizeWrapper should be interpreted."""
    SIZE_FACTOR = 1
    FIXED_SIZE = 2
class CustomTestOptions(NamedTuple):
    """Directly provided inputs and limits for one custom inputs test."""
    # (prompts, media) pairs; handed to the core test as its inputs
    inputs: List[Tuple[List[str], List[Union[List[Image], Image]]]]
    # Handed to the core test as limit_mm_per_prompt
    limit_mm_per_prompt: Dict[str, int]
class ImageSizeWrapper(NamedTuple):
    """Pairs size data with a tag saying how the data should be read."""
    # Whether data holds size factors or fixed sizes
    type: SizeType
    # A size factor is a wrapper of 0+ floats,
    # while a fixed size contains an iterable of integer pairs
    data: Union[Iterable[float], Iterable[Tuple[int, int]]]
class VLMTestInfo(NamedTuple):
    """Holds the configuration for 1+ tests for one model architecture."""

    # Model name(s) to test; every model is expanded into its own cases
    models: Union[List[str]]
    # The test type(s) that this configuration applies to
    test_type: Union[VLMTestType, Iterable[VLMTestType]]

    # Should be None only if this is a CUSTOM_INPUTS test
    prompt_formatter: Optional[Callable[[str], str]] = None
    # Map the index of an image / video to its tag in the prompt
    img_idx_to_prompt: Callable[[int], str] = lambda idx: "<image>\n"
    video_idx_to_prompt: Callable[[int], str] = lambda idx: "<video>\n"

    # Most models work on the single / multi-image prompts above, but in some
    # cases the log prob check fails, e.g., for paligemma. We allow passing
    # an override for the single image prompts / multi-image prompt for this
    # reason.
    single_image_prompts: Iterable[str] = SINGLE_IMAGE_BASE_PROMPTS
    multi_image_prompt: str = MULTI_IMAGE_BASE_PROMPT

    # Function for converting ImageAssets to image embeddings;
    # We need to define this explicitly for embedding tests
    convert_assets_to_embeddings: Optional[Callable[[_ImageAssets],
                                                    torch.Tensor]] = None

    # Exposed options for vLLM runner; we change these in several tests,
    # but the defaults are derived from VllmRunner & the engine defaults
    # These settings are chosen to avoid OOMs when running in the CI
    enforce_eager: bool = True
    max_model_len: int = 1024
    max_num_seqs: int = 256
    task: str = "auto"
    tensor_parallel_size: int = 1

    # Optional callable which gets a list of token IDs from the model tokenizer
    get_stop_token_ids: Optional[Callable[[AutoTokenizer], List[int]]] = None

    # Exposed options for HF runner
    model_kwargs: Optional[Dict[str, Any]] = None
    # Indicates we should explicitly pass the EOS from the tokenizer
    use_tokenizer_eos: bool = False
    auto_cls: Type[_BaseAutoModelClass] = AutoModelForCausalLM
    # Callable to pass to the HF runner to run on inputs; for now, we also pass
    # the data type to input post processing, because almost all of the uses of
    # postprocess_inputs are to fix the data types of BatchEncoding values.
    postprocess_inputs: Callable[[BatchEncoding, str],
                                 BatchEncoding] = identity
    patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]] = None

    # Post processors that, if defined, will run on the outputs of the
    # vLLM and HF runner, respectively (useful for sanitization, etc).
    vllm_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]] = None
    hf_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]] = None

    # Consumes the output of the callables above and checks if they're equal
    comparator: Callable[..., None] = check_logprobs_close

    # Default expandable params per test; these defaults can be overridden in
    # instances of this object; the complete set of test cases for the model
    # is all combinations of .models + all fields below
    max_tokens: Union[int, Tuple[int]] = 128
    num_logprobs: Union[int, Tuple[int]] = 5
    dtype: Union[str, Iterable[str]] = "half"
    distributed_executor_backend: Optional[Union[str, Iterable[str]]] = None
    # Only expanded in video tests
    num_video_frames: Union[int, Tuple[int]] = 16

    # Fixed image sizes / image size factors; most tests use image_size_factors
    # The values provided for these two fields will be stacked and expanded
    # such that each model will consider each image size factor / image size
    # once per tests (much like concatenating and wrapping in one parametrize
    # call)
    image_size_factors: Iterable[Iterable[float]] = IMAGE_SIZE_FACTORS
    image_sizes: Optional[Iterable[Iterable[Tuple[int, int]]]] = None

    # Hack for updating a prompt to take into a local path; currently only used
    # for Qwen-VL, which requires encoding the image path / url into the prompt
    # for HF runner
    prompt_path_encoder: Optional[
        Callable[[PosixPath, str, Union[List[ImageAsset], _ImageAssets]],
                 str]] = None  # noqa: E501

    # kwarg to pass multimodal data in as to vllm/hf runner instances
    runner_mm_key: str = "images"

    # Allows configuring a test to run with custom inputs
    custom_test_opts: Optional[List[CustomTestOptions]] = None

    # Marks passed through to the pytest params built for this model
    marks: Optional[List[MarkDecorator]] = None

    def get_non_parametrized_runner_kwargs(self):
        """Returns a dictionary of expandable kwargs for items that are used
        in all test types, which are NOT used when creating the parametrized
        test cases.

        The keys match keyword arguments of the core test implementation, so
        the result can be unpacked directly into that call.
        """
        return {
            "enforce_eager": self.enforce_eager,
            "max_model_len": self.max_model_len,
            "max_num_seqs": self.max_num_seqs,
            "task": self.task,
            # Forwarded so that configuring it on the test info takes effect
            "tensor_parallel_size": self.tensor_parallel_size,
            "hf_output_post_proc": self.hf_output_post_proc,
            "vllm_output_post_proc": self.vllm_output_post_proc,
            "auto_cls": self.auto_cls,
            "use_tokenizer_eos": self.use_tokenizer_eos,
            "postprocess_inputs": self.postprocess_inputs,
            "comparator": self.comparator,
            "get_stop_token_ids": self.get_stop_token_ids,
            "model_kwargs": self.model_kwargs,
            "patch_hf_runner": self.patch_hf_runner,
            "runner_mm_key": self.runner_mm_key,
        }
class ExpandableVLMTestArgs(NamedTuple):
    """The expanded kwargs which correspond to a single test case."""
    # Expanded for every test type
    model: str
    max_tokens: int
    num_logprobs: int
    dtype: str
    distributed_executor_backend: Optional[str]
    # Sizes are used for everything except for custom input tests
    size_wrapper: Optional[ImageSizeWrapper] = None
    # Video only
    num_video_frames: Optional[int] = None
    # Custom inputs only
    custom_test_opts: Optional[CustomTestOptions] = None
tests/models/embedding/vision_language/test_llava_next.py
View file @
cc98f1e0
...
...
@@ -85,6 +85,8 @@ def _run_test(
)
# FIXME
@
pytest
.
mark
.
skip
(
reason
=
"LLava next embedding tests currently fail"
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
def
test_models_text
(
...
...
tests/models/encoder_decoder/vision_language/test_mllama.py
View file @
cc98f1e0
...
...
@@ -192,7 +192,7 @@ def _run_test(
for
prompts
,
images
in
inputs
]
def
process
(
hf_inputs
:
BatchEncoding
):
def
process
(
hf_inputs
:
BatchEncoding
,
**
kwargs
):
return
hf_inputs
with
hf_runner
(
model
,
...
...
tests/utils.py
View file @
cc98f1e0
...
...
@@ -561,12 +561,11 @@ def fork_new_process_for_each_test(
return
wrapper
def
large_gpu_test
(
*
,
min_gb
:
int
):
"""
Decorate a test to be skipped if no GPU is available or it does not have
sufficient memory.
Currently, the CI machine uses L4 GPU which has 24 GB VRAM.
def
large_gpu_mark
(
min_gb
:
int
)
->
pytest
.
MarkDecorator
:
"""Gets a pytest skipif mark, which triggers ig the the device doesn't have
meet a minimum memory requirement in gb; can be leveraged via
@large_gpu_test to skip tests in environments without enough resources, or
called when filtering tests to run directly.
"""
try
:
if
current_platform
.
is_cpu
():
...
...
@@ -578,14 +577,23 @@ def large_gpu_test(*, min_gb: int):
f
"An error occurred when finding the available memory:
{
e
}
"
,
stacklevel
=
2
,
)
memory_gb
=
0
test_skipif
=
pytest
.
mark
.
skipif
(
return
pytest
.
mark
.
skipif
(
memory_gb
<
min_gb
,
reason
=
f
"Need at least
{
memory_gb
}
GB GPU memory to run the test."
,
)
def
large_gpu_test
(
*
,
min_gb
:
int
):
"""
Decorate a test to be skipped if no GPU is available or it does not have
sufficient memory.
Currently, the CI machine uses L4 GPU which has 24 GB VRAM.
"""
test_skipif
=
large_gpu_mark
(
min_gb
)
def
wrapper
(
f
:
Callable
[
_P
,
None
])
->
Callable
[
_P
,
None
]:
return
test_skipif
(
f
)
...
...
vllm/utils.py
View file @
cc98f1e0
...
...
@@ -977,7 +977,8 @@ def enable_trace_function_call_for_thread() -> None:
# `functools` helpers
def
identity
(
value
:
T
)
->
T
:
def identity(value: T, **kwargs) -> T:
    """Returns the first provided value.

    The positional ``value`` is handed back unchanged; any keyword
    arguments are accepted but never read.
    """
    return value
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment