Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
469e903b
Commit
469e903b
authored
Mar 28, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.8.2' into v0.8.2-dev
parents
389ebcf7
25f560a6
Changes
535
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
807 additions
and
370 deletions
+807
-370
tests/model_executor/test_model_load_with_params.py
tests/model_executor/test_model_load_with_params.py
+2
-2
tests/models/decoder_only/audio_language/test_ultravox.py
tests/models/decoder_only/audio_language/test_ultravox.py
+11
-22
tests/models/decoder_only/language/test_fp8.py
tests/models/decoder_only/language/test_fp8.py
+101
-36
tests/models/decoder_only/language/test_gguf.py
tests/models/decoder_only/language/test_gguf.py
+13
-13
tests/models/decoder_only/language/test_hybrid.py
tests/models/decoder_only/language/test_hybrid.py
+34
-29
tests/models/decoder_only/language/test_mamba.py
tests/models/decoder_only/language/test_mamba.py
+0
-7
tests/models/decoder_only/language/test_mistral.py
tests/models/decoder_only/language/test_mistral.py
+12
-10
tests/models/decoder_only/language/test_modelopt.py
tests/models/decoder_only/language/test_modelopt.py
+1
-2
tests/models/decoder_only/language/test_models.py
tests/models/decoder_only/language/test_models.py
+48
-16
tests/models/decoder_only/language/test_nvfp4.py
tests/models/decoder_only/language/test_nvfp4.py
+82
-0
tests/models/decoder_only/vision_language/test_awq.py
tests/models/decoder_only/vision_language/test_awq.py
+9
-4
tests/models/decoder_only/vision_language/test_models.py
tests/models/decoder_only/vision_language/test_models.py
+150
-114
tests/models/decoder_only/vision_language/test_phi3v.py
tests/models/decoder_only/vision_language/test_phi3v.py
+5
-5
tests/models/decoder_only/vision_language/test_phi4mm.py
tests/models/decoder_only/vision_language/test_phi4mm.py
+228
-0
tests/models/decoder_only/vision_language/test_pixtral.py
tests/models/decoder_only/vision_language/test_pixtral.py
+33
-60
tests/models/decoder_only/vision_language/test_qwen2_vl.py
tests/models/decoder_only/vision_language/test_qwen2_vl.py
+34
-24
tests/models/decoder_only/vision_language/vlm_utils/builders.py
...models/decoder_only/vision_language/vlm_utils/builders.py
+4
-3
tests/models/decoder_only/vision_language/vlm_utils/case_filtering.py
.../decoder_only/vision_language/vlm_utils/case_filtering.py
+9
-9
tests/models/decoder_only/vision_language/vlm_utils/core.py
tests/models/decoder_only/vision_language/vlm_utils/core.py
+13
-14
tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py
...s/decoder_only/vision_language/vlm_utils/custom_inputs.py
+18
-0
No files found.
Too many changes to show.
To preserve performance only
535 of 535+
files are displayed.
Plain diff
Email patch
tests/model_executor/test_model_load_with_params.py
View file @
469e903b
...
@@ -15,7 +15,7 @@ MODEL_NAME = os.environ.get("MODEL_NAME", os.path.join(models_path_prefix, "BAAI
...
@@ -15,7 +15,7 @@ MODEL_NAME = os.environ.get("MODEL_NAME", os.path.join(models_path_prefix, "BAAI
REVISION
=
os
.
environ
.
get
(
"REVISION"
,
"main"
)
REVISION
=
os
.
environ
.
get
(
"REVISION"
,
"main"
)
MODEL_NAME_ROBERTA
=
os
.
environ
.
get
(
"MODEL_NAME"
,
MODEL_NAME_ROBERTA
=
os
.
environ
.
get
(
"MODEL_NAME"
,
os
.
path
.
join
(
models_path_prefix
,
"intfloat/multilingual-e5-
large
"
))
os
.
path
.
join
(
models_path_prefix
,
"intfloat/multilingual-e5-
small
"
))
REVISION_ROBERTA
=
os
.
environ
.
get
(
"REVISION"
,
"main"
)
REVISION_ROBERTA
=
os
.
environ
.
get
(
"REVISION"
,
"main"
)
...
@@ -84,7 +84,7 @@ def test_roberta_model_loading_with_params(vllm_runner):
...
@@ -84,7 +84,7 @@ def test_roberta_model_loading_with_params(vllm_runner):
assert
model_config
.
pooler_config
.
pooling_norm
assert
model_config
.
pooler_config
.
pooling_norm
# asserts on the tokenizer loaded
# asserts on the tokenizer loaded
assert
model_tokenizer
.
tokenizer_id
==
os
.
path
.
join
(
models_path_prefix
,
"intfloat/multilingual-e5-
large
"
)
assert
model_tokenizer
.
tokenizer_id
==
os
.
path
.
join
(
models_path_prefix
,
"intfloat/multilingual-e5-
small
"
)
assert
not
model_tokenizer
.
tokenizer_config
[
"do_lower_case"
]
assert
not
model_tokenizer
.
tokenizer_config
[
"do_lower_case"
]
def
check_model
(
model
):
def
check_model
(
model
):
...
...
tests/models/decoder_only/audio_language/test_ultravox.py
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
from
typing
import
List
,
Optional
,
Tuple
,
Type
from
typing
import
Optional
import
numpy
as
np
import
numpy
as
np
import
pytest
import
pytest
import
os
import
os
import
pytest_asyncio
import
pytest_asyncio
from
transformers
import
AutoModel
,
AutoTokenizer
,
BatchEncoding
from
transformers
import
AutoModel
,
AutoTokenizer
from
vllm.multimodal.audio
import
resample_audio
from
vllm.multimodal.audio
import
resample_audio
from
vllm.sequence
import
SampleLogprobs
from
vllm.sequence
import
SampleLogprobs
from
vllm.utils
import
STR_DTYPE_TO_TORCH_DTYPE
from
....conftest
import
HfRunner
,
VllmRunner
from
....conftest
import
HfRunner
,
VllmRunner
...
@@ -20,7 +19,7 @@ from ...utils import check_logprobs_close
...
@@ -20,7 +19,7 @@ from ...utils import check_logprobs_close
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"fixie-ai/ultravox-v0_5-llama-3_2-1b"
)
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"fixie-ai/ultravox-v0_5-llama-3_2-1b"
)
AudioTuple
=
T
uple
[
np
.
ndarray
,
int
]
AudioTuple
=
t
uple
[
np
.
ndarray
,
int
]
VLLM_PLACEHOLDER
=
"<|audio|>"
VLLM_PLACEHOLDER
=
"<|audio|>"
HF_PLACEHOLDER
=
"<|audio|>"
HF_PLACEHOLDER
=
"<|audio|>"
...
@@ -81,7 +80,7 @@ def _get_prompt(audio_count, question, placeholder):
...
@@ -81,7 +80,7 @@ def _get_prompt(audio_count, question, placeholder):
add_generation_prompt
=
True
)
add_generation_prompt
=
True
)
def
vllm_to_hf_output
(
vllm_output
:
T
uple
[
L
ist
[
int
],
str
,
def
vllm_to_hf_output
(
vllm_output
:
t
uple
[
l
ist
[
int
],
str
,
Optional
[
SampleLogprobs
]],
Optional
[
SampleLogprobs
]],
model
:
str
):
model
:
str
):
"""Sanitize vllm output to be comparable with hf output."""
"""Sanitize vllm output to be comparable with hf output."""
...
@@ -99,9 +98,9 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
...
@@ -99,9 +98,9 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
def
run_test
(
def
run_test
(
hf_runner
:
T
ype
[
HfRunner
],
hf_runner
:
t
ype
[
HfRunner
],
vllm_runner
:
T
ype
[
VllmRunner
],
vllm_runner
:
t
ype
[
VllmRunner
],
prompts_and_audios
:
L
ist
[
T
uple
[
str
,
str
,
AudioTuple
]],
prompts_and_audios
:
l
ist
[
t
uple
[
str
,
str
,
AudioTuple
]],
model
:
str
,
model
:
str
,
*
,
*
,
dtype
:
str
,
dtype
:
str
,
...
@@ -110,8 +109,6 @@ def run_test(
...
@@ -110,8 +109,6 @@ def run_test(
**
kwargs
,
**
kwargs
,
):
):
"""Inference result should be the same between hf and vllm."""
"""Inference result should be the same between hf and vllm."""
torch_dtype
=
STR_DTYPE_TO_TORCH_DTYPE
[
dtype
]
# NOTE: take care of the order. run vLLM first, and then run HF.
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# if we run HF first, the cuda initialization will be done and it
...
@@ -127,15 +124,7 @@ def run_test(
...
@@ -127,15 +124,7 @@ def run_test(
for
vllm_prompt
,
_
,
audio
in
prompts_and_audios
for
vllm_prompt
,
_
,
audio
in
prompts_and_audios
]
]
def
process
(
hf_inputs
:
BatchEncoding
,
**
kwargs
):
with
hf_runner
(
model
,
dtype
=
dtype
,
auto_cls
=
AutoModel
)
as
hf_model
:
hf_inputs
[
"audio_values"
]
=
hf_inputs
[
"audio_values"
]
\
.
to
(
torch_dtype
)
# type: ignore
return
hf_inputs
with
hf_runner
(
model
,
dtype
=
dtype
,
postprocess_inputs
=
process
,
auto_cls
=
AutoModel
)
as
hf_model
:
hf_outputs_per_audio
=
[
hf_outputs_per_audio
=
[
hf_model
.
generate_greedy_logprobs_limit
(
hf_model
.
generate_greedy_logprobs_limit
(
[
hf_prompt
],
[
hf_prompt
],
...
@@ -161,8 +150,8 @@ def run_test(
...
@@ -161,8 +150,8 @@ def run_test(
def
run_multi_audio_test
(
def
run_multi_audio_test
(
vllm_runner
:
T
ype
[
VllmRunner
],
vllm_runner
:
t
ype
[
VllmRunner
],
prompts_and_audios
:
L
ist
[
T
uple
[
str
,
L
ist
[
AudioTuple
]]],
prompts_and_audios
:
l
ist
[
t
uple
[
str
,
l
ist
[
AudioTuple
]]],
model
:
str
,
model
:
str
,
*
,
*
,
dtype
:
str
,
dtype
:
str
,
...
@@ -190,7 +179,7 @@ def run_multi_audio_test(
...
@@ -190,7 +179,7 @@ def run_multi_audio_test(
@
pytest
.
mark
.
core_model
@
pytest
.
mark
.
core_model
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"
half
"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"
bfloat16
"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"vllm_kwargs"
,
[
@
pytest
.
mark
.
parametrize
(
"vllm_kwargs"
,
[
...
...
tests/models/decoder_only/language/test_fp8.py
View file @
469e903b
...
@@ -11,12 +11,12 @@ import pytest
...
@@ -11,12 +11,12 @@ import pytest
from
tests.kernels.utils
import
override_backend_env_variable
from
tests.kernels.utils
import
override_backend_env_variable
from
tests.quantization.utils
import
is_quant_method_supported
from
tests.quantization.utils
import
is_quant_method_supported
from
vllm.platforms
import
current_platform
from
vllm.utils
import
STR_BACKEND_ENV_VAR
from
...utils
import
check_logprobs_close
from
...utils
import
check_logprobs_close
from
....utils
import
models_path_prefix
from
....utils
import
models_path_prefix
os
.
environ
[
"TOKENIZERS_PARALLELISM"
]
=
"true"
@
pytest
.
mark
.
quant_model
@
pytest
.
mark
.
quant_model
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"fp8"
),
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"fp8"
),
...
@@ -55,42 +55,107 @@ def test_models(
...
@@ -55,42 +55,107 @@ def test_models(
backend
:
str
,
backend
:
str
,
tensor_parallel_size
:
int
,
tensor_parallel_size
:
int
,
disable_async_output_proc
:
bool
,
disable_async_output_proc
:
bool
,
monkeypatch
,
monkeypatch
:
pytest
.
MonkeyPatch
,
)
->
None
:
"""
Only checks log probs match to cover the discrepancy in
numerical sensitive kernels.
"""
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"TOKENIZERS_PARALLELISM"
,
'true'
)
m
.
setenv
(
STR_BACKEND_ENV_VAR
,
backend
)
MAX_MODEL_LEN
=
1024
NUM_LOG_PROBS
=
8
with
vllm_runner
(
base_model
,
max_model_len
=
MAX_MODEL_LEN
,
tensor_parallel_size
=
tensor_parallel_size
,
enforce_eager
=
enforce_eager
,
kv_cache_dtype
=
"auto"
,
disable_async_output_proc
=
disable_async_output_proc
,
)
as
vllm_model
:
baseline_outputs
=
vllm_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
NUM_LOG_PROBS
)
with
vllm_runner
(
test_model
,
max_model_len
=
MAX_MODEL_LEN
,
tensor_parallel_size
=
tensor_parallel_size
,
enforce_eager
=
enforce_eager
,
kv_cache_dtype
=
kv_cache_dtype
,
disable_async_output_proc
=
disable_async_output_proc
,
)
as
vllm_model
:
test_outputs
=
vllm_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
NUM_LOG_PROBS
)
check_logprobs_close
(
outputs_0_lst
=
baseline_outputs
,
outputs_1_lst
=
test_outputs
,
name_0
=
"fp16_kv_cache"
,
name_1
=
"fp8_kv_cache"
,
)
@
pytest
.
mark
.
cpu_model
@
pytest
.
mark
.
skipif
(
not
current_platform
.
is_cpu
(),
reason
=
"test for the CPU backend."
)
@
pytest
.
mark
.
parametrize
(
"kv_cache_dtype,base_model,test_model"
,
[
# Test BF16 checkpoint w. fp8_e5m2 kv-cache.
(
"fp8_e5m2"
,
"meta-llama/Llama-3.2-1B-Instruct"
,
"meta-llama/Llama-3.2-1B-Instruct"
),
])
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
4
])
# Due to low-precision numerical divergence, this test is too sensitive for
# the async postprocessor
@
pytest
.
mark
.
parametrize
(
"disable_async_output_proc"
,
[
True
])
def
test_cpu_models
(
vllm_runner
,
example_prompts
,
kv_cache_dtype
:
str
,
base_model
:
str
,
test_model
:
str
,
max_tokens
:
int
,
disable_async_output_proc
:
bool
,
monkeypatch
:
pytest
.
MonkeyPatch
,
)
->
None
:
)
->
None
:
"""
"""
Only checks log probs match to cover the discrepancy in
Only checks log probs match to cover the discrepancy in
numerical sensitive kernels.
numerical sensitive kernels.
"""
"""
override_backend_env_variable
(
monkeypatch
,
backend
)
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"TOKENIZERS_PARALLELISM"
,
'true'
)
MAX_MODEL_LEN
=
1024
NUM_LOG_PROBS
=
8
MAX_MODEL_LEN
=
1024
NUM_LOG_PROBS
=
8
with
vllm_runner
(
base_model
,
with
vllm_runner
(
max_model_len
=
MAX_MODEL_LEN
,
base_model
,
tensor_parallel_size
=
tensor_parallel_size
,
max_model_len
=
MAX_MODEL_LEN
,
enforce_eager
=
enforce_eager
,
dtype
=
"bfloat16"
,
kv_cache_dtype
=
"auto"
,
kv_cache_dtype
=
"auto"
,
disable_async_output_proc
=
disable_async_output_proc
,
disable_async_output_proc
=
disable_async_output_proc
,
)
as
vllm_model
:
)
as
vllm_model
:
baseline_outputs
=
vllm_model
.
generate_greedy_logprobs
(
baseline_outputs
=
vllm_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
NUM_LOG_PROBS
)
example_prompts
,
max_tokens
,
NUM_LOG_PROBS
)
with
vllm_runner
(
with
vllm_runner
(
test_model
,
test_model
,
max_model_len
=
MAX_MODEL_LEN
,
max_model_len
=
MAX_MODEL_LEN
,
tensor_parallel_size
=
tensor_parallel_size
,
dtype
=
"bfloat16"
,
enforce_eager
=
enforce_eager
,
kv_cache_dtype
=
kv_cache_dtype
,
kv_cache_dtype
=
kv_cache_dtype
,
disable_async_output_proc
=
disable_async_output_proc
,
disable_async_output_proc
=
disable_async_output_proc
,
)
as
vllm_model
:
)
as
vllm_model
:
test_outputs
=
vllm_model
.
generate_greedy_logprobs
(
test_outputs
=
vllm_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
NUM_LOG_PROBS
)
example_prompts
,
max_tokens
,
NUM_LOG_PROBS
)
check_logprobs_close
(
check_logprobs_close
(
outputs_0_lst
=
baseline_outputs
,
outputs_0_lst
=
baseline_outputs
,
outputs_1_lst
=
test_outputs
,
outputs_1_lst
=
test_outputs
,
name_0
=
"bf16_kv_cache"
,
name_0
=
"fp16_kv_cache"
,
name_1
=
"fp8_kv_cache"
,
name_1
=
"fp8_kv_cache"
,
)
)
tests/models/decoder_only/language/test_gguf.py
View file @
469e903b
...
@@ -5,7 +5,7 @@ Note: To pass the test, quantization higher than Q4 should be used
...
@@ -5,7 +5,7 @@ Note: To pass the test, quantization higher than Q4 should be used
"""
"""
import
os
import
os
from
typing
import
List
,
NamedTuple
,
Type
from
typing
import
NamedTuple
import
pytest
import
pytest
from
huggingface_hub
import
hf_hub_download
from
huggingface_hub
import
hf_hub_download
...
@@ -91,8 +91,8 @@ MODELS = [
...
@@ -91,8 +91,8 @@ MODELS = [
@
pytest
.
mark
.
parametrize
(
"tp_size"
,
[
1
,
2
])
@
pytest
.
mark
.
parametrize
(
"tp_size"
,
[
1
,
2
])
def
test_models
(
def
test_models
(
num_gpus_available
:
int
,
num_gpus_available
:
int
,
vllm_runner
:
T
ype
[
VllmRunner
],
vllm_runner
:
t
ype
[
VllmRunner
],
example_prompts
:
L
ist
[
str
],
example_prompts
:
l
ist
[
str
],
model
:
GGUFTestConfig
,
model
:
GGUFTestConfig
,
dtype
:
str
,
dtype
:
str
,
max_tokens
:
int
,
max_tokens
:
int
,
...
@@ -111,16 +111,6 @@ def test_models(
...
@@ -111,16 +111,6 @@ def test_models(
example_prompts
=
tokenizer
.
apply_chat_template
(
example_prompts
=
tokenizer
.
apply_chat_template
(
messages
,
tokenize
=
False
,
add_generation_prompt
=
True
)
messages
,
tokenize
=
False
,
add_generation_prompt
=
True
)
# Run unquantized model.
with
vllm_runner
(
model_name
=
model
.
original_model
,
enforce_eager
=
True
,
# faster tests
dtype
=
dtype
,
max_model_len
=
MAX_MODEL_LEN
,
tensor_parallel_size
=
tp_size
)
as
original_model
:
original_outputs
=
original_model
.
generate_greedy_logprobs
(
example_prompts
[:
-
1
],
max_tokens
,
num_logprobs
)
# Run gguf model.
# Run gguf model.
with
vllm_runner
(
model_name
=
model
.
gguf_model
,
with
vllm_runner
(
model_name
=
model
.
gguf_model
,
enforce_eager
=
True
,
enforce_eager
=
True
,
...
@@ -131,6 +121,16 @@ def test_models(
...
@@ -131,6 +121,16 @@ def test_models(
gguf_outputs
=
gguf_model
.
generate_greedy_logprobs
(
gguf_outputs
=
gguf_model
.
generate_greedy_logprobs
(
example_prompts
[:
-
1
],
max_tokens
,
num_logprobs
)
example_prompts
[:
-
1
],
max_tokens
,
num_logprobs
)
# Run unquantized model.
with
vllm_runner
(
model_name
=
model
.
original_model
,
enforce_eager
=
True
,
# faster tests
dtype
=
dtype
,
max_model_len
=
MAX_MODEL_LEN
,
tensor_parallel_size
=
tp_size
)
as
original_model
:
original_outputs
=
original_model
.
generate_greedy_logprobs
(
example_prompts
[:
-
1
],
max_tokens
,
num_logprobs
)
check_logprobs_close
(
check_logprobs_close
(
outputs_0_lst
=
original_outputs
,
outputs_0_lst
=
original_outputs
,
outputs_1_lst
=
gguf_outputs
,
outputs_1_lst
=
gguf_outputs
,
...
...
tests/models/decoder_only/language/test_hybrid.py
View file @
469e903b
...
@@ -11,7 +11,9 @@ from ...utils import check_outputs_equal
...
@@ -11,7 +11,9 @@ from ...utils import check_outputs_equal
from
....utils
import
models_path_prefix
from
....utils
import
models_path_prefix
# This test is for the hybrid models
# This test is for the hybrid models
MODELS
=
[
os
.
path
.
join
(
models_path_prefix
,
"ai21labs/Jamba-tiny-dev"
),
os
.
path
.
join
(
models_path_prefix
,
"ibm-ai-platform/Bamba-9B"
)]
MODELS
=
[
os
.
path
.
join
(
models_path_prefix
,
"ai21labs/Jamba-tiny-dev"
),
os
.
path
.
join
(
models_path_prefix
,
"Zyphra/Zamba2-1.2B-instruct"
)]
# Bamba at Fp32 is too big for the CI (L4 GPU).
# MODELS = ["ai21labs/Jamba-tiny-dev", "ibm-ai-platform/Bamba-9B"]
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
...
@@ -27,29 +29,24 @@ def test_models(
...
@@ -27,29 +29,24 @@ def test_models(
)
->
None
:
)
->
None
:
# numeric error produces different generation
# numeric error produces different generation
if
'
Bamba
'
in
model
:
if
"
Bamba
"
in
model
:
example_prompts
.
pop
(
3
)
example_prompts
.
pop
(
3
)
with
hf_runner
(
model_kwargs
=
{
model
,
"use_mamba_kernels"
:
False
,
# mamba kernels are not installed so HF
dtype
=
dtype
,
# don't use them
model_kwargs
=
{
}
"use_mamba_kernels"
:
if
"Zamba2"
in
model
:
False
,
# mamba kernels are not installed so HF
# Zamba2 HF implementation automatically checks if mamba kernels are
# don't use them
# installed
})
as
hf_model
:
model_kwargs
=
{}
with
hf_runner
(
model
,
dtype
=
dtype
,
model_kwargs
=
model_kwargs
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
# This test is for verifying whether the model's extra_repr
# can be printed correctly.
def
print_model
(
model
):
print
(
model
)
vllm_model
.
apply_model
(
print_model
)
for
i
in
range
(
len
(
example_prompts
)):
for
i
in
range
(
len
(
example_prompts
)):
hf_output_ids
,
hf_output_str
=
hf_outputs
[
i
]
hf_output_ids
,
hf_output_str
=
hf_outputs
[
i
]
vllm_output_ids
,
vllm_output_str
=
vllm_outputs
[
i
]
vllm_output_ids
,
vllm_output_str
=
vllm_outputs
[
i
]
...
@@ -119,26 +116,31 @@ def test_mamba_prefill_chunking_with_parallel_sampling(
...
@@ -119,26 +116,31 @@ def test_mamba_prefill_chunking_with_parallel_sampling(
def
test_mamba_prefill_chunking
(
hf_runner
,
vllm_runner
,
example_prompts
,
def
test_mamba_prefill_chunking
(
hf_runner
,
vllm_runner
,
example_prompts
,
model
:
str
,
dtype
:
str
,
model
:
str
,
dtype
:
str
,
max_tokens
:
int
)
->
None
:
max_tokens
:
int
)
->
None
:
# numeric error during prefill chu
c
king produces different generation
# numeric error during prefill chu
n
king produces different generation
# compared to w/o prefill chunking for those examples, removed them for now
# compared to w/o prefill chunking for those examples, removed them for now
if
'
Jamba
'
in
model
:
if
"
Jamba
"
in
model
:
example_prompts
.
pop
(
7
)
example_prompts
.
pop
(
7
)
example_prompts
.
pop
(
2
)
example_prompts
.
pop
(
2
)
example_prompts
.
pop
(
1
)
example_prompts
.
pop
(
1
)
elif
'
Bamba
'
in
model
:
elif
"
Bamba
"
in
model
:
example_prompts
.
pop
(
6
)
example_prompts
.
pop
(
6
)
example_prompts
.
pop
(
3
)
example_prompts
.
pop
(
3
)
example_prompts
.
pop
(
2
)
example_prompts
.
pop
(
2
)
dtype
=
"half"
# use a different dtype for Bamba
dtype
=
"half"
# use a different dtype for Bamba
elif
"Zamba2"
in
model
:
with
hf_runner
(
example_prompts
.
pop
(
7
)
model
,
dtype
=
"half"
dtype
=
dtype
,
model_kwargs
=
{
model_kwargs
=
{
"use_mamba_kernels"
:
"use_mamba_kernels"
:
False
,
# mamba kernels are not installed so HF
False
,
# mamba kernels are not installed so HF
# don't use them
# don't use them
}
})
as
hf_model
:
if
"Zamba2"
in
model
:
# Zamba2 HF implementation automatically checks if mamba kernels are
# installed
model_kwargs
=
{}
with
hf_runner
(
model
,
dtype
=
dtype
,
model_kwargs
=
model_kwargs
)
as
hf_model
:
non_chunked
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
non_chunked
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
with
vllm_runner
(
model
,
with
vllm_runner
(
model
,
...
@@ -194,6 +196,7 @@ def test_parallel_sampling(
...
@@ -194,6 +196,7 @@ def test_parallel_sampling(
)
)
@
pytest
.
mark
.
skip
(
reason
=
"RE-ENABLE: test is currently failing on main."
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
20
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
20
])
...
@@ -295,6 +298,7 @@ def test_state_cleanup(
...
@@ -295,6 +298,7 @@ def test_state_cleanup(
"could be related to finished_requests_ids"
)
"could be related to finished_requests_ids"
)
@
pytest
.
mark
.
skip
(
reason
=
"RE-ENABLE: test is currently failing on main."
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"float"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"float"
])
def
test_multistep
(
def
test_multistep
(
...
@@ -310,6 +314,7 @@ def test_multistep(
...
@@ -310,6 +314,7 @@ def test_multistep(
vllm_model
.
generate_greedy
([
example_prompts
[
0
]]
*
10
,
1
)
vllm_model
.
generate_greedy
([
example_prompts
[
0
]]
*
10
,
1
)
@
pytest
.
mark
.
skip
(
reason
=
"RE-ENABLE: test is currently failing on main."
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"float"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"float"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
64
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
64
])
...
...
tests/models/decoder_only/language/test_mamba.py
View file @
469e903b
...
@@ -70,13 +70,6 @@ def test_models(
...
@@ -70,13 +70,6 @@ def test_models(
with
vllm_runner
(
model
,
dtype
=
dtype
,
max_num_seqs
=
16
)
as
vllm_model
:
with
vllm_runner
(
model
,
dtype
=
dtype
,
max_num_seqs
=
16
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
# This test is for verifying whether the model's extra_repr
# can be printed correctly.
def
print_model
(
model
):
print
(
model
)
vllm_model
.
apply_model
(
print_model
)
for
i
in
range
(
len
(
example_prompts
)):
for
i
in
range
(
len
(
example_prompts
)):
hf_output_ids
,
hf_output_str
=
hf_outputs
[
i
]
hf_output_ids
,
hf_output_str
=
hf_outputs
[
i
]
vllm_output_ids
,
vllm_output_str
=
vllm_outputs
[
i
]
vllm_output_ids
,
vllm_output_str
=
vllm_outputs
[
i
]
...
...
tests/models/decoder_only/language/test_mistral.py
View file @
469e903b
...
@@ -203,6 +203,7 @@ def test_models(
...
@@ -203,6 +203,7 @@ def test_models(
)
)
@
pytest
.
mark
.
skip
(
"RE-ENABLE: test is currently failing on main."
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MISTRAL_FORMAT_MODELS
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MISTRAL_FORMAT_MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
64
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
64
])
...
@@ -215,16 +216,6 @@ def test_mistral_format(
...
@@ -215,16 +216,6 @@ def test_mistral_format(
max_tokens
:
int
,
max_tokens
:
int
,
num_logprobs
:
int
,
num_logprobs
:
int
,
)
->
None
:
)
->
None
:
with
vllm_runner
(
model
,
dtype
=
dtype
,
tokenizer_mode
=
"auto"
,
load_format
=
"safetensors"
,
config_format
=
"hf"
,
)
as
hf_format_model
:
hf_format_outputs
=
hf_format_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
num_logprobs
)
with
vllm_runner
(
with
vllm_runner
(
model
,
model
,
dtype
=
dtype
,
dtype
=
dtype
,
...
@@ -235,6 +226,16 @@ def test_mistral_format(
...
@@ -235,6 +226,16 @@ def test_mistral_format(
mistral_format_outputs
=
mistral_format_model
.
generate_greedy_logprobs
(
mistral_format_outputs
=
mistral_format_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
num_logprobs
)
example_prompts
,
max_tokens
,
num_logprobs
)
with
vllm_runner
(
model
,
dtype
=
dtype
,
tokenizer_mode
=
"auto"
,
load_format
=
"safetensors"
,
config_format
=
"hf"
,
)
as
hf_format_model
:
hf_format_outputs
=
hf_format_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
num_logprobs
)
check_logprobs_close
(
check_logprobs_close
(
outputs_0_lst
=
hf_format_outputs
,
outputs_0_lst
=
hf_format_outputs
,
outputs_1_lst
=
mistral_format_outputs
,
outputs_1_lst
=
mistral_format_outputs
,
...
@@ -263,6 +264,7 @@ def test_mistral_symbolic_languages(
...
@@ -263,6 +264,7 @@ def test_mistral_symbolic_languages(
assert
"�"
not
in
outputs
[
0
].
outputs
[
0
].
text
.
strip
()
assert
"�"
not
in
outputs
[
0
].
outputs
[
0
].
text
.
strip
()
@
pytest
.
mark
.
skip
(
"RE-ENABLE: test is currently failing on main."
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"model"
,
@
pytest
.
mark
.
parametrize
(
"model"
,
MISTRAL_FORMAT_MODELS
)
# v1 can't do func calling
MISTRAL_FORMAT_MODELS
)
# v1 can't do func calling
...
...
tests/models/decoder_only/language/test_modelopt.py
View file @
469e903b
...
@@ -5,7 +5,6 @@
...
@@ -5,7 +5,6 @@
Note: these tests will only pass on H100
Note: these tests will only pass on H100
"""
"""
import
os
import
os
from
typing
import
List
import
pytest
import
pytest
from
transformers
import
AutoTokenizer
from
transformers
import
AutoTokenizer
...
@@ -66,7 +65,7 @@ def test_models(example_prompts, model_name) -> None:
...
@@ -66,7 +65,7 @@ def test_models(example_prompts, model_name) -> None:
for
prompt
in
example_prompts
for
prompt
in
example_prompts
]
]
params
=
SamplingParams
(
max_tokens
=
20
,
temperature
=
0
)
params
=
SamplingParams
(
max_tokens
=
20
,
temperature
=
0
)
generations
:
L
ist
[
str
]
=
[]
generations
:
l
ist
[
str
]
=
[]
# Note: these need to be run 1 at a time due to numerical precision,
# Note: these need to be run 1 at a time due to numerical precision,
# since the expected strs were generated this way.
# since the expected strs were generated this way.
for
prompt
in
formatted_prompts
:
for
prompt
in
formatted_prompts
:
...
...
tests/models/decoder_only/language/test_models.py
View file @
469e903b
...
@@ -3,13 +3,37 @@
...
@@ -3,13 +3,37 @@
Run `pytest tests/models/test_models.py`.
Run `pytest tests/models/test_models.py`.
"""
"""
import
pytest
import
pytest
import
os
import
os
import
torch
from
vllm.platforms
import
current_platform
from
...utils
import
check_logprobs_close
from
...utils
import
check_logprobs_close
from
....utils
import
models_path_prefix
from
....utils
import
models_path_prefix
# These have unsupported head_dim for FA. We do not
# not have a clean way to fall back, so we fail with
# a clear msg when it happens.
# https://github.com/vllm-project/vllm/issues/14524
REQUIRES_V0
=
[
"microsoft/phi-2"
,
"stabilityai/stablelm-3b-4e1t"
]
# This list contains the model that are using AITER kernel.
# Skip model that are not using AITER tests.
# When more AITER kernels are added, this list will not be
# needed as all the models will be calling AITER kernels
# in parts of the operators
AITER_MODEL_LIST
=
[
"meta-llama/Llama-3.2-1B-Instruct"
,
"openbmb/MiniCPM3-4B"
,
"Qwen/Qwen-7B"
,
"Qwen/Qwen2.5-0.5B-Instruct"
,
"ehristoforu/Falcon3-MoE-2x7B-Insruct"
,
]
# @maybe_test_rocm_aiter
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"model"
,
"model"
,
[
[
...
@@ -65,15 +89,23 @@ from ....utils import models_path_prefix
...
@@ -65,15 +89,23 @@ from ....utils import models_path_prefix
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
32
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
32
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
def
test_models
(
@
pytest
.
mark
.
parametrize
(
hf_runner
,
"use_rocm_aiter"
,
[
True
,
False
]
if
current_platform
.
is_rocm
()
else
[
False
])
vllm_runner
,
def
test_models
(
hf_runner
,
vllm_runner
,
example_prompts
,
model
:
str
,
example_prompts
,
dtype
:
str
,
max_tokens
:
int
,
num_logprobs
:
int
,
model
:
str
,
use_rocm_aiter
:
bool
,
monkeypatch
)
->
None
:
dtype
:
str
,
max_tokens
:
int
,
if
model
in
REQUIRES_V0
:
num_logprobs
:
int
,
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"0"
)
)
->
None
:
if
use_rocm_aiter
and
(
model
in
AITER_MODEL_LIST
):
monkeypatch
.
setenv
(
"VLLM_ROCM_USE_AITER"
,
"1"
)
elif
use_rocm_aiter
and
model
not
in
AITER_MODEL_LIST
:
# Skip model that are not using AITER tests.
# When more AITER kernels are added, this list will not be
# needed as all the models will be calling AITER kernels
# in parts of the operators
pytest
.
skip
(
f
"Skipping '
{
model
}
' model test with AITER kernel."
)
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
if
model
.
startswith
(
"THUDM/chatglm3"
):
if
model
.
startswith
(
"THUDM/chatglm3"
):
...
@@ -87,16 +119,16 @@ def test_models(
...
@@ -87,16 +119,16 @@ def test_models(
vllm_outputs
=
vllm_model
.
generate_greedy_logprobs
(
vllm_outputs
=
vllm_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
num_logprobs
)
example_prompts
,
max_tokens
,
num_logprobs
)
# This test is for verifying whether the model's extra_repr
# can be printed correctly.
def
print_model
(
model
):
print
(
model
)
vllm_model
.
apply_model
(
print_model
)
check_logprobs_close
(
check_logprobs_close
(
outputs_0_lst
=
hf_outputs
,
outputs_0_lst
=
hf_outputs
,
outputs_1_lst
=
vllm_outputs
,
outputs_1_lst
=
vllm_outputs
,
name_0
=
"hf"
,
name_0
=
"hf"
,
name_1
=
"vllm"
,
name_1
=
"vllm"
,
)
)
if
use_rocm_aiter
:
# this is to ensure that vllm engine
# has deallocated the memory before running the next
# unit tests. On ROCm, when using AITER
# the memory might not be deallocated completely
# before running the next test case
torch
.
cuda
.
synchronize
()
tests/models/decoder_only/language/test_nvfp4.py
0 → 100644
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
# flake8: noqa
"""Tests Model Optimizer nvfp4 models against ground truth generation
Note: these tests will only pass on B200
"""
import
os
from
typing
import
List
import
pytest
from
transformers
import
AutoTokenizer
from
tests.quantization.utils
import
is_quant_method_supported
from
vllm
import
LLM
,
SamplingParams
os
.
environ
[
"TOKENIZERS_PARALLELISM"
]
=
"true"
MAX_MODEL_LEN
=
1024
MODELS
=
[
"nvidia/Llama-3.3-70B-Instruct-FP4"
]
EXPECTED_STRS_MAP
=
{
"nvidia/Llama-3.3-70B-Instruct-FP4"
:
[
'vLLM (Vectorized Large Language Model) is indeed a high-throughput and memory-efficient inference'
,
'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to '
,
'Artificial intelligence (AI) and human intelligence (HI) are two distinct forms of intelligence that process'
,
'A neural network is a type of machine learning model inspired by the structure and function of the human brain'
,
'In the heart of a cutting-edge robotics lab, a team of engineers had been working tirelessly to push'
,
'The COVID-19 pandemic has had a profound impact on global economic structures and future business models, leading'
,
'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of'
,
'Here are the translations:
\n\n
* Japanese: (Sasuga no tori ga miwa o ts'
]
}
# This test compares against golden strings for exact match since
# there is no baseline implementation to compare against
# and is unstable w.r.t specifics of the fp4 implementation or
# the hardware being run on.
# Disabled to prevent it from breaking the build
@pytest.mark.skip(
    reason=
    "Prevent unstable test based on golden strings from breaking the build "
    " and test input model being too large and hanging the system.")
@pytest.mark.quant_model
@pytest.mark.skipif(not is_quant_method_supported("nvfp4"),
                    reason="nvfp4 is not supported on this GPU type.")
@pytest.mark.parametrize("model_name", MODELS)
def test_models(example_prompts, model_name) -> None:
    """Compare greedy nvfp4 generations against the stored golden strings.

    Each example prompt is wrapped in the model's chat template, generated
    greedily (temperature 0, at most 20 new tokens) and compared for exact
    equality with the matching entry of ``EXPECTED_STRS_MAP[model_name]``.

    Args:
        example_prompts: fixture providing the raw user prompts.
        model_name: checkpoint to load; parametrized over ``MODELS``.
    """
    model = LLM(
        model=model_name,
        max_model_len=MAX_MODEL_LEN,
        trust_remote_code=True,
        enforce_eager=True,
        quantization="nvfp4",
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    formatted_prompts = [
        tokenizer.apply_chat_template([{
            "role": "user",
            "content": prompt
        }],
                                      tokenize=False,
                                      add_generation_prompt=True)
        for prompt in example_prompts
    ]
    params = SamplingParams(max_tokens=20, temperature=0)
    generations: List[str] = []
    # Note: these need to be run 1 at a time due to numerical precision,
    # since the expected strs were generated this way.
    for prompt in formatted_prompts:
        outputs = model.generate(prompt, params)
        generations.append(outputs[0].outputs[0].text)
    # Drop the only reference to the engine before the comparisons below.
    del model

    print(model_name, generations)
    expected_strs = EXPECTED_STRS_MAP[model_name]
    # Fail with a readable message if the golden data and the prompt fixture
    # drift apart, instead of an IndexError or silently unchecked entries.
    assert len(expected_strs) == len(generations), (
        f"{model_name}: {len(expected_strs)} expected strings but "
        f"{len(generations)} generations")
    for i, (expected_str,
            generated_str) in enumerate(zip(expected_strs, generations)):
        assert expected_str == generated_str, (
            f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}")
tests/models/decoder_only/vision_language/test_awq.py
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
from
typing
import
List
,
Optional
,
Type
from
typing
import
Optional
import
os
import
os
import
pytest
import
pytest
...
@@ -21,12 +21,12 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
...
@@ -21,12 +21,12 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
def
run_awq_test
(
def
run_awq_test
(
vllm_runner
:
T
ype
[
VllmRunner
],
vllm_runner
:
t
ype
[
VllmRunner
],
image_assets
:
_ImageAssets
,
image_assets
:
_ImageAssets
,
source_model
:
str
,
source_model
:
str
,
quant_model
:
str
,
quant_model
:
str
,
*
,
*
,
size_factors
:
L
ist
[
float
],
size_factors
:
l
ist
[
float
],
dtype
:
str
,
dtype
:
str
,
max_tokens
:
int
,
max_tokens
:
int
,
num_logprobs
:
int
,
num_logprobs
:
int
,
...
@@ -110,7 +110,12 @@ def run_awq_test(
...
@@ -110,7 +110,12 @@ def run_awq_test(
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
def
test_awq_models
(
vllm_runner
,
image_assets
,
source_model
,
quant_model
,
def
test_awq_models
(
vllm_runner
,
image_assets
,
source_model
,
quant_model
,
size_factors
,
dtype
,
max_tokens
,
num_logprobs
)
->
None
:
size_factors
,
dtype
,
max_tokens
,
num_logprobs
,
monkeypatch
)
->
None
:
# Test V1: this test hangs during setup on single-scale input.
# TODO: figure out why and re-enable this on V1.
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"0"
)
run_awq_test
(
run_awq_test
(
vllm_runner
,
vllm_runner
,
image_assets
,
image_assets
,
...
...
tests/models/decoder_only/vision_language/test_models.py
View file @
469e903b
...
@@ -6,12 +6,11 @@ import math
...
@@ -6,12 +6,11 @@ import math
import
os
import
os
from
collections
import
defaultdict
from
collections
import
defaultdict
from
pathlib
import
PosixPath
from
pathlib
import
PosixPath
from
typing
import
Type
import
os
import
os
import
pytest
import
pytest
from
packaging.version
import
Version
from
packaging.version
import
Version
from
transformers
import
AutoModelForVision2Seq
from
transformers
import
AutoModelForImageTextToText
,
AutoModelForVision2Seq
from
transformers
import
__version__
as
TRANSFORMERS_VERSION
from
transformers
import
__version__
as
TRANSFORMERS_VERSION
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
...
@@ -19,7 +18,7 @@ from vllm.utils import identity
...
@@ -19,7 +18,7 @@ from vllm.utils import identity
from
....conftest
import
(
IMAGE_ASSETS
,
HfRunner
,
VllmRunner
,
_ImageAssets
,
from
....conftest
import
(
IMAGE_ASSETS
,
HfRunner
,
VllmRunner
,
_ImageAssets
,
_VideoAssets
)
_VideoAssets
)
from
....utils
import
(
fork
_new_process_for_each_test
,
large_gpu_mark
,
from
....utils
import
(
create
_new_process_for_each_test
,
large_gpu_mark
,
multi_gpu_marks
)
multi_gpu_marks
)
from
...utils
import
check_outputs_equal
from
...utils
import
check_outputs_equal
from
.vlm_utils
import
custom_inputs
,
model_utils
,
runners
from
.vlm_utils
import
custom_inputs
,
model_utils
,
runners
...
@@ -35,6 +34,16 @@ from ....utils import models_path_prefix
...
@@ -35,6 +34,16 @@ from ....utils import models_path_prefix
if
current_platform
.
is_rocm
():
if
current_platform
.
is_rocm
():
os
.
environ
[
"VLLM_USE_TRITON_FLASH_ATTN"
]
=
"0"
os
.
environ
[
"VLLM_USE_TRITON_FLASH_ATTN"
]
=
"0"
REQUIRES_V0_MODELS
=
[
# V1 Test: no way to fall back for head_dim = 80
# https://github.com/vllm-project/vllm/issues/14524
"qwen_vl"
,
"h2ovl"
,
"blip2"
,
# V1 Test: not enough KV cache space in C1.
"fuyu"
,
]
# yapf: disable
# yapf: disable
COMMON_BROADCAST_SETTINGS
=
{
COMMON_BROADCAST_SETTINGS
=
{
"test_type"
:
VLMTestType
.
IMAGE
,
"test_type"
:
VLMTestType
.
IMAGE
,
...
@@ -94,7 +103,7 @@ VLM_TEST_SETTINGS = {
...
@@ -94,7 +103,7 @@ VLM_TEST_SETTINGS = {
prompt_formatter
=
lambda
img_prompt
:
f
"USER:
{
img_prompt
}
\n
ASSISTANT:"
,
prompt_formatter
=
lambda
img_prompt
:
f
"USER:
{
img_prompt
}
\n
ASSISTANT:"
,
convert_assets_to_embeddings
=
model_utils
.
get_llava_embeddings
,
convert_assets_to_embeddings
=
model_utils
.
get_llava_embeddings
,
max_model_len
=
4096
,
max_model_len
=
4096
,
auto_cls
=
AutoModelFor
Vision2Seq
,
auto_cls
=
AutoModelFor
ImageTextToText
,
vllm_output_post_proc
=
model_utils
.
llava_image_vllm_to_hf_output
,
vllm_output_post_proc
=
model_utils
.
llava_image_vllm_to_hf_output
,
custom_test_opts
=
[
CustomTestOptions
(
custom_test_opts
=
[
CustomTestOptions
(
inputs
=
custom_inputs
.
multi_image_multi_aspect_ratio_inputs
(
inputs
=
custom_inputs
.
multi_image_multi_aspect_ratio_inputs
(
...
@@ -114,14 +123,10 @@ VLM_TEST_SETTINGS = {
...
@@ -114,14 +123,10 @@ VLM_TEST_SETTINGS = {
"stop_sign"
:
"caption es"
,
"stop_sign"
:
"caption es"
,
"cherry_blossom"
:
"What is in the picture?"
,
"cherry_blossom"
:
"What is in the picture?"
,
}),
}),
auto_cls
=
AutoModelForVision2Seq
,
auto_cls
=
AutoModelForImageTextToText
,
postprocess_inputs
=
model_utils
.
cast_dtype_post_processor
(
"pixel_values"
),
vllm_output_post_proc
=
model_utils
.
paligemma_vllm_to_hf_output
,
vllm_output_post_proc
=
model_utils
.
paligemma_vllm_to_hf_output
,
dtype
=
(
"half"
if
current_platform
.
is_cpu
()
or
current_platform
.
is_rocm
()
dtype
=
"bfloat16"
,
else
(
"half"
,
"float"
)),
marks
=
[
pytest
.
mark
.
skip
(
reason
=
"vLLM does not support PrefixLM attention mask"
)],
# noqa: E501
marks
=
[
pytest
.
mark
.
core_model
],
),
),
# TODO(ywang96): Move Qwen2-VL out of core models in favor of Qwen2.5-VL
# TODO(ywang96): Move Qwen2-VL out of core models in favor of Qwen2.5-VL
# once we upgraded to transformers>=4.49.0.
# once we upgraded to transformers>=4.49.0.
...
@@ -160,30 +165,30 @@ VLM_TEST_SETTINGS = {
...
@@ -160,30 +165,30 @@ VLM_TEST_SETTINGS = {
marks
=
[
pytest
.
mark
.
core_model
,
pytest
.
mark
.
cpu_model
],
marks
=
[
pytest
.
mark
.
core_model
,
pytest
.
mark
.
cpu_model
],
),
),
#### Extended model tests
#### Extended model tests
"aria"
:
VLMTestInfo
(
#
"aria": VLMTestInfo(
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"rhymes-ai/Aria"
)
],
#
models=["rhymes-ai/Aria"],
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
MULTI_IMAGE
),
#
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter
=
lambda
img_prompt
:
f
"<|im_start|>user
\n
{
img_prompt
}
<|im_end|>
\n
<|im_start|>assistant
\n
"
,
# noqa: E501
#
prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501
img_idx_to_prompt
=
lambda
idx
:
"<fim_prefix><|img|><fim_suffix>
\n
"
,
#
img_idx_to_prompt=lambda idx: "<fim_prefix><|img|><fim_suffix>\n",
max_model_len
=
4096
,
#
max_model_len=4096,
max_num_seqs
=
2
,
#
max_num_seqs=2,
single_image_prompts
=
IMAGE_ASSETS
.
prompts
({
#
auto_cls=AutoModelForImageTextToText,
"stop_sign"
:
"<vlm_image>Please describe the image shortly."
,
#
single_image_prompts=IMAGE_ASSETS.prompts({
"
cherry_blossom
"
:
"<vlm_image>Please
infer the season with reason
."
,
#
"
stop_sign
": "<vlm_image>Please
describe the image shortly
.",
}),
#
"cherry_blossom": "<vlm_image>Please infer the season with reason.", # noqa: E501
multi_image_prompt
=
"<vlm_image><vlm_image>Describe the two images shortly."
,
# noqa: E501
#
}),
postprocess_inputs
=
model_utils
.
cast_dtype_post_processor
(
"pixel_values"
),
#
multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.", # noqa: E501
stop_str
=
[
"<|im_end|>"
],
#
stop_str=["<|im_end|>"],
image_size_factors
=
[(
0.10
,
0.15
)],
#
image_size_factors=[(0.10, 0.15)],
max_tokens
=
64
,
#
max_tokens=64,
marks
=
[
large_gpu_mark
(
min_gb
=
64
)],
#
marks=[large_gpu_mark(min_gb=64)],
),
#
),
"blip2"
:
VLMTestInfo
(
"blip2"
:
VLMTestInfo
(
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"Salesforce/blip2-opt-2.7b"
)],
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"Salesforce/blip2-opt-2.7b"
)],
test_type
=
VLMTestType
.
IMAGE
,
test_type
=
VLMTestType
.
IMAGE
,
prompt_formatter
=
lambda
img_prompt
:
f
"Question:
{
img_prompt
}
Answer:"
,
prompt_formatter
=
lambda
img_prompt
:
f
"Question:
{
img_prompt
}
Answer:"
,
img_idx_to_prompt
=
lambda
idx
:
""
,
img_idx_to_prompt
=
lambda
idx
:
""
,
auto_cls
=
AutoModelFor
Vision2Seq
,
auto_cls
=
AutoModelFor
ImageTextToText
,
vllm_output_post_proc
=
model_utils
.
blip2_vllm_to_hf_output
,
vllm_output_post_proc
=
model_utils
.
blip2_vllm_to_hf_output
,
),
),
"chameleon"
:
VLMTestInfo
(
"chameleon"
:
VLMTestInfo
(
...
@@ -192,10 +197,7 @@ VLM_TEST_SETTINGS = {
...
@@ -192,10 +197,7 @@ VLM_TEST_SETTINGS = {
prompt_formatter
=
lambda
img_prompt
:
f
"USER:
{
img_prompt
}
\n
ASSISTANT:"
,
prompt_formatter
=
lambda
img_prompt
:
f
"USER:
{
img_prompt
}
\n
ASSISTANT:"
,
max_model_len
=
4096
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
max_num_seqs
=
2
,
auto_cls
=
AutoModelForVision2Seq
,
auto_cls
=
AutoModelForImageTextToText
,
postprocess_inputs
=
model_utils
.
cast_dtype_post_processor
(
"pixel_values"
),
# For chameleon, we only compare the sequences
# For chameleon, we only compare the sequences
vllm_output_post_proc
=
lambda
vllm_output
,
model
:
vllm_output
[:
2
],
vllm_output_post_proc
=
lambda
vllm_output
,
model
:
vllm_output
[:
2
],
hf_output_post_proc
=
lambda
hf_output
,
model
:
hf_output
[:
2
],
hf_output_post_proc
=
lambda
hf_output
,
model
:
hf_output
[:
2
],
...
@@ -215,7 +217,6 @@ VLM_TEST_SETTINGS = {
...
@@ -215,7 +217,6 @@ VLM_TEST_SETTINGS = {
}),
}),
multi_image_prompt
=
"image_1:<image>
\n
image_2:<image>
\n
Which image can we see the car and the tower?"
,
# noqa: E501
multi_image_prompt
=
"image_1:<image>
\n
image_2:<image>
\n
Which image can we see the car and the tower?"
,
# noqa: E501
patch_hf_runner
=
model_utils
.
deepseekvl2_patch_hf_runner
,
patch_hf_runner
=
model_utils
.
deepseekvl2_patch_hf_runner
,
postprocess_inputs
=
model_utils
.
cast_dtype_post_processor
(
"images"
),
hf_output_post_proc
=
model_utils
.
deepseekvl2_trunc_hf_output
,
hf_output_post_proc
=
model_utils
.
deepseekvl2_trunc_hf_output
,
stop_str
=
[
"<|end▁of▁sentence|>"
,
"<|begin▁of▁sentence|>"
],
# noqa: E501
stop_str
=
[
"<|end▁of▁sentence|>"
,
"<|begin▁of▁sentence|>"
],
# noqa: E501
image_size_factors
=
[(),
(
1.0
,
),
(
1.0
,
1.0
,
1.0
),
(
0.1
,
0.5
,
1.0
)],
image_size_factors
=
[(),
(
1.0
,
),
(
1.0
,
1.0
,
1.0
),
(
0.1
,
0.5
,
1.0
)],
...
@@ -233,21 +234,44 @@ VLM_TEST_SETTINGS = {
...
@@ -233,21 +234,44 @@ VLM_TEST_SETTINGS = {
img_idx_to_prompt
=
lambda
idx
:
""
,
img_idx_to_prompt
=
lambda
idx
:
""
,
max_model_len
=
2048
,
max_model_len
=
2048
,
max_num_seqs
=
2
,
max_num_seqs
=
2
,
auto_cls
=
AutoModelForImageTextToText
,
use_tokenizer_eos
=
True
,
use_tokenizer_eos
=
True
,
vllm_output_post_proc
=
model_utils
.
fuyu_vllm_to_hf_output
,
vllm_output_post_proc
=
model_utils
.
fuyu_vllm_to_hf_output
,
num_logprobs
=
10
,
num_logprobs
=
10
,
image_size_factors
=
[(),
(
0.25
,),
(
0.25
,
0.25
,
0.25
),
(
0.25
,
0.2
,
0.15
)],
image_size_factors
=
[(),
(
0.25
,),
(
0.25
,
0.25
,
0.25
),
(
0.25
,
0.2
,
0.15
)],
),
),
"glm4"
:
VLMTestInfo
(
"gemma3"
:
VLMTestInfo
(
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"google/gemma-3-4b-it"
)],
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
MULTI_IMAGE
),
prompt_formatter
=
lambda
img_prompt
:
f
"<bos><start_of_turn>user
\n
{
img_prompt
}
<end_of_turn>
\n
<start_of_turn>model
\n
"
,
# noqa: E501
single_image_prompts
=
IMAGE_ASSETS
.
prompts
({
"stop_sign"
:
"<start_of_image>What's the content in the center of the image?"
,
# noqa: E501
"cherry_blossom"
:
"<start_of_image>What is the season?"
,
# noqa: E501
}),
multi_image_prompt
=
"<start_of_image><start_of_image>Describe the two images in detail."
,
# noqa: E501
max_model_len
=
4096
,
max_num_seqs
=
2
,
auto_cls
=
AutoModelForImageTextToText
,
vllm_runner_kwargs
=
{
"mm_processor_kwargs"
:
{
"do_pan_and_scan"
:
True
}},
patch_hf_runner
=
model_utils
.
gemma3_patch_hf_runner
,
),
"glm4v"
:
VLMTestInfo
(
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"THUDM/glm-4v-9b"
)],
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"THUDM/glm-4v-9b"
)],
test_type
=
VLMTestType
.
IMAGE
,
test_type
=
VLMTestType
.
IMAGE
,
prompt_formatter
=
identity
,
prompt_formatter
=
lambda
img_prompt
:
f
"<|user|>
\n
{
img_prompt
}
<|assistant|>"
,
# noqa: E501
img_idx_to_prompt
=
lambda
idx
:
""
,
single_image_prompts
=
IMAGE_ASSETS
.
prompts
({
"stop_sign"
:
"<|begin_of_image|><|endoftext|><|end_of_image|>What's the content in the center of the image?"
,
# noqa: E501
"cherry_blossom"
:
"<|begin_of_image|><|endoftext|><|end_of_image|>What is the season?"
,
# noqa: E501
}),
max_model_len
=
2048
,
max_model_len
=
2048
,
max_num_seqs
=
2
,
max_num_seqs
=
2
,
dtype
=
"bfloat16"
,
get_stop_token_ids
=
lambda
tok
:
[
151329
,
151336
,
151338
],
get_stop_token_ids
=
lambda
tok
:
[
151329
,
151336
,
151338
],
patch_hf_runner
=
model_utils
.
glm_patch_hf_runner
,
patch_hf_runner
=
model_utils
.
glm4v_patch_hf_runner
,
# The image embeddings match with HF but the outputs of the language
# decoder are only consistent up to 2 decimal places.
# So, we need to reduce the number of tokens for the test to pass.
max_tokens
=
8
,
num_logprobs
=
10
,
marks
=
[
large_gpu_mark
(
min_gb
=
32
)],
marks
=
[
large_gpu_mark
(
min_gb
=
32
)],
),
),
"h2ovl"
:
VLMTestInfo
(
"h2ovl"
:
VLMTestInfo
(
...
@@ -263,7 +287,6 @@ VLM_TEST_SETTINGS = {
...
@@ -263,7 +287,6 @@ VLM_TEST_SETTINGS = {
}),
}),
multi_image_prompt
=
"Image-1: <image>
\n
Image-2: <image>
\n
Describe the two images in short."
,
# noqa: E501
multi_image_prompt
=
"Image-1: <image>
\n
Image-2: <image>
\n
Describe the two images in short."
,
# noqa: E501
max_model_len
=
8192
,
max_model_len
=
8192
,
dtype
=
"bfloat16"
,
use_tokenizer_eos
=
True
,
use_tokenizer_eos
=
True
,
num_logprobs
=
10
,
num_logprobs
=
10
,
patch_hf_runner
=
model_utils
.
h2ovl_patch_hf_runner
,
patch_hf_runner
=
model_utils
.
h2ovl_patch_hf_runner
,
...
@@ -275,7 +298,7 @@ VLM_TEST_SETTINGS = {
...
@@ -275,7 +298,7 @@ VLM_TEST_SETTINGS = {
img_idx_to_prompt
=
lambda
idx
:
"<image>"
,
img_idx_to_prompt
=
lambda
idx
:
"<image>"
,
max_model_len
=
8192
,
max_model_len
=
8192
,
max_num_seqs
=
2
,
max_num_seqs
=
2
,
auto_cls
=
AutoModelFor
Vision2Seq
,
auto_cls
=
AutoModelFor
ImageTextToText
,
hf_output_post_proc
=
model_utils
.
idefics3_trunc_hf_output
,
hf_output_post_proc
=
model_utils
.
idefics3_trunc_hf_output
,
),
),
"intern_vl"
:
VLMTestInfo
(
"intern_vl"
:
VLMTestInfo
(
...
@@ -292,10 +315,6 @@ VLM_TEST_SETTINGS = {
...
@@ -292,10 +315,6 @@ VLM_TEST_SETTINGS = {
}),
}),
multi_image_prompt
=
"Image-1: <image>
\n
Image-2: <image>
\n
Describe the two images in short."
,
# noqa: E501
multi_image_prompt
=
"Image-1: <image>
\n
Image-2: <image>
\n
Describe the two images in short."
,
# noqa: E501
max_model_len
=
4096
,
max_model_len
=
4096
,
# NOTE: Mono-InternVL-2B doesn't work with fp16,
# it will result NaN during inference.
# See: https://huggingface.co/OpenGVLab/Mono-InternVL-2B/discussions/9
dtype
=
"bfloat16"
,
use_tokenizer_eos
=
True
,
use_tokenizer_eos
=
True
,
patch_hf_runner
=
model_utils
.
internvl_patch_hf_runner
,
patch_hf_runner
=
model_utils
.
internvl_patch_hf_runner
,
),
),
...
@@ -304,7 +323,7 @@ VLM_TEST_SETTINGS = {
...
@@ -304,7 +323,7 @@ VLM_TEST_SETTINGS = {
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
CUSTOM_INPUTS
),
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
CUSTOM_INPUTS
),
prompt_formatter
=
lambda
img_prompt
:
f
"[INST]
{
img_prompt
}
[/INST]"
,
prompt_formatter
=
lambda
img_prompt
:
f
"[INST]
{
img_prompt
}
[/INST]"
,
max_model_len
=
10240
,
max_model_len
=
10240
,
auto_cls
=
AutoModelFor
Vision2Seq
,
auto_cls
=
AutoModelFor
ImageTextToText
,
vllm_output_post_proc
=
model_utils
.
llava_image_vllm_to_hf_output
,
vllm_output_post_proc
=
model_utils
.
llava_image_vllm_to_hf_output
,
custom_test_opts
=
[
CustomTestOptions
(
custom_test_opts
=
[
CustomTestOptions
(
inputs
=
custom_inputs
.
multi_image_multi_aspect_ratio_inputs
(
inputs
=
custom_inputs
.
multi_image_multi_aspect_ratio_inputs
(
...
@@ -319,9 +338,6 @@ VLM_TEST_SETTINGS = {
...
@@ -319,9 +338,6 @@ VLM_TEST_SETTINGS = {
prompt_formatter
=
lambda
vid_prompt
:
f
"<|im_start|>user
\n
{
vid_prompt
}
<|im_end|>
\n
<|im_start|>assistant
\n
"
,
# noqa: E501
prompt_formatter
=
lambda
vid_prompt
:
f
"<|im_start|>user
\n
{
vid_prompt
}
<|im_end|>
\n
<|im_start|>assistant
\n
"
,
# noqa: E501
num_video_frames
=
16
,
num_video_frames
=
16
,
max_model_len
=
16384
,
max_model_len
=
16384
,
postprocess_inputs
=
model_utils
.
cast_dtype_post_processor
(
"pixel_values_videos"
),
auto_cls
=
AutoModelForVision2Seq
,
auto_cls
=
AutoModelForVision2Seq
,
vllm_output_post_proc
=
model_utils
.
llava_onevision_vllm_to_hf_output
,
vllm_output_post_proc
=
model_utils
.
llava_onevision_vllm_to_hf_output
,
custom_test_opts
=
[
CustomTestOptions
(
custom_test_opts
=
[
CustomTestOptions
(
...
@@ -346,11 +362,8 @@ VLM_TEST_SETTINGS = {
...
@@ -346,11 +362,8 @@ VLM_TEST_SETTINGS = {
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
MULTI_IMAGE
),
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
MULTI_IMAGE
),
prompt_formatter
=
lambda
img_prompt
:
f
"<|start_header_id|>user<|end_header_id|>
\n\n
{
img_prompt
}
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
\n\n
"
,
# noqa: E501
prompt_formatter
=
lambda
img_prompt
:
f
"<|start_header_id|>user<|end_header_id|>
\n\n
{
img_prompt
}
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
\n\n
"
,
# noqa: E501
max_model_len
=
4096
,
max_model_len
=
4096
,
postprocess_inputs
=
model_utils
.
cast_dtype_post_processor
(
"pixel_values"
),
get_stop_token_ids
=
lambda
tok
:
[
128009
],
get_stop_token_ids
=
lambda
tok
:
[
128009
],
auto_cls
=
AutoModelFor
Vision2Seq
,
auto_cls
=
AutoModelFor
ImageTextToText
,
vllm_output_post_proc
=
model_utils
.
mantis_vllm_to_hf_output
,
vllm_output_post_proc
=
model_utils
.
mantis_vllm_to_hf_output
,
patch_hf_runner
=
model_utils
.
mantis_patch_hf_runner
,
patch_hf_runner
=
model_utils
.
mantis_patch_hf_runner
,
marks
=
[
marks
=
[
...
@@ -368,8 +381,8 @@ VLM_TEST_SETTINGS = {
...
@@ -368,8 +381,8 @@ VLM_TEST_SETTINGS = {
max_model_len
=
4096
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
max_num_seqs
=
2
,
get_stop_token_ids
=
lambda
tok
:
[
tok
.
eos_id
,
tok
.
eot_id
],
get_stop_token_ids
=
lambda
tok
:
[
tok
.
eos_id
,
tok
.
eot_id
],
postprocess_inputs
=
model_utils
.
wrap_inputs_post_processor
,
hf_output_post_proc
=
model_utils
.
minicpmv_trunc_hf_output
,
hf_output_post_proc
=
model_utils
.
minicpmv_trunc_hf_output
,
patch_hf_runner
=
model_utils
.
minicpmv_25_patch_hf_runner
,
),
),
"minicpmo_26"
:
VLMTestInfo
(
"minicpmo_26"
:
VLMTestInfo
(
models
=
[
"openbmb/MiniCPM-o-2_6"
],
models
=
[
"openbmb/MiniCPM-o-2_6"
],
...
@@ -379,11 +392,8 @@ VLM_TEST_SETTINGS = {
...
@@ -379,11 +392,8 @@ VLM_TEST_SETTINGS = {
max_model_len
=
4096
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
max_num_seqs
=
2
,
get_stop_token_ids
=
lambda
tok
:
tok
.
convert_tokens_to_ids
([
'<|im_end|>'
,
'<|endoftext|>'
]),
# noqa: E501
get_stop_token_ids
=
lambda
tok
:
tok
.
convert_tokens_to_ids
([
'<|im_end|>'
,
'<|endoftext|>'
]),
# noqa: E501
postprocess_inputs
=
model_utils
.
ignore_inputs_post_processor
(
"image_sizes"
),
hf_output_post_proc
=
model_utils
.
minicpmv_trunc_hf_output
,
hf_output_post_proc
=
model_utils
.
minicpmv_trunc_hf_output
,
patch_hf_runner
=
model_utils
.
minicpmo_patch_hf_runner
patch_hf_runner
=
model_utils
.
minicpmo_
26_
patch_hf_runner
,
),
),
"minicpmv_26"
:
VLMTestInfo
(
"minicpmv_26"
:
VLMTestInfo
(
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"openbmb/MiniCPM-V-2_6"
)],
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"openbmb/MiniCPM-V-2_6"
)],
...
@@ -393,10 +403,8 @@ VLM_TEST_SETTINGS = {
...
@@ -393,10 +403,8 @@ VLM_TEST_SETTINGS = {
max_model_len
=
4096
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
max_num_seqs
=
2
,
get_stop_token_ids
=
lambda
tok
:
tok
.
convert_tokens_to_ids
([
'<|im_end|>'
,
'<|endoftext|>'
]),
# noqa: E501
get_stop_token_ids
=
lambda
tok
:
tok
.
convert_tokens_to_ids
([
'<|im_end|>'
,
'<|endoftext|>'
]),
# noqa: E501
postprocess_inputs
=
model_utils
.
ignore_inputs_post_processor
(
"image_sizes"
),
hf_output_post_proc
=
model_utils
.
minicpmv_trunc_hf_output
,
hf_output_post_proc
=
model_utils
.
minicpmv_trunc_hf_output
,
patch_hf_runner
=
model_utils
.
minicpmv_26_patch_hf_runner
,
),
),
"molmo"
:
VLMTestInfo
(
"molmo"
:
VLMTestInfo
(
models
=
[
"allenai/Molmo-7B-D-0924"
],
models
=
[
"allenai/Molmo-7B-D-0924"
],
...
@@ -405,7 +413,6 @@ VLM_TEST_SETTINGS = {
...
@@ -405,7 +413,6 @@ VLM_TEST_SETTINGS = {
max_model_len
=
4096
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
max_num_seqs
=
2
,
patch_hf_runner
=
model_utils
.
molmo_patch_hf_runner
,
patch_hf_runner
=
model_utils
.
molmo_patch_hf_runner
,
postprocess_inputs
=
model_utils
.
molmo_post_processor
,
),
),
# Tests for phi3v currently live in another file because of a bug in
# Tests for phi3v currently live in another file because of a bug in
# transformers. Once this issue is fixed, we can enable them here instead.
# transformers. Once this issue is fixed, we can enable them here instead.
...
@@ -431,7 +438,7 @@ VLM_TEST_SETTINGS = {
...
@@ -431,7 +438,7 @@ VLM_TEST_SETTINGS = {
img_idx_to_prompt
=
lambda
idx
:
"[IMG]"
,
img_idx_to_prompt
=
lambda
idx
:
"[IMG]"
,
max_model_len
=
8192
,
max_model_len
=
8192
,
max_num_seqs
=
2
,
max_num_seqs
=
2
,
auto_cls
=
AutoModelFor
Vision2Seq
,
auto_cls
=
AutoModelFor
ImageTextToText
,
marks
=
[
large_gpu_mark
(
min_gb
=
48
)],
marks
=
[
large_gpu_mark
(
min_gb
=
48
)],
),
),
"qwen_vl"
:
VLMTestInfo
(
"qwen_vl"
:
VLMTestInfo
(
...
@@ -449,10 +456,7 @@ VLM_TEST_SETTINGS = {
...
@@ -449,10 +456,7 @@ VLM_TEST_SETTINGS = {
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"facebook/chameleon-7b"
)],
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"facebook/chameleon-7b"
)],
prompt_formatter
=
lambda
img_prompt
:
f
"USER:
{
img_prompt
}
\n
ASSISTANT:"
,
prompt_formatter
=
lambda
img_prompt
:
f
"USER:
{
img_prompt
}
\n
ASSISTANT:"
,
max_model_len
=
4096
,
max_model_len
=
4096
,
auto_cls
=
AutoModelForVision2Seq
,
auto_cls
=
AutoModelForImageTextToText
,
postprocess_inputs
=
model_utils
.
cast_dtype_post_processor
(
"pixel_values"
),
vllm_output_post_proc
=
lambda
vllm_output
,
model
:
vllm_output
[:
2
],
vllm_output_post_proc
=
lambda
vllm_output
,
model
:
vllm_output
[:
2
],
hf_output_post_proc
=
lambda
hf_output
,
model
:
hf_output
[:
2
],
hf_output_post_proc
=
lambda
hf_output
,
model
:
hf_output
[:
2
],
comparator
=
check_outputs_equal
,
comparator
=
check_outputs_equal
,
...
@@ -463,7 +467,7 @@ VLM_TEST_SETTINGS = {
...
@@ -463,7 +467,7 @@ VLM_TEST_SETTINGS = {
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/llava-1.5-7b-hf"
)],
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/llava-1.5-7b-hf"
)],
prompt_formatter
=
lambda
img_prompt
:
f
"USER:
{
img_prompt
}
\n
ASSISTANT:"
,
prompt_formatter
=
lambda
img_prompt
:
f
"USER:
{
img_prompt
}
\n
ASSISTANT:"
,
max_model_len
=
4096
,
max_model_len
=
4096
,
auto_cls
=
AutoModelFor
Vision2Seq
,
auto_cls
=
AutoModelFor
ImageTextToText
,
vllm_output_post_proc
=
model_utils
.
llava_image_vllm_to_hf_output
,
vllm_output_post_proc
=
model_utils
.
llava_image_vllm_to_hf_output
,
marks
=
multi_gpu_marks
(
num_gpus
=
2
),
marks
=
multi_gpu_marks
(
num_gpus
=
2
),
**
COMMON_BROADCAST_SETTINGS
# type: ignore
**
COMMON_BROADCAST_SETTINGS
# type: ignore
...
@@ -472,7 +476,7 @@ VLM_TEST_SETTINGS = {
...
@@ -472,7 +476,7 @@ VLM_TEST_SETTINGS = {
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/llava-v1.6-mistral-7b-hf"
)],
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/llava-v1.6-mistral-7b-hf"
)],
prompt_formatter
=
lambda
img_prompt
:
f
"[INST]
{
img_prompt
}
[/INST]"
,
prompt_formatter
=
lambda
img_prompt
:
f
"[INST]
{
img_prompt
}
[/INST]"
,
max_model_len
=
10240
,
max_model_len
=
10240
,
auto_cls
=
AutoModelFor
Vision2Seq
,
auto_cls
=
AutoModelFor
ImageTextToText
,
vllm_output_post_proc
=
model_utils
.
llava_image_vllm_to_hf_output
,
vllm_output_post_proc
=
model_utils
.
llava_image_vllm_to_hf_output
,
marks
=
multi_gpu_marks
(
num_gpus
=
2
),
marks
=
multi_gpu_marks
(
num_gpus
=
2
),
**
COMMON_BROADCAST_SETTINGS
# type: ignore
**
COMMON_BROADCAST_SETTINGS
# type: ignore
...
@@ -497,9 +501,6 @@ VLM_TEST_SETTINGS = {
...
@@ -497,9 +501,6 @@ VLM_TEST_SETTINGS = {
test_type
=
VLMTestType
.
CUSTOM_INPUTS
,
test_type
=
VLMTestType
.
CUSTOM_INPUTS
,
max_model_len
=
16384
,
max_model_len
=
16384
,
max_num_seqs
=
2
,
max_num_seqs
=
2
,
postprocess_inputs
=
model_utils
.
cast_dtype_post_processor
(
"pixel_values"
),
auto_cls
=
AutoModelForVision2Seq
,
auto_cls
=
AutoModelForVision2Seq
,
vllm_output_post_proc
=
model_utils
.
llava_onevision_vllm_to_hf_output
,
vllm_output_post_proc
=
model_utils
.
llava_onevision_vllm_to_hf_output
,
custom_test_opts
=
[
CustomTestOptions
(
custom_test_opts
=
[
CustomTestOptions
(
...
@@ -509,6 +510,19 @@ VLM_TEST_SETTINGS = {
...
@@ -509,6 +510,19 @@ VLM_TEST_SETTINGS = {
limit_mm_per_prompt
=
{
"image"
:
4
},
limit_mm_per_prompt
=
{
"image"
:
4
},
)],
)],
),
),
# regression test for https://github.com/vllm-project/vllm/issues/15122
"qwen2_5_vl-windows-attention"
:
VLMTestInfo
(
models
=
[
"Qwen/Qwen2.5-VL-3B-Instruct"
],
test_type
=
VLMTestType
.
CUSTOM_INPUTS
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
auto_cls
=
AutoModelForVision2Seq
,
vllm_output_post_proc
=
model_utils
.
qwen2_vllm_to_hf_output
,
custom_test_opts
=
[
CustomTestOptions
(
inputs
=
custom_inputs
.
windows_attention_image_qwen2_5_vl
(),
limit_mm_per_prompt
=
{
"image"
:
1
},
)],
),
}
}
# yapf: enable
# yapf: enable
...
@@ -560,13 +574,15 @@ VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2)
...
@@ -560,13 +574,15 @@ VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2)
get_parametrized_options
(
get_parametrized_options
(
VLM_TEST_SETTINGS
,
VLM_TEST_SETTINGS
,
test_type
=
VLMTestType
.
IMAGE
,
test_type
=
VLMTestType
.
IMAGE
,
fork
_new_process_for_each_test
=
False
,
create
_new_process_for_each_test
=
False
,
))
))
def
test_single_image_models
(
tmp_path
:
PosixPath
,
model_type
:
str
,
def
test_single_image_models
(
tmp_path
:
PosixPath
,
model_type
:
str
,
test_case
:
ExpandableVLMTestArgs
,
test_case
:
ExpandableVLMTestArgs
,
hf_runner
:
Type
[
HfRunner
],
hf_runner
:
type
[
HfRunner
],
vllm_runner
:
Type
[
VllmRunner
],
vllm_runner
:
type
[
VllmRunner
],
image_assets
:
_ImageAssets
):
image_assets
:
_ImageAssets
,
monkeypatch
):
if
model_type
in
REQUIRES_V0_MODELS
:
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"0"
)
model_test_info
=
VLM_TEST_SETTINGS
[
model_type
]
model_test_info
=
VLM_TEST_SETTINGS
[
model_type
]
runners
.
run_single_image_test
(
runners
.
run_single_image_test
(
tmp_path
=
tmp_path
,
tmp_path
=
tmp_path
,
...
@@ -583,13 +599,15 @@ def test_single_image_models(tmp_path: PosixPath, model_type: str,
...
@@ -583,13 +599,15 @@ def test_single_image_models(tmp_path: PosixPath, model_type: str,
get_parametrized_options
(
get_parametrized_options
(
VLM_TEST_SETTINGS
,
VLM_TEST_SETTINGS
,
test_type
=
VLMTestType
.
MULTI_IMAGE
,
test_type
=
VLMTestType
.
MULTI_IMAGE
,
fork
_new_process_for_each_test
=
False
,
create
_new_process_for_each_test
=
False
,
))
))
def
test_multi_image_models
(
tmp_path
:
PosixPath
,
model_type
:
str
,
def
test_multi_image_models
(
tmp_path
:
PosixPath
,
model_type
:
str
,
test_case
:
ExpandableVLMTestArgs
,
test_case
:
ExpandableVLMTestArgs
,
hf_runner
:
Type
[
HfRunner
],
hf_runner
:
type
[
HfRunner
],
vllm_runner
:
Type
[
VllmRunner
],
vllm_runner
:
type
[
VllmRunner
],
image_assets
:
_ImageAssets
):
image_assets
:
_ImageAssets
,
monkeypatch
):
if
model_type
in
REQUIRES_V0_MODELS
:
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"0"
)
model_test_info
=
VLM_TEST_SETTINGS
[
model_type
]
model_test_info
=
VLM_TEST_SETTINGS
[
model_type
]
runners
.
run_multi_image_test
(
runners
.
run_multi_image_test
(
tmp_path
=
tmp_path
,
tmp_path
=
tmp_path
,
...
@@ -606,13 +624,15 @@ def test_multi_image_models(tmp_path: PosixPath, model_type: str,
...
@@ -606,13 +624,15 @@ def test_multi_image_models(tmp_path: PosixPath, model_type: str,
get_parametrized_options
(
get_parametrized_options
(
VLM_TEST_SETTINGS
,
VLM_TEST_SETTINGS
,
test_type
=
VLMTestType
.
EMBEDDING
,
test_type
=
VLMTestType
.
EMBEDDING
,
fork
_new_process_for_each_test
=
False
,
create
_new_process_for_each_test
=
False
,
))
))
def
test_image_embedding_models
(
model_type
:
str
,
def
test_image_embedding_models
(
model_type
:
str
,
test_case
:
ExpandableVLMTestArgs
,
test_case
:
ExpandableVLMTestArgs
,
hf_runner
:
Type
[
HfRunner
],
hf_runner
:
type
[
HfRunner
],
vllm_runner
:
Type
[
VllmRunner
],
vllm_runner
:
type
[
VllmRunner
],
image_assets
:
_ImageAssets
):
image_assets
:
_ImageAssets
,
monkeypatch
):
if
model_type
in
REQUIRES_V0_MODELS
:
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"0"
)
model_test_info
=
VLM_TEST_SETTINGS
[
model_type
]
model_test_info
=
VLM_TEST_SETTINGS
[
model_type
]
runners
.
run_embedding_test
(
runners
.
run_embedding_test
(
model_test_info
=
model_test_info
,
model_test_info
=
model_test_info
,
...
@@ -628,11 +648,13 @@ def test_image_embedding_models(model_type: str,
...
@@ -628,11 +648,13 @@ def test_image_embedding_models(model_type: str,
get_parametrized_options
(
get_parametrized_options
(
VLM_TEST_SETTINGS
,
VLM_TEST_SETTINGS
,
test_type
=
VLMTestType
.
VIDEO
,
test_type
=
VLMTestType
.
VIDEO
,
fork
_new_process_for_each_test
=
False
,
create
_new_process_for_each_test
=
False
,
))
))
def
test_video_models
(
model_type
:
str
,
test_case
:
ExpandableVLMTestArgs
,
def
test_video_models
(
model_type
:
str
,
test_case
:
ExpandableVLMTestArgs
,
hf_runner
:
Type
[
HfRunner
],
vllm_runner
:
Type
[
VllmRunner
],
hf_runner
:
type
[
HfRunner
],
vllm_runner
:
type
[
VllmRunner
],
video_assets
:
_VideoAssets
):
video_assets
:
_VideoAssets
,
monkeypatch
):
if
model_type
in
REQUIRES_V0_MODELS
:
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"0"
)
model_test_info
=
VLM_TEST_SETTINGS
[
model_type
]
model_test_info
=
VLM_TEST_SETTINGS
[
model_type
]
runners
.
run_video_test
(
runners
.
run_video_test
(
model_test_info
=
model_test_info
,
model_test_info
=
model_test_info
,
...
@@ -648,14 +670,17 @@ def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
...
@@ -648,14 +670,17 @@ def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
get_parametrized_options
(
get_parametrized_options
(
VLM_TEST_SETTINGS
,
VLM_TEST_SETTINGS
,
test_type
=
VLMTestType
.
CUSTOM_INPUTS
,
test_type
=
VLMTestType
.
CUSTOM_INPUTS
,
fork
_new_process_for_each_test
=
False
,
create
_new_process_for_each_test
=
False
,
))
))
def
test_custom_inputs_models
(
def
test_custom_inputs_models
(
model_type
:
str
,
model_type
:
str
,
test_case
:
ExpandableVLMTestArgs
,
test_case
:
ExpandableVLMTestArgs
,
hf_runner
:
Type
[
HfRunner
],
hf_runner
:
type
[
HfRunner
],
vllm_runner
:
Type
[
VllmRunner
],
vllm_runner
:
type
[
VllmRunner
],
monkeypatch
,
):
):
if
model_type
in
REQUIRES_V0_MODELS
:
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"0"
)
model_test_info
=
VLM_TEST_SETTINGS
[
model_type
]
model_test_info
=
VLM_TEST_SETTINGS
[
model_type
]
runners
.
run_custom_inputs_test
(
runners
.
run_custom_inputs_test
(
model_test_info
=
model_test_info
,
model_test_info
=
model_test_info
,
...
@@ -671,14 +696,16 @@ def test_custom_inputs_models(
...
@@ -671,14 +696,16 @@ def test_custom_inputs_models(
get_parametrized_options
(
get_parametrized_options
(
VLM_TEST_SETTINGS
,
VLM_TEST_SETTINGS
,
test_type
=
VLMTestType
.
IMAGE
,
test_type
=
VLMTestType
.
IMAGE
,
fork
_new_process_for_each_test
=
True
,
create
_new_process_for_each_test
=
True
,
))
))
@
fork
_new_process_for_each_test
@
create
_new_process_for_each_test
()
def
test_single_image_models_heavy
(
tmp_path
:
PosixPath
,
model_type
:
str
,
def
test_single_image_models_heavy
(
tmp_path
:
PosixPath
,
model_type
:
str
,
test_case
:
ExpandableVLMTestArgs
,
test_case
:
ExpandableVLMTestArgs
,
hf_runner
:
Type
[
HfRunner
],
hf_runner
:
type
[
HfRunner
],
vllm_runner
:
Type
[
VllmRunner
],
vllm_runner
:
type
[
VllmRunner
],
image_assets
:
_ImageAssets
):
image_assets
:
_ImageAssets
,
monkeypatch
):
if
model_type
in
REQUIRES_V0_MODELS
:
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"0"
)
model_test_info
=
VLM_TEST_SETTINGS
[
model_type
]
model_test_info
=
VLM_TEST_SETTINGS
[
model_type
]
runners
.
run_single_image_test
(
runners
.
run_single_image_test
(
tmp_path
=
tmp_path
,
tmp_path
=
tmp_path
,
...
@@ -695,14 +722,16 @@ def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
...
@@ -695,14 +722,16 @@ def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
get_parametrized_options
(
get_parametrized_options
(
VLM_TEST_SETTINGS
,
VLM_TEST_SETTINGS
,
test_type
=
VLMTestType
.
MULTI_IMAGE
,
test_type
=
VLMTestType
.
MULTI_IMAGE
,
fork
_new_process_for_each_test
=
True
,
create
_new_process_for_each_test
=
True
,
))
))
@
fork
_new_process_for_each_test
@
create
_new_process_for_each_test
()
def
test_multi_image_models_heavy
(
tmp_path
:
PosixPath
,
model_type
:
str
,
def
test_multi_image_models_heavy
(
tmp_path
:
PosixPath
,
model_type
:
str
,
test_case
:
ExpandableVLMTestArgs
,
test_case
:
ExpandableVLMTestArgs
,
hf_runner
:
Type
[
HfRunner
],
hf_runner
:
type
[
HfRunner
],
vllm_runner
:
Type
[
VllmRunner
],
vllm_runner
:
type
[
VllmRunner
],
image_assets
:
_ImageAssets
):
image_assets
:
_ImageAssets
,
monkeypatch
):
if
model_type
in
REQUIRES_V0_MODELS
:
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"0"
)
model_test_info
=
VLM_TEST_SETTINGS
[
model_type
]
model_test_info
=
VLM_TEST_SETTINGS
[
model_type
]
runners
.
run_multi_image_test
(
runners
.
run_multi_image_test
(
tmp_path
=
tmp_path
,
tmp_path
=
tmp_path
,
...
@@ -719,14 +748,16 @@ def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
...
@@ -719,14 +748,16 @@ def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
get_parametrized_options
(
get_parametrized_options
(
VLM_TEST_SETTINGS
,
VLM_TEST_SETTINGS
,
test_type
=
VLMTestType
.
EMBEDDING
,
test_type
=
VLMTestType
.
EMBEDDING
,
fork
_new_process_for_each_test
=
True
,
create
_new_process_for_each_test
=
True
,
))
))
@
fork
_new_process_for_each_test
@
create
_new_process_for_each_test
()
def
test_image_embedding_models_heavy
(
model_type
:
str
,
def
test_image_embedding_models_heavy
(
model_type
:
str
,
test_case
:
ExpandableVLMTestArgs
,
test_case
:
ExpandableVLMTestArgs
,
hf_runner
:
Type
[
HfRunner
],
hf_runner
:
type
[
HfRunner
],
vllm_runner
:
Type
[
VllmRunner
],
vllm_runner
:
type
[
VllmRunner
],
image_assets
:
_ImageAssets
):
image_assets
:
_ImageAssets
,
monkeypatch
):
if
model_type
in
REQUIRES_V0_MODELS
:
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"0"
)
model_test_info
=
VLM_TEST_SETTINGS
[
model_type
]
model_test_info
=
VLM_TEST_SETTINGS
[
model_type
]
runners
.
run_embedding_test
(
runners
.
run_embedding_test
(
model_test_info
=
model_test_info
,
model_test_info
=
model_test_info
,
...
@@ -742,12 +773,14 @@ def test_image_embedding_models_heavy(model_type: str,
...
@@ -742,12 +773,14 @@ def test_image_embedding_models_heavy(model_type: str,
get_parametrized_options
(
get_parametrized_options
(
VLM_TEST_SETTINGS
,
VLM_TEST_SETTINGS
,
test_type
=
VLMTestType
.
VIDEO
,
test_type
=
VLMTestType
.
VIDEO
,
fork
_new_process_for_each_test
=
True
,
create
_new_process_for_each_test
=
True
,
))
))
def
test_video_models_heavy
(
model_type
:
str
,
test_case
:
ExpandableVLMTestArgs
,
def
test_video_models_heavy
(
model_type
:
str
,
test_case
:
ExpandableVLMTestArgs
,
hf_runner
:
Type
[
HfRunner
],
hf_runner
:
type
[
HfRunner
],
vllm_runner
:
Type
[
VllmRunner
],
vllm_runner
:
type
[
VllmRunner
],
video_assets
:
_VideoAssets
):
video_assets
:
_VideoAssets
,
monkeypatch
):
if
model_type
in
REQUIRES_V0_MODELS
:
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"0"
)
model_test_info
=
VLM_TEST_SETTINGS
[
model_type
]
model_test_info
=
VLM_TEST_SETTINGS
[
model_type
]
runners
.
run_video_test
(
runners
.
run_video_test
(
model_test_info
=
model_test_info
,
model_test_info
=
model_test_info
,
...
@@ -763,15 +796,18 @@ def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
...
@@ -763,15 +796,18 @@ def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
get_parametrized_options
(
get_parametrized_options
(
VLM_TEST_SETTINGS
,
VLM_TEST_SETTINGS
,
test_type
=
VLMTestType
.
CUSTOM_INPUTS
,
test_type
=
VLMTestType
.
CUSTOM_INPUTS
,
fork
_new_process_for_each_test
=
True
,
create
_new_process_for_each_test
=
True
,
))
))
@
fork
_new_process_for_each_test
@
create
_new_process_for_each_test
()
def
test_custom_inputs_models_heavy
(
def
test_custom_inputs_models_heavy
(
model_type
:
str
,
model_type
:
str
,
test_case
:
ExpandableVLMTestArgs
,
test_case
:
ExpandableVLMTestArgs
,
hf_runner
:
Type
[
HfRunner
],
hf_runner
:
type
[
HfRunner
],
vllm_runner
:
Type
[
VllmRunner
],
vllm_runner
:
type
[
VllmRunner
],
monkeypatch
,
):
):
if
model_type
in
REQUIRES_V0_MODELS
:
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"0"
)
model_test_info
=
VLM_TEST_SETTINGS
[
model_type
]
model_test_info
=
VLM_TEST_SETTINGS
[
model_type
]
runners
.
run_custom_inputs_test
(
runners
.
run_custom_inputs_test
(
model_test_info
=
model_test_info
,
model_test_info
=
model_test_info
,
...
...
tests/models/decoder_only/vision_language/test_phi3v.py
View file @
469e903b
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
import
os
import
os
import
re
import
re
from
typing
import
List
,
Optional
,
Tuple
,
Type
from
typing
import
Optional
import
pytest
import
pytest
from
transformers
import
AutoTokenizer
from
transformers
import
AutoTokenizer
...
@@ -26,7 +26,7 @@ HF_MULTIIMAGE_IMAGE_PROMPT = "<|user|>\n<|image_1|>\n<|image_2|>\nDescribe these
...
@@ -26,7 +26,7 @@ HF_MULTIIMAGE_IMAGE_PROMPT = "<|user|>\n<|image_1|>\n<|image_2|>\nDescribe these
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3.5-vision-instruct"
)]
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3.5-vision-instruct"
)]
def
vllm_to_hf_output
(
vllm_output
:
T
uple
[
L
ist
[
int
],
str
,
def
vllm_to_hf_output
(
vllm_output
:
t
uple
[
l
ist
[
int
],
str
,
Optional
[
SampleLogprobs
]],
Optional
[
SampleLogprobs
]],
model
:
str
):
model
:
str
):
"""Sanitize vllm output to be comparable with hf output."""
"""Sanitize vllm output to be comparable with hf output."""
...
@@ -56,9 +56,9 @@ if current_platform.is_rocm():
...
@@ -56,9 +56,9 @@ if current_platform.is_rocm():
def
run_test
(
def
run_test
(
hf_runner
:
T
ype
[
HfRunner
],
hf_runner
:
t
ype
[
HfRunner
],
vllm_runner
:
T
ype
[
VllmRunner
],
vllm_runner
:
t
ype
[
VllmRunner
],
inputs
:
L
ist
[
T
uple
[
L
ist
[
str
],
PromptImageInput
]],
inputs
:
l
ist
[
t
uple
[
l
ist
[
str
],
PromptImageInput
]],
model
:
str
,
model
:
str
,
*
,
*
,
dtype
:
str
,
dtype
:
str
,
...
...
tests/models/decoder_only/vision_language/test_phi4mm.py
0 → 100644
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
import
os
import
re
from
typing
import
Optional
import
pytest
from
huggingface_hub
import
snapshot_download
from
transformers
import
AutoTokenizer
from
vllm.lora.request
import
LoRARequest
from
vllm.multimodal.image
import
rescale_image_size
from
vllm.platforms
import
current_platform
from
vllm.sequence
import
SampleLogprobs
from
....conftest
import
IMAGE_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
from
....utils
import
large_gpu_test
from
...utils
import
check_logprobs_close
# Single-image prompts, keyed by image asset name, in the Phi chat format
# (<|user|> ... <|end|> followed by the <|assistant|> tag).
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign":
    "<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n",  # noqa: E501
    "cherry_blossom":
    "<|user|>\n<|image_1|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n",  # noqa: E501
})
# Prompt that references two image placeholders at once (multi-image test).
HF_MULTIIMAGE_IMAGE_PROMPT = "<|user|>\n<|image_1|>\n<|image_2|>\nDescribe these images.<|end|>\n<|assistant|>\n"  # noqa: E501

# NOTE: this runs at import time, i.e. during test collection.
# NOTE(review): presumably this hits the network unless the snapshot is
# already cached locally - confirm for offline CI.
model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct")
# Since the vision-lora and speech-lora co-exist with the base model,
# we have to manually specify the path of the lora weights.
vision_lora_path = os.path.join(model_path, "vision-lora")
# Values for the "model" pytest parameter used by the tests below.
models = [model_path]
def vllm_to_hf_output(vllm_output: tuple[list[int], str,
                                         Optional[SampleLogprobs]],
                      model: str):
    """Convert a vLLM ``(ids, text, logprobs)`` triple to the HF-style form.

    Image placeholder tags (``<|image_N|>``) are stripped from the generated
    text, the single leading space is dropped, and the remaining text is
    re-tokenized with the tokenizer loaded for ``model``. The first token id
    of that encoding is asserted to be ``1`` and discarded.

    Args:
        vllm_output: ``(token_ids, text, logprobs)``; the token ids are
            ignored because they are recomputed from the cleaned text.
        model: Name or path passed to ``AutoTokenizer.from_pretrained``.

    Returns:
        ``(hf_output_ids, hf_output_str, out_logprobs)`` where the string has
        ``"<|end|><|endoftext|>"`` appended and the logprobs are passed
        through unchanged.

    Raises:
        AssertionError: If the cleaned text does not start with a space or
            the encoding does not start with token id ``1``.
    """
    _vllm_ids, generated_text, out_logprobs = vllm_output

    # Drop every run of image placeholder tags before comparing.
    text = re.sub(r"(<\|image_\d+\|>)+", "", generated_text)
    assert text[0] == " "
    text = text[1:]

    token_ids = AutoTokenizer.from_pretrained(model).encode(text)
    assert token_ids[0] == 1

    return token_ids[1:], text + "<|end|><|endoftext|>", out_logprobs
# dtype used for the "dtype" pytest parameter of every test in this module.
target_dtype = "half"

# ROCm Triton FA can run into shared memory issues with these models,
# use other backends in the meantime
# FIXME (mattwong, gshtrasb, hongxiayan)
# NOTE: this mutates the process environment at import time, so it also
# affects anything else that runs in the same process afterwards.
if current_platform.is_rocm():
    os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    inputs: list[tuple[list[str], PromptImageInput]],
    model: str,
    *,
    max_model_len: int,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    mm_limit: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    """Check that vLLM and HF produce close greedy logprobs for ``inputs``.

    Each entry of ``inputs`` is a ``(prompts, images)`` pair. Every pair is
    generated once with vLLM (with the vision LoRA registered) and once with
    the HF runner, and the two results are compared case by case with
    ``check_logprobs_close``.

    All the image fixtures for the test are from IMAGE_ASSETS.
    For the huggingface runner, PIL images are provided as input.
    For the vllm runner, MultiModalDataDict objects and the corresponding
    MultiModalConfig are provided as input.
    Note, the text input is also adjusted to abide by the vllm contract.
    The text output is sanitized to be able to compare with hf.

    Args:
        hf_runner: Context-manager factory for the HF reference model.
        vllm_runner: Context-manager factory for the vLLM model.
        inputs: ``(prompts, images)`` cases to run.
        model: Model name or path given to both runners.
        max_model_len: Forwarded to the vLLM runner.
        dtype: Forwarded to both runners.
        max_tokens: Maximum number of tokens generated per prompt.
        num_logprobs: Number of logprobs requested per token.
        mm_limit: Per-prompt image limit (``limit_mm_per_prompt["image"]``).
        tensor_parallel_size: Forwarded to the vLLM runner.
        distributed_executor_backend: Forwarded to the vLLM runner.
    """
    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).

    # max_model_len should be greater than image_feature_size
    vllm_kwargs = dict(
        task="generate",
        max_model_len=max_model_len,
        max_num_seqs=2,
        dtype=dtype,
        limit_mm_per_prompt={"image": mm_limit},
        tensor_parallel_size=tensor_parallel_size,
        distributed_executor_backend=distributed_executor_backend,
        enable_lora=True,
        max_lora_rank=320,
        gpu_memory_utilization=0.8,  # set to 0.8 to avoid OOM in CI
        enforce_eager=True,
    )
    vllm_outputs_per_case = []
    with vllm_runner(model, **vllm_kwargs) as vllm_model:
        # Register the vision LoRA (module-level path) before generating.
        vision_lora = LoRARequest("vision", 1, vision_lora_path)
        vllm_model.model.llm_engine.add_lora(lora_request=vision_lora)
        for case_prompts, case_images in inputs:
            vllm_outputs_per_case.append(
                vllm_model.generate_greedy_logprobs(
                    case_prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=case_images,
                ))

    # The HF reference runs with eager attention. The original comment says
    # "phi3_v didn't work with flash_attn"; presumably inherited from the
    # Phi-3-vision test and equally applicable here - TODO confirm.
    hf_outputs_per_case = []
    with hf_runner(model,
                   dtype=dtype,
                   model_kwargs={"_attn_implementation": "eager"}) as hf_model:
        eos_token_id = hf_model.processor.tokenizer.eos_token_id
        for case_prompts, case_images in inputs:
            hf_outputs_per_case.append(
                hf_model.generate_greedy_logprobs_limit(
                    case_prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=case_images,
                    eos_token_id=eos_token_id,
                    num_logits_to_keep=0,
                ))

    # Compare case by case; HF is the reference ("outputs_0").
    for hf_case, vllm_case in zip(hf_outputs_per_case, vllm_outputs_per_case):
        check_logprobs_close(
            outputs_0_lst=hf_case,
            outputs_1_lst=vllm_case,
            name_0="hf",
            name_1="vllm",
        )
# Since we use _attn_implementation="eager" for hf_runner, there is more
# significant numerical difference. The basic `logprobs=5` fails to pass.
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.7, 0.75, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_model_len", [4096])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
                dtype: str, max_model_len: int, max_tokens: int,
                num_logprobs: int) -> None:
    """Single-image comparison of vLLM against HF over several image scales.

    For every (image asset, prompt) pair, one case is built that repeats the
    prompt once per entry of ``size_factors`` and pairs it with the image
    rescaled by that factor. The cases are then handed to ``run_test`` with
    an image limit of 1 and a tensor-parallel size of 1.
    """
    pil_images = [asset.pil_image for asset in image_assets]

    inputs_per_image = []
    for pil_image, prompt in zip(pil_images, HF_IMAGE_PROMPTS):
        # One prompt copy and one rescaled image per size factor.
        repeated_prompts = [prompt] * len(size_factors)
        rescaled = [
            rescale_image_size(pil_image, factor) for factor in size_factors
        ]
        inputs_per_image.append((repeated_prompts, rescaled))

    run_test(
        hf_runner,
        vllm_runner,
        inputs_per_image,
        model,
        dtype=dtype,
        max_model_len=max_model_len,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=1,
        tensor_parallel_size=1,
    )
@large_gpu_test(min_gb=48)
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        # [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_model_len", [10000])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
@pytest.mark.xfail(
    reason="Phi-4-MM multi-image inference is divergent with hf model.")
def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
                             size_factors, dtype: str, max_model_len: int,
                             max_tokens: int, num_logprobs: int) -> None:
    """Multi-image comparison of vLLM against HF (currently expected to fail).

    A single case is built: the multi-image prompt is repeated once per entry
    of ``size_factors``, and each repetition is paired with the full list of
    image assets rescaled by that factor. The case is handed to ``run_test``
    with an image limit of 2 and a tensor-parallel size of 1.
    """
    pil_images = [asset.pil_image for asset in image_assets]

    repeated_prompts = [HF_MULTIIMAGE_IMAGE_PROMPT] * len(size_factors)
    # For each factor, every image asset rescaled by that factor.
    images_per_prompt = []
    for factor in size_factors:
        images_per_prompt.append(
            [rescale_image_size(pil_image, factor) for pil_image in pil_images])
    inputs_per_case = [(repeated_prompts, images_per_prompt)]

    run_test(
        hf_runner,
        vllm_runner,
        inputs_per_case,
        model,
        dtype=dtype,
        max_model_len=max_model_len,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=2,
        tensor_parallel_size=1,
    )
tests/models/decoder_only/vision_language/test_pixtral.py
View file @
469e903b
...
@@ -4,9 +4,8 @@
...
@@ -4,9 +4,8 @@
Run `pytest tests/models/test_mistral.py`.
Run `pytest tests/models/test_mistral.py`.
"""
"""
import
json
import
json
import
uuid
from
dataclasses
import
asdict
from
dataclasses
import
asdict
from
typing
import
TYPE_CHECKING
,
Any
,
Dict
,
List
,
Optional
,
Tuple
from
typing
import
TYPE_CHECKING
,
Any
,
Optional
import
os
import
os
import
pytest
import
pytest
...
@@ -17,8 +16,7 @@ from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
...
@@ -17,8 +16,7 @@ from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from
mistral_common.tokens.tokenizers.multimodal
import
image_from_chunk
from
mistral_common.tokens.tokenizers.multimodal
import
image_from_chunk
from
transformers
import
AutoProcessor
from
transformers
import
AutoProcessor
from
vllm
import
(
EngineArgs
,
LLMEngine
,
RequestOutput
,
SamplingParams
,
from
vllm
import
RequestOutput
,
SamplingParams
,
TextPrompt
,
TokensPrompt
TextPrompt
,
TokensPrompt
)
from
vllm.multimodal
import
MultiModalDataBuiltins
from
vllm.multimodal
import
MultiModalDataBuiltins
from
vllm.multimodal.inputs
import
PlaceholderRange
from
vllm.multimodal.inputs
import
PlaceholderRange
from
vllm.sequence
import
Logprob
,
SampleLogprobs
from
vllm.sequence
import
Logprob
,
SampleLogprobs
...
@@ -30,8 +28,11 @@ from ....utils import models_path_prefix
...
@@ -30,8 +28,11 @@ from ....utils import models_path_prefix
if
TYPE_CHECKING
:
if
TYPE_CHECKING
:
from
_typeshed
import
StrPath
from
_typeshed
import
StrPath
MODELS
=
[
os
.
path
.
join
(
models_path_prefix
,
"mistralai/Pixtral-12B-2409"
)]
PIXTRAL_ID
=
"mistralai/Pixtral-12B-2409"
#todo
MISTRAL_SMALL_3_1_ID
=
"mistralai/Mistral-Small-3.1-24B-Instruct-2503"
MODELS
=
[
os
.
path
.
join
(
models_path_prefix
,
PIXTRAL_ID
),
os
.
path
.
join
(
models_path_prefix
,
MISTRAL_SMALL_3_1_ID
)]
IMG_URLS
=
[
IMG_URLS
=
[
"https://picsum.photos/id/237/400/300"
,
"https://picsum.photos/id/237/400/300"
,
"https://picsum.photos/id/231/200/300"
,
"https://picsum.photos/id/231/200/300"
,
...
@@ -41,7 +42,7 @@ IMG_URLS = [
...
@@ -41,7 +42,7 @@ IMG_URLS = [
PROMPT
=
"Describe each image in one short sentence."
PROMPT
=
"Describe each image in one short sentence."
def
_create_msg_format
(
urls
:
L
ist
[
str
])
->
L
ist
[
D
ict
[
str
,
Any
]]:
def
_create_msg_format
(
urls
:
l
ist
[
str
])
->
l
ist
[
d
ict
[
str
,
Any
]]:
return
[{
return
[{
"role"
:
"role"
:
"user"
,
"user"
,
...
@@ -57,7 +58,7 @@ def _create_msg_format(urls: List[str]) -> List[Dict[str, Any]]:
...
@@ -57,7 +58,7 @@ def _create_msg_format(urls: List[str]) -> List[Dict[str, Any]]:
}]
}]
def
_create_msg_format_hf
(
urls
:
L
ist
[
str
])
->
L
ist
[
D
ict
[
str
,
Any
]]:
def
_create_msg_format_hf
(
urls
:
l
ist
[
str
])
->
l
ist
[
d
ict
[
str
,
Any
]]:
return
[{
return
[{
"role"
:
"role"
:
"user"
,
"user"
,
...
@@ -71,7 +72,7 @@ def _create_msg_format_hf(urls: List[str]) -> List[Dict[str, Any]]:
...
@@ -71,7 +72,7 @@ def _create_msg_format_hf(urls: List[str]) -> List[Dict[str, Any]]:
}]
}]
def
_create_engine_inputs
(
urls
:
L
ist
[
str
])
->
TokensPrompt
:
def
_create_engine_inputs
(
urls
:
l
ist
[
str
])
->
TokensPrompt
:
msg
=
_create_msg_format
(
urls
)
msg
=
_create_msg_format
(
urls
)
tokenizer
=
MistralTokenizer
.
from_model
(
"pixtral"
)
tokenizer
=
MistralTokenizer
.
from_model
(
"pixtral"
)
...
@@ -92,7 +93,7 @@ def _create_engine_inputs(urls: List[str]) -> TokensPrompt:
...
@@ -92,7 +93,7 @@ def _create_engine_inputs(urls: List[str]) -> TokensPrompt:
return
engine_inputs
return
engine_inputs
def
_create_engine_inputs_hf
(
urls
:
L
ist
[
str
])
->
TextPrompt
:
def
_create_engine_inputs_hf
(
urls
:
l
ist
[
str
])
->
TextPrompt
:
msg
=
_create_msg_format_hf
(
urls
)
msg
=
_create_msg_format_hf
(
urls
)
tokenizer
=
AutoProcessor
.
from_pretrained
(
os
.
path
.
join
(
models_path_prefix
,
"mistral-community/pixtral-12b"
))
tokenizer
=
AutoProcessor
.
from_pretrained
(
os
.
path
.
join
(
models_path_prefix
,
"mistral-community/pixtral-12b"
))
...
@@ -128,10 +129,12 @@ MAX_MODEL_LEN = [8192, 65536]
...
@@ -128,10 +129,12 @@ MAX_MODEL_LEN = [8192, 65536]
FIXTURES_PATH
=
VLLM_PATH
/
"tests/models/fixtures"
FIXTURES_PATH
=
VLLM_PATH
/
"tests/models/fixtures"
assert
FIXTURES_PATH
.
exists
()
assert
FIXTURES_PATH
.
exists
()
FIXTURE_LOGPROBS_CHAT
=
FIXTURES_PATH
/
"pixtral_chat.json"
FIXTURE_LOGPROBS_CHAT
=
{
FIXTURE_LOGPROBS_ENGINE
=
FIXTURES_PATH
/
"pixtral_chat_engine.json"
PIXTRAL_ID
:
FIXTURES_PATH
/
"pixtral_chat.json"
,
MISTRAL_SMALL_3_1_ID
:
FIXTURES_PATH
/
"mistral_small_3_chat.json"
,
}
OutputsLogprobs
=
L
ist
[
T
uple
[
L
ist
[
int
],
str
,
Optional
[
SampleLogprobs
]]]
OutputsLogprobs
=
l
ist
[
t
uple
[
l
ist
[
int
],
str
,
Optional
[
SampleLogprobs
]]]
# For the test author to store golden output in JSON
# For the test author to store golden output in JSON
...
@@ -169,12 +172,12 @@ def test_chat(
...
@@ -169,12 +172,12 @@ def test_chat(
model
:
str
,
model
:
str
,
dtype
:
str
,
dtype
:
str
,
)
->
None
:
)
->
None
:
EXPECTED_CHAT_LOGPROBS
=
load_outputs_w_logprobs
(
FIXTURE_LOGPROBS_CHAT
)
EXPECTED_CHAT_LOGPROBS
=
load_outputs_w_logprobs
(
FIXTURE_LOGPROBS_CHAT
[
model
])
with
vllm_runner
(
with
vllm_runner
(
model
,
model
,
dtype
=
dtype
,
dtype
=
dtype
,
tokenizer_mode
=
"mistral"
,
tokenizer_mode
=
"mistral"
,
enable_chunked_prefill
=
False
,
max_model_len
=
max_model_len
,
max_model_len
=
max_model_len
,
limit_mm_per_prompt
=
LIMIT_MM_PER_PROMPT
,
limit_mm_per_prompt
=
LIMIT_MM_PER_PROMPT
,
)
as
vllm_model
:
)
as
vllm_model
:
...
@@ -186,70 +189,40 @@ def test_chat(
...
@@ -186,70 +189,40 @@ def test_chat(
outputs
.
extend
(
output
)
outputs
.
extend
(
output
)
logprobs
=
vllm_runner
.
_final_steps_generate_w_logprobs
(
outputs
)
logprobs
=
vllm_runner
.
_final_steps_generate_w_logprobs
(
outputs
)
# Remove last `None` prompt_logprobs to compare with fixture
for
i
in
range
(
len
(
logprobs
)):
assert
logprobs
[
i
][
-
1
]
is
None
logprobs
[
i
]
=
logprobs
[
i
][:
-
1
]
check_logprobs_close
(
outputs_0_lst
=
EXPECTED_CHAT_LOGPROBS
,
check_logprobs_close
(
outputs_0_lst
=
EXPECTED_CHAT_LOGPROBS
,
outputs_1_lst
=
logprobs
,
outputs_1_lst
=
logprobs
,
name_0
=
"h100_ref"
,
name_0
=
"h100_ref"
,
name_1
=
"output"
)
name_1
=
"output"
)
@
large_gpu_test
(
min_gb
=
80
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
def
test_model_engine
(
vllm_runner
,
model
:
str
,
dtype
:
str
)
->
None
:
EXPECTED_ENGINE_LOGPROBS
=
load_outputs_w_logprobs
(
FIXTURE_LOGPROBS_ENGINE
)
args
=
EngineArgs
(
model
=
model
,
tokenizer_mode
=
"mistral"
,
enable_chunked_prefill
=
False
,
limit_mm_per_prompt
=
LIMIT_MM_PER_PROMPT
,
dtype
=
dtype
,
)
engine
=
LLMEngine
.
from_engine_args
(
args
)
engine
.
add_request
(
uuid
.
uuid4
().
hex
,
ENGINE_INPUTS
[
0
],
SAMPLING_PARAMS
)
engine
.
add_request
(
uuid
.
uuid4
().
hex
,
ENGINE_INPUTS
[
1
],
SAMPLING_PARAMS
)
outputs
=
[]
count
=
0
while
True
:
out
=
engine
.
step
()
count
+=
1
for
request_output
in
out
:
if
request_output
.
finished
:
outputs
.
append
(
request_output
)
if
count
==
2
:
engine
.
add_request
(
uuid
.
uuid4
().
hex
,
ENGINE_INPUTS
[
2
],
SAMPLING_PARAMS
)
if
not
engine
.
has_unfinished_requests
():
break
logprobs
=
vllm_runner
.
_final_steps_generate_w_logprobs
(
outputs
)
check_logprobs_close
(
outputs_0_lst
=
EXPECTED_ENGINE_LOGPROBS
,
outputs_1_lst
=
logprobs
,
name_0
=
"h100_ref"
,
name_1
=
"output"
)
@
large_gpu_test
(
min_gb
=
48
)
@
large_gpu_test
(
min_gb
=
48
)
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"prompt,expected_ranges"
,
"prompt,expected_ranges"
,
[(
_create_engine_inputs_hf
(
IMG_URLS
[:
1
]),
[{
[(
_create_engine_inputs_hf
(
IMG_URLS
[:
1
]),
[{
"offset"
:
1
0
,
"offset"
:
1
1
,
"length"
:
494
"length"
:
494
}]),
}]),
(
_create_engine_inputs_hf
(
IMG_URLS
[
1
:
4
]),
[{
(
_create_engine_inputs_hf
(
IMG_URLS
[
1
:
4
]),
[{
"offset"
:
1
0
,
"offset"
:
1
1
,
"length"
:
266
"length"
:
266
},
{
},
{
"offset"
:
27
6
,
"offset"
:
27
7
,
"length"
:
1056
"length"
:
1056
},
{
},
{
"offset"
:
133
2
,
"offset"
:
133
3
,
"length"
:
418
"length"
:
418
}])])
}])])
def
test_multi_modal_placeholders
(
def
test_multi_modal_placeholders
(
vllm_runner
,
prompt
,
vllm_runner
,
prompt
,
expected_ranges
:
list
[
PlaceholderRange
])
->
None
:
expected_ranges
:
list
[
PlaceholderRange
],
monkeypatch
)
->
None
:
# This placeholder checking test only works with V0 engine
# where `multi_modal_placeholders` is returned with `RequestOutput`
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"0"
)
with
vllm_runner
(
with
vllm_runner
(
os
.
path
.
join
(
models_path_prefix
,
"mistral-community/pixtral-12b"
),
os
.
path
.
join
(
models_path_prefix
,
"mistral-community/pixtral-12b"
),
max_model_len
=
8192
,
max_model_len
=
8192
,
...
...
tests/models/decoder_only/vision_language/test_qwen2_vl.py
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
from
typing
import
Any
,
List
,
Optional
,
Tuple
,
Type
,
TypedDict
,
Union
from
typing
import
Any
,
Optional
,
TypedDict
,
Union
import
os
import
os
import
numpy.typing
as
npt
import
numpy.typing
as
npt
...
@@ -16,6 +16,15 @@ from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput,
...
@@ -16,6 +16,15 @@ from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput,
from
...utils
import
check_logprobs_close
from
...utils
import
check_logprobs_close
from
....utils
import
models_path_prefix
from
....utils
import
models_path_prefix
@
pytest
.
fixture
(
scope
=
"function"
,
autouse
=
True
)
def
use_v0_only
(
monkeypatch
):
"""
V1 Test: batch_make_xxxxx_embeddings calls a V0 internal
"""
monkeypatch
.
setenv
(
'VLLM_USE_V1'
,
'0'
)
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-VL-2B-Instruct"
)]
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-VL-2B-Instruct"
)]
target_dtype
=
"half"
target_dtype
=
"half"
...
@@ -71,21 +80,21 @@ class Qwen2VLPromptVideoEmbeddingInput(TypedDict):
...
@@ -71,21 +80,21 @@ class Qwen2VLPromptVideoEmbeddingInput(TypedDict):
def
batch_make_image_embeddings
(
def
batch_make_image_embeddings
(
image_batches
:
L
ist
[
Union
[
Image
.
Image
,
L
ist
[
Image
.
Image
]]],
processor
,
image_batches
:
l
ist
[
Union
[
Image
.
Image
,
l
ist
[
Image
.
Image
]]],
processor
,
llm
:
VllmRunner
)
->
L
ist
[
Qwen2VLPromptImageEmbeddingInput
]:
llm
:
VllmRunner
)
->
l
ist
[
Qwen2VLPromptImageEmbeddingInput
]:
"""batched image embeddings for Qwen2-VL
"""batched image embeddings for Qwen2-VL
This will infer all images' embeddings in a single batch,
This will infer all images' embeddings in a single batch,
and split the result according to input batches.
and split the result according to input batches.
image_batches:
image_batches:
- Single-image batches: `
L
ist[Image.Image]`
- Single-image batches: `
l
ist[Image.Image]`
- Multiple-image batches: `
L
ist[
L
ist[Image.Image]]]`
- Multiple-image batches: `
l
ist[
l
ist[Image.Image]]]`
returns: `
L
ist[Qwen2VLPromptImageEmbeddingInput]`
returns: `
l
ist[Qwen2VLPromptImageEmbeddingInput]`
"""
"""
image_batches_
:
L
ist
[
Any
]
=
image_batches
[:]
image_batches_
:
l
ist
[
Any
]
=
image_batches
[:]
# convert single-image batches to multiple-image batches
# convert single-image batches to multiple-image batches
for
idx
in
range
(
len
(
image_batches_
)):
for
idx
in
range
(
len
(
image_batches_
)):
...
@@ -95,7 +104,7 @@ def batch_make_image_embeddings(
...
@@ -95,7 +104,7 @@ def batch_make_image_embeddings(
assert
isinstance
(
image_batches_
[
idx
],
list
)
assert
isinstance
(
image_batches_
[
idx
],
list
)
# append all images into a list (as a batch)
# append all images into a list (as a batch)
images
:
L
ist
[
Image
.
Image
]
=
[]
images
:
l
ist
[
Image
.
Image
]
=
[]
for
image_batch
in
image_batches_
:
for
image_batch
in
image_batches_
:
images
+=
image_batch
images
+=
image_batch
...
@@ -120,10 +129,11 @@ def batch_make_image_embeddings(
...
@@ -120,10 +129,11 @@ def batch_make_image_embeddings(
return
visual
(
pixel_values_on_device
,
return
visual
(
pixel_values_on_device
,
grid_thw
=
image_grid_thw_on_device
)
grid_thw
=
image_grid_thw_on_device
)
# V1 Test: this calls a V0 internal.
image_embeds
=
torch
.
concat
(
llm
.
apply_model
(
get_image_embeds
))
image_embeds
=
torch
.
concat
(
llm
.
apply_model
(
get_image_embeds
))
# split into original batches
# split into original batches
result
:
L
ist
[
Qwen2VLPromptImageEmbeddingInput
]
=
[]
result
:
l
ist
[
Qwen2VLPromptImageEmbeddingInput
]
=
[]
image_counter
=
0
image_counter
=
0
embed_counter
=
0
embed_counter
=
0
for
image_batch
in
image_batches_
:
for
image_batch
in
image_batches_
:
...
@@ -155,7 +165,7 @@ def batch_make_image_embeddings(
...
@@ -155,7 +165,7 @@ def batch_make_image_embeddings(
def
batch_make_video_embeddings
(
def
batch_make_video_embeddings
(
video_batches
:
PromptVideoInput
,
processor
,
video_batches
:
PromptVideoInput
,
processor
,
llm
:
VllmRunner
)
->
L
ist
[
Qwen2VLPromptVideoEmbeddingInput
]:
llm
:
VllmRunner
)
->
l
ist
[
Qwen2VLPromptVideoEmbeddingInput
]:
"""batched video embeddings for Qwen2-VL
"""batched video embeddings for Qwen2-VL
A NDArray represents a single video's all frames.
A NDArray represents a single video's all frames.
...
@@ -164,21 +174,21 @@ def batch_make_video_embeddings(
...
@@ -164,21 +174,21 @@ def batch_make_video_embeddings(
and split the result according to input batches.
and split the result according to input batches.
video_batches:
video_batches:
- Single-video batches: `
L
ist[NDArray]`
- Single-video batches: `
l
ist[NDArray]`
- Multiple-video batches: `
L
ist[
L
ist[NDArray]]`
- Multiple-video batches: `
l
ist[
l
ist[NDArray]]`
"""
"""
video_batches_
:
L
ist
[
Any
]
=
video_batches
[:]
video_batches_
:
l
ist
[
Any
]
=
video_batches
[:]
for
idx
in
range
(
len
(
video_batches_
)):
for
idx
in
range
(
len
(
video_batches_
)):
if
not
isinstance
(
video_batches_
[
idx
],
list
):
if
not
isinstance
(
video_batches_
[
idx
],
list
):
single_video_batch
:
L
ist
[
npt
.
NDArray
]
=
[
video_batches_
[
idx
]]
single_video_batch
:
l
ist
[
npt
.
NDArray
]
=
[
video_batches_
[
idx
]]
video_batches_
[
idx
]
=
single_video_batch
video_batches_
[
idx
]
=
single_video_batch
assert
isinstance
(
video_batches_
[
idx
],
list
)
assert
isinstance
(
video_batches_
[
idx
],
list
)
# append all videos into a list (as a batch)
# append all videos into a list (as a batch)
videos
:
L
ist
[
npt
.
NDArray
]
=
[]
videos
:
l
ist
[
npt
.
NDArray
]
=
[]
for
video_batch
in
video_batches_
:
for
video_batch
in
video_batches_
:
videos
+=
video_batch
videos
+=
video_batch
...
@@ -203,10 +213,11 @@ def batch_make_video_embeddings(
...
@@ -203,10 +213,11 @@ def batch_make_video_embeddings(
return
visual
(
pixel_values_on_device
,
return
visual
(
pixel_values_on_device
,
grid_thw
=
video_grid_thw_on_device
)
grid_thw
=
video_grid_thw_on_device
)
# V1 Test: this calls a V0 internal.
video_embeds
=
torch
.
concat
(
llm
.
apply_model
(
get_image_embeds
))
video_embeds
=
torch
.
concat
(
llm
.
apply_model
(
get_image_embeds
))
# split into original batches
# split into original batches
result
:
L
ist
[
Qwen2VLPromptVideoEmbeddingInput
]
=
[]
result
:
l
ist
[
Qwen2VLPromptVideoEmbeddingInput
]
=
[]
video_counter
=
0
video_counter
=
0
embed_counter
=
0
embed_counter
=
0
for
video_batch
in
video_batches_
:
for
video_batch
in
video_batches_
:
...
@@ -237,8 +248,8 @@ def batch_make_video_embeddings(
...
@@ -237,8 +248,8 @@ def batch_make_video_embeddings(
def
run_embedding_input_test
(
def
run_embedding_input_test
(
vllm_runner
:
T
ype
[
VllmRunner
],
vllm_runner
:
t
ype
[
VllmRunner
],
inputs
:
L
ist
[
T
uple
[
L
ist
[
str
],
PromptImageInput
,
PromptVideoInput
]],
inputs
:
l
ist
[
t
uple
[
l
ist
[
str
],
PromptImageInput
,
PromptVideoInput
]],
model
:
str
,
model
:
str
,
*
,
*
,
dtype
:
str
,
dtype
:
str
,
...
@@ -255,7 +266,6 @@ def run_embedding_input_test(
...
@@ -255,7 +266,6 @@ def run_embedding_input_test(
processor
=
AutoProcessor
.
from_pretrained
(
model
)
processor
=
AutoProcessor
.
from_pretrained
(
model
)
# NOTE:
# max_model_len should be greater than image_feature_size
# max_model_len should be greater than image_feature_size
with
vllm_runner
(
model
,
with
vllm_runner
(
model
,
task
=
"generate"
,
task
=
"generate"
,
...
@@ -325,8 +335,8 @@ def test_qwen2_vl_image_embeddings_input(vllm_runner, image_assets, model,
...
@@ -325,8 +335,8 @@ def test_qwen2_vl_image_embeddings_input(vllm_runner, image_assets, model,
num_logprobs
:
int
)
->
None
:
num_logprobs
:
int
)
->
None
:
images
=
[
asset
.
pil_image
for
asset
in
image_assets
]
images
=
[
asset
.
pil_image
for
asset
in
image_assets
]
inputs_per_case
:
L
ist
[
T
uple
[
inputs_per_case
:
l
ist
[
t
uple
[
L
ist
[
str
],
PromptImageInput
,
PromptVideoInput
]]
=
[(
l
ist
[
str
],
PromptImageInput
,
PromptVideoInput
]]
=
[(
[
prompt
for
_
in
size_factors
],
[
prompt
for
_
in
size_factors
],
[
rescale_image_size
(
image
,
factor
)
for
factor
in
size_factors
],
[
rescale_image_size
(
image
,
factor
)
for
factor
in
size_factors
],
[],
[],
...
@@ -367,7 +377,7 @@ def test_qwen2_vl_multiple_image_embeddings_input(vllm_runner, image_assets,
...
@@ -367,7 +377,7 @@ def test_qwen2_vl_multiple_image_embeddings_input(vllm_runner, image_assets,
num_logprobs
:
int
)
->
None
:
num_logprobs
:
int
)
->
None
:
images
=
[
asset
.
pil_image
for
asset
in
image_assets
]
images
=
[
asset
.
pil_image
for
asset
in
image_assets
]
inputs_per_case
:
L
ist
[
T
uple
[
L
ist
[
str
],
PromptImageInput
,
inputs_per_case
:
l
ist
[
t
uple
[
l
ist
[
str
],
PromptImageInput
,
PromptVideoInput
]]
=
[(
PromptVideoInput
]]
=
[(
[
MULTIIMAGE_PROMPT
for
_
in
size_factors
],
[
MULTIIMAGE_PROMPT
for
_
in
size_factors
],
[[
[[
...
@@ -415,8 +425,8 @@ def test_qwen2_vl_video_embeddings_input(vllm_runner, video_assets, model,
...
@@ -415,8 +425,8 @@ def test_qwen2_vl_video_embeddings_input(vllm_runner, video_assets, model,
for
asset
in
video_assets
for
asset
in
video_assets
]
]
inputs_per_case
:
L
ist
[
T
uple
[
inputs_per_case
:
l
ist
[
t
uple
[
L
ist
[
str
],
PromptImageInput
,
PromptVideoInput
]]
=
[(
l
ist
[
str
],
PromptImageInput
,
PromptVideoInput
]]
=
[(
[
prompt
for
_
in
size_factors
],
[
prompt
for
_
in
size_factors
],
[],
[],
[
rescale_video_size
(
video
,
factor
)
for
factor
in
size_factors
],
[
rescale_video_size
(
video
,
factor
)
for
factor
in
size_factors
],
...
...
tests/models/decoder_only/vision_language/vlm_utils/builders.py
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
"""Helpers for building inputs that can be leveraged for different test types.
"""Helpers for building inputs that can be leveraged for different test types.
"""
"""
from
collections.abc
import
Iterable
from
pathlib
import
PosixPath
from
pathlib
import
PosixPath
from
typing
import
Callable
,
Iterable
,
List
,
Optional
,
Tuple
,
Union
from
typing
import
Callable
,
Optional
,
Union
import
torch
import
torch
...
@@ -33,7 +34,7 @@ def replace_test_placeholder(prompt: str, img_idx_to_prompt: Callable[[int],
...
@@ -33,7 +34,7 @@ def replace_test_placeholder(prompt: str, img_idx_to_prompt: Callable[[int],
def
get_model_prompts
(
base_prompts
:
Iterable
[
str
],
def
get_model_prompts
(
base_prompts
:
Iterable
[
str
],
img_idx_to_prompt
:
Optional
[
Callable
[[
int
],
str
]],
img_idx_to_prompt
:
Optional
[
Callable
[[
int
],
str
]],
video_idx_to_prompt
:
Optional
[
Callable
[[
int
],
str
]],
video_idx_to_prompt
:
Optional
[
Callable
[[
int
],
str
]],
prompt_formatter
:
Callable
[[
str
],
str
])
->
L
ist
[
str
]:
prompt_formatter
:
Callable
[[
str
],
str
])
->
l
ist
[
str
]:
"""Given a model-agnostic base prompt and test configuration for a model(s)
"""Given a model-agnostic base prompt and test configuration for a model(s)
to be tested, update the media placeholders and apply the prompt formatting
to be tested, update the media placeholders and apply the prompt formatting
to get the test prompt string for this model.
to get the test prompt string for this model.
...
@@ -218,7 +219,7 @@ def build_video_inputs_from_test_info(
...
@@ -218,7 +219,7 @@ def build_video_inputs_from_test_info(
)
for
video
,
prompt
in
zip
(
sampled_vids
,
model_prompts
)]
)
for
video
,
prompt
in
zip
(
sampled_vids
,
model_prompts
)]
def
apply_image_size_scaling
(
image
,
size
:
Union
[
float
,
T
uple
[
int
,
int
]],
def
apply_image_size_scaling
(
image
,
size
:
Union
[
float
,
t
uple
[
int
,
int
]],
size_type
:
SizeType
):
size_type
:
SizeType
):
"""Applies a size scaler to one image; this can be a an image size factor,
"""Applies a size scaler to one image; this can be a an image size factor,
which scales the image while maintaining the aspect ratio"""
which scales the image while maintaining the aspect ratio"""
...
...
tests/models/decoder_only/vision_language/vlm_utils/case_filtering.py
View file @
469e903b
...
@@ -5,7 +5,7 @@ handling multimodal placeholder substitution, and so on.
...
@@ -5,7 +5,7 @@ handling multimodal placeholder substitution, and so on.
"""
"""
import
itertools
import
itertools
from
collections
import
OrderedDict
from
collections
import
OrderedDict
from
typing
import
Dict
,
Iterable
,
Tuple
from
collections.abc
import
Iterable
import
pytest
import
pytest
...
@@ -13,9 +13,9 @@ from .types import (EMBEDDING_SIZE_FACTORS, ExpandableVLMTestArgs,
...
@@ -13,9 +13,9 @@ from .types import (EMBEDDING_SIZE_FACTORS, ExpandableVLMTestArgs,
ImageSizeWrapper
,
SizeType
,
VLMTestInfo
,
VLMTestType
)
ImageSizeWrapper
,
SizeType
,
VLMTestInfo
,
VLMTestType
)
def
get_filtered_test_settings
(
test_settings
:
Dict
[
str
,
VLMTestInfo
],
def
get_filtered_test_settings
(
test_type
:
VLMTestType
,
test_settings
:
dict
[
str
,
VLMTestInfo
],
test_type
:
VLMTestType
,
fork
_per_test
:
bool
)
->
D
ict
[
str
,
VLMTestInfo
]:
new_proc
_per_test
:
bool
)
->
d
ict
[
str
,
VLMTestInfo
]:
"""Given the dict of potential test settings to run, return a subdict
"""Given the dict of potential test settings to run, return a subdict
of tests who have the current test type enabled with the matching val for
of tests who have the current test type enabled with the matching val for
fork_per_test.
fork_per_test.
...
@@ -43,22 +43,22 @@ def get_filtered_test_settings(test_settings: Dict[str, VLMTestInfo],
...
@@ -43,22 +43,22 @@ def get_filtered_test_settings(test_settings: Dict[str, VLMTestInfo],
# Everything looks okay; keep if this is has correct proc handling
# Everything looks okay; keep if this is has correct proc handling
if
(
test_info
.
distributed_executor_backend
if
(
test_info
.
distributed_executor_backend
is
not
None
)
==
fork
_per_test
:
is
not
None
)
==
new_proc
_per_test
:
matching_tests
[
test_name
]
=
test_info
matching_tests
[
test_name
]
=
test_info
return
matching_tests
return
matching_tests
def
get_parametrized_options
(
test_settings
:
D
ict
[
str
,
VLMTestInfo
],
def
get_parametrized_options
(
test_settings
:
d
ict
[
str
,
VLMTestInfo
],
test_type
:
VLMTestType
,
test_type
:
VLMTestType
,
fork
_new_process_for_each_test
:
bool
):
create
_new_process_for_each_test
:
bool
):
"""Converts all of our VLMTestInfo into an expanded list of parameters.
"""Converts all of our VLMTestInfo into an expanded list of parameters.
This is similar to nesting pytest parametrize calls, but done directly
This is similar to nesting pytest parametrize calls, but done directly
through an itertools product so that each test can set things like
through an itertools product so that each test can set things like
size factors etc, while still running in isolated test cases.
size factors etc, while still running in isolated test cases.
"""
"""
matching_tests
=
get_filtered_test_settings
(
matching_tests
=
get_filtered_test_settings
(
test_settings
,
test_type
,
fork
_new_process_for_each_test
)
test_settings
,
test_type
,
create
_new_process_for_each_test
)
# Ensure that something is wrapped as an iterable it's not already
# Ensure that something is wrapped as an iterable it's not already
ensure_wrapped
=
lambda
e
:
e
if
isinstance
(
e
,
(
list
,
tuple
))
else
(
e
,
)
ensure_wrapped
=
lambda
e
:
e
if
isinstance
(
e
,
(
list
,
tuple
))
else
(
e
,
)
...
@@ -121,7 +121,7 @@ def get_parametrized_options(test_settings: Dict[str, VLMTestInfo],
...
@@ -121,7 +121,7 @@ def get_parametrized_options(test_settings: Dict[str, VLMTestInfo],
def
get_wrapped_test_sizes
(
def
get_wrapped_test_sizes
(
test_info
:
VLMTestInfo
,
test_info
:
VLMTestInfo
,
test_type
:
VLMTestType
)
->
T
uple
[
ImageSizeWrapper
,
...]:
test_type
:
VLMTestType
)
->
t
uple
[
ImageSizeWrapper
,
...]:
"""Given a test info which may have size factors or fixed sizes, wrap them
"""Given a test info which may have size factors or fixed sizes, wrap them
and combine them into an iterable, each of which will be used in parameter
and combine them into an iterable, each of which will be used in parameter
expansion.
expansion.
...
...
tests/models/decoder_only/vision_language/vlm_utils/core.py
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
"""Core test implementation to be shared across modalities."""
"""Core test implementation to be shared across modalities."""
from
typing
import
Any
,
Callable
,
Dict
,
List
,
Optional
,
Tuple
,
Type
,
Union
from
typing
import
Any
,
Callable
,
Optional
,
Union
import
torch
import
torch
from
PIL.Image
import
Image
from
PIL.Image
import
Image
from
transformers
import
BatchEncoding
from
transformers.models.auto.auto_factory
import
_BaseAutoModelClass
from
transformers.models.auto.auto_factory
import
_BaseAutoModelClass
from
vllm.config
import
TaskOption
from
vllm.config
import
TaskOption
...
@@ -17,9 +16,9 @@ from .types import RunnerOutput
...
@@ -17,9 +16,9 @@ from .types import RunnerOutput
def
run_test
(
def
run_test
(
*
,
*
,
hf_runner
:
T
ype
[
HfRunner
],
hf_runner
:
t
ype
[
HfRunner
],
vllm_runner
:
T
ype
[
VllmRunner
],
vllm_runner
:
t
ype
[
VllmRunner
],
inputs
:
L
ist
[
T
uple
[
L
ist
[
str
],
L
ist
[
Union
[
L
ist
[
Image
],
Image
]]]],
inputs
:
l
ist
[
t
uple
[
l
ist
[
str
],
l
ist
[
Union
[
l
ist
[
Image
],
Image
]]]],
model
:
str
,
model
:
str
,
dtype
:
str
,
dtype
:
str
,
max_tokens
:
int
,
max_tokens
:
int
,
...
@@ -29,15 +28,14 @@ def run_test(
...
@@ -29,15 +28,14 @@ def run_test(
max_num_seqs
:
int
,
max_num_seqs
:
int
,
hf_output_post_proc
:
Optional
[
Callable
[[
RunnerOutput
,
str
],
Any
]],
hf_output_post_proc
:
Optional
[
Callable
[[
RunnerOutput
,
str
],
Any
]],
vllm_output_post_proc
:
Optional
[
Callable
[[
RunnerOutput
,
str
],
Any
]],
vllm_output_post_proc
:
Optional
[
Callable
[[
RunnerOutput
,
str
],
Any
]],
auto_cls
:
T
ype
[
_BaseAutoModelClass
],
auto_cls
:
t
ype
[
_BaseAutoModelClass
],
use_tokenizer_eos
:
bool
,
use_tokenizer_eos
:
bool
,
postprocess_inputs
:
Callable
[[
BatchEncoding
],
BatchEncoding
],
comparator
:
Callable
[...,
None
],
comparator
:
Callable
[...,
None
],
get_stop_token_ids
:
Optional
[
Callable
[[
AnyTokenizer
],
list
[
int
]]],
get_stop_token_ids
:
Optional
[
Callable
[[
AnyTokenizer
],
list
[
int
]]],
stop_str
:
Optional
[
L
ist
[
str
]],
stop_str
:
Optional
[
l
ist
[
str
]],
limit_mm_per_prompt
:
D
ict
[
str
,
int
],
limit_mm_per_prompt
:
d
ict
[
str
,
int
],
vllm_runner_kwargs
:
Optional
[
D
ict
[
str
,
Any
]],
vllm_runner_kwargs
:
Optional
[
d
ict
[
str
,
Any
]],
hf_model_kwargs
:
Optional
[
D
ict
[
str
,
Any
]],
hf_model_kwargs
:
Optional
[
d
ict
[
str
,
Any
]],
patch_hf_runner
:
Optional
[
Callable
[[
HfRunner
],
HfRunner
]],
patch_hf_runner
:
Optional
[
Callable
[[
HfRunner
],
HfRunner
]],
task
:
TaskOption
=
"auto"
,
task
:
TaskOption
=
"auto"
,
runner_mm_key
:
str
=
"images"
,
runner_mm_key
:
str
=
"images"
,
...
@@ -61,7 +59,9 @@ def run_test(
...
@@ -61,7 +59,9 @@ def run_test(
# if we run HF first, the cuda initialization will be done and it
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
# will hurt multiprocessing backend with fork method (the default method).
vllm_runner_kwargs_
:
Dict
[
str
,
Any
]
=
{}
vllm_runner_kwargs_
:
dict
[
str
,
Any
]
=
{
"disable_mm_preprocessor_cache"
:
True
,
}
if
model_info
.
tokenizer
:
if
model_info
.
tokenizer
:
vllm_runner_kwargs_
[
"tokenizer"
]
=
model_info
.
tokenizer
vllm_runner_kwargs_
[
"tokenizer"
]
=
model_info
.
tokenizer
if
model_info
.
tokenizer_mode
:
if
model_info
.
tokenizer_mode
:
...
@@ -84,7 +84,7 @@ def run_test(
...
@@ -84,7 +84,7 @@ def run_test(
**
vllm_runner_kwargs_
)
as
vllm_model
:
**
vllm_runner_kwargs_
)
as
vllm_model
:
tokenizer
=
vllm_model
.
model
.
get_tokenizer
()
tokenizer
=
vllm_model
.
model
.
get_tokenizer
()
vllm_kwargs
:
D
ict
[
str
,
Any
]
=
{}
vllm_kwargs
:
d
ict
[
str
,
Any
]
=
{}
if
get_stop_token_ids
is
not
None
:
if
get_stop_token_ids
is
not
None
:
vllm_kwargs
[
"stop_token_ids"
]
=
get_stop_token_ids
(
tokenizer
)
vllm_kwargs
[
"stop_token_ids"
]
=
get_stop_token_ids
(
tokenizer
)
if
stop_str
:
if
stop_str
:
...
@@ -99,7 +99,6 @@ def run_test(
...
@@ -99,7 +99,6 @@ def run_test(
hf_model
=
hf_runner
(
model
,
hf_model
=
hf_runner
(
model
,
dtype
=
dtype
,
dtype
=
dtype
,
auto_cls
=
auto_cls
,
auto_cls
=
auto_cls
,
postprocess_inputs
=
postprocess_inputs
,
model_kwargs
=
hf_model_kwargs
)
model_kwargs
=
hf_model_kwargs
)
# Some models need to patch things like the model processor, e.g., internvl
# Some models need to patch things like the model processor, e.g., internvl
...
...
tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
"""Custom input builders for edge-cases in different models."""
"""Custom input builders for edge-cases in different models."""
from
io
import
BytesIO
from
typing
import
Callable
from
typing
import
Callable
import
requests
from
PIL
import
Image
from
vllm.multimodal.image
import
rescale_image_size
from
vllm.multimodal.image
import
rescale_image_size
from
vllm.multimodal.video
import
(
rescale_video_size
,
resize_video
,
from
vllm.multimodal.video
import
(
rescale_video_size
,
resize_video
,
sample_frames_from_video
)
sample_frames_from_video
)
...
@@ -102,3 +106,17 @@ def different_patch_input_cases_internvl():
...
@@ -102,3 +106,17 @@ def different_patch_input_cases_internvl():
build_single_image_inputs
(
images
,
formatted_sprompts
,
wrapped_sf
),
build_single_image_inputs
(
images
,
formatted_sprompts
,
wrapped_sf
),
build_multi_image_inputs
([
images
],
formatted_mprompts
,
wrapped_sf
),
build_multi_image_inputs
([
images
],
formatted_mprompts
,
wrapped_sf
),
]
]
def
windows_attention_image_qwen2_5_vl
():
# image from regression issue: https://github.com/vllm-project/vllm/issues/15122
image_url
=
"https://aomediacodec.github.io/av1-avif/testFiles/Link-U/hato.jpg"
image
=
Image
.
open
(
BytesIO
(
requests
.
get
(
image_url
).
content
))
question
=
"Describe the image."
img_prompt
=
"<|vision_start|><|image_pad|><|vision_end|>"
prompt
=
(
f
"<|im_start|>User
\n
{
img_prompt
}{
question
}
<|im_end|>
\n
"
"<|im_start|>assistant
\n
"
)
wrapped_sf
=
ImageSizeWrapper
(
type
=
SizeType
.
SIZE_FACTOR
,
data
=
[
0.5
])
return
build_single_image_inputs
([
image
],
[
prompt
],
wrapped_sf
)
Prev
1
…
19
20
21
22
23
24
25
26
27
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment