Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
8d75f22e
Commit
8d75f22e
authored
Dec 13, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.13.0rc1' into v0.13.0rc1-ori
parents
ce888aa4
7d80c73d
Changes
656
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
295 additions
and
483 deletions
+295
-483
tests/models/multimodal/generation/test_granite_speech.py
tests/models/multimodal/generation/test_granite_speech.py
+13
-2
tests/models/multimodal/generation/test_phi4_multimodal.py
tests/models/multimodal/generation/test_phi4_multimodal.py
+0
-281
tests/models/multimodal/generation/test_pixtral.py
tests/models/multimodal/generation/test_pixtral.py
+10
-0
tests/models/multimodal/generation/vlm_utils/case_filtering.py
.../models/multimodal/generation/vlm_utils/case_filtering.py
+60
-54
tests/models/multimodal/generation/vlm_utils/custom_inputs.py
...s/models/multimodal/generation/vlm_utils/custom_inputs.py
+1
-1
tests/models/multimodal/generation/vlm_utils/model_utils.py
tests/models/multimodal/generation/vlm_utils/model_utils.py
+44
-1
tests/models/multimodal/generation/vlm_utils/types.py
tests/models/multimodal/generation/vlm_utils/types.py
+2
-2
tests/models/multimodal/pooling/conftest.py
tests/models/multimodal/pooling/conftest.py
+24
-0
tests/models/multimodal/pooling/test_siglip.py
tests/models/multimodal/pooling/test_siglip.py
+16
-2
tests/models/multimodal/processing/test_common.py
tests/models/multimodal/processing/test_common.py
+2
-24
tests/models/multimodal/processing/test_glm4_1v.py
tests/models/multimodal/processing/test_glm4_1v.py
+4
-3
tests/models/multimodal/processing/test_tensor_schema.py
tests/models/multimodal/processing/test_tensor_schema.py
+2
-3
tests/models/quantization/test_gguf.py
tests/models/quantization/test_gguf.py
+7
-0
tests/models/registry.py
tests/models/registry.py
+16
-9
tests/models/test_gguf_download.py
tests/models/test_gguf_download.py
+1
-1
tests/models/test_registry.py
tests/models/test_registry.py
+0
-2
tests/multimodal/test_cache.py
tests/multimodal/test_cache.py
+4
-7
tests/multimodal/test_inputs.py
tests/multimodal/test_inputs.py
+0
-91
tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
..._dummy_platform/vllm_add_dummy_platform/dummy_platform.py
+1
-0
tests/quantization/test_fp8.py
tests/quantization/test_fp8.py
+88
-0
No files found.
Too many changes to show.
To preserve performance only
656 of 656+
files are displayed.
Plain diff
Email patch
tests/models/multimodal/generation/test_granite_speech.py
View file @
8d75f22e
...
@@ -8,6 +8,7 @@ from transformers import AutoModelForSpeechSeq2Seq
...
@@ -8,6 +8,7 @@ from transformers import AutoModelForSpeechSeq2Seq
from
vllm.logprobs
import
SampleLogprobs
from
vllm.logprobs
import
SampleLogprobs
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.request
import
LoRARequest
from
vllm.platforms
import
current_platform
from
....conftest
import
AudioTestAssets
,
HfRunner
,
PromptAudioInput
,
VllmRunner
from
....conftest
import
AudioTestAssets
,
HfRunner
,
PromptAudioInput
,
VllmRunner
from
...registry
import
HF_EXAMPLE_MODELS
from
...registry
import
HF_EXAMPLE_MODELS
...
@@ -34,6 +35,12 @@ audio_lora_path = MODEL_NAME
...
@@ -34,6 +35,12 @@ audio_lora_path = MODEL_NAME
models
=
[
MODEL_NAME
]
models
=
[
MODEL_NAME
]
@
pytest
.
fixture
(
autouse
=
True
)
def
set_attention_backend_for_rocm
(
monkeypatch
):
if
current_platform
.
is_rocm
():
monkeypatch
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"TRITON_ATTN"
)
def
run_test
(
def
run_test
(
hf_runner
:
type
[
HfRunner
],
hf_runner
:
type
[
HfRunner
],
vllm_runner
:
type
[
VllmRunner
],
vllm_runner
:
type
[
VllmRunner
],
...
@@ -111,8 +118,12 @@ def run_test(
...
@@ -111,8 +118,12 @@ def run_test(
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"max_model_len"
,
[
2048
])
"dtype"
,
[
"float16"
]
if
current_platform
.
is_rocm
()
else
[
"bfloat16"
]
)
@
pytest
.
mark
.
parametrize
(
"max_model_len"
,
[
512
]
if
current_platform
.
is_rocm
()
else
[
2048
]
)
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
10
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
10
])
def
test_models
(
def
test_models
(
...
...
tests/models/multimodal/generation/test_phi4_multimodal.py
deleted
100644 → 0
View file @
ce888aa4
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
from
collections.abc
import
Sequence
import
librosa
import
pytest
from
huggingface_hub
import
snapshot_download
from
vllm.assets.image
import
ImageAsset
from
vllm.lora.request
import
LoRARequest
from
vllm.multimodal.image
import
rescale_image_size
from
....conftest
import
(
IMAGE_ASSETS
,
HfRunner
,
PromptAudioInput
,
PromptImageInput
,
VllmRunner
,
)
from
....utils
import
large_gpu_test
from
...utils
import
check_logprobs_close
HF_IMAGE_PROMPTS
=
IMAGE_ASSETS
.
prompts
(
{
"stop_sign"
:
"<|user|>
\n
<|image|>
\n
What's the content of the image?<|end|>
\n
<|assistant|>
\n
"
,
# noqa: E501
"cherry_blossom"
:
"<|user|>
\n
<|image|>
\n
Please infer the season with reason in details.<|end|>
\n
<|assistant|>
\n
"
,
# noqa: E501
}
)
HF_MULTIIMAGE_IMAGE_PROMPT
=
(
"<|user|>
\n
<|image|>
\n
<|image|>
\n
Describe these images.<|end|>
\n
<|assistant|>
\n
"
# noqa: E501
)
model_path
=
snapshot_download
(
"microsoft/Phi-4-multimodal-instruct"
,
revision
=
"refs/pr/70"
)
# Since the vision-lora and speech-lora co-exist with the base model,
# we have to manually specify the path of the lora weights.
vision_lora_path
=
os
.
path
.
join
(
model_path
,
"vision-lora"
)
speech_question
=
os
.
path
.
join
(
model_path
,
"examples"
,
"what_is_shown_in_this_image.wav"
)
models
=
[
model_path
]
target_dtype
=
"half"
def
run_test
(
hf_runner
:
type
[
HfRunner
],
vllm_runner
:
type
[
VllmRunner
],
inputs
:
Sequence
[
tuple
[
list
[
str
],
PromptImageInput
,
PromptAudioInput
|
None
]],
model
:
str
,
*
,
max_model_len
:
int
,
dtype
:
str
,
max_tokens
:
int
,
num_logprobs
:
int
,
mm_limit
:
int
,
tensor_parallel_size
:
int
,
distributed_executor_backend
:
str
|
None
=
None
,
):
"""Inference result should be the same between hf and vllm.
All the image fixtures for the test are from IMAGE_ASSETS.
For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalDataDict objects
and corresponding MultiModalConfig as input.
Note, the text input is also adjusted to abide by vllm contract.
The text output is sanitized to be able to compare with hf.
"""
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
# max_model_len should be greater than image_feature_size
with
vllm_runner
(
model
,
task
=
"generate"
,
max_model_len
=
max_model_len
,
max_num_seqs
=
2
,
dtype
=
dtype
,
limit_mm_per_prompt
=
{
"image"
:
mm_limit
},
tensor_parallel_size
=
tensor_parallel_size
,
distributed_executor_backend
=
distributed_executor_backend
,
enable_lora
=
True
,
max_lora_rank
=
320
,
gpu_memory_utilization
=
0.8
,
# set to 0.8 to avoid OOM in CI
enforce_eager
=
True
,
trust_remote_code
=
False
,
)
as
vllm_model
:
lora_request
=
LoRARequest
(
"vision"
,
1
,
vision_lora_path
)
vllm_outputs_per_case
=
[
vllm_model
.
generate_greedy_logprobs
(
prompts
,
max_tokens
,
num_logprobs
=
num_logprobs
,
images
=
images
,
audios
=
audios
,
lora_request
=
lora_request
,
)
for
prompts
,
images
,
audios
in
inputs
]
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
hf_model
.
model
.
load_adapter
(
vision_lora_path
,
adapter_name
=
"vision"
,
)
hf_processor
=
hf_model
.
processor
eos_token_id
=
hf_processor
.
tokenizer
.
eos_token_id
hf_outputs_per_case
=
[
hf_model
.
generate_greedy_logprobs_limit
(
prompts
,
max_tokens
,
num_logprobs
=
num_logprobs
,
images
=
images
,
audios
=
audios
,
eos_token_id
=
eos_token_id
,
)
for
prompts
,
images
,
audios
in
inputs
]
for
hf_outputs
,
vllm_outputs
in
zip
(
hf_outputs_per_case
,
vllm_outputs_per_case
):
check_logprobs_close
(
outputs_0_lst
=
hf_outputs
,
outputs_1_lst
=
vllm_outputs
,
name_0
=
"hf"
,
name_1
=
"vllm"
,
)
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"size_factors"
,
[
# No image
[],
# Single-scale
[
1.0
],
# Single-scale, batched
[
1.0
,
1.0
,
1.0
],
# Multi-scale
[
0.25
,
0.5
,
1.0
],
],
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
target_dtype
])
@
pytest
.
mark
.
parametrize
(
"max_model_len"
,
[
12800
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
10
])
def
test_models
(
hf_runner
,
vllm_runner
,
image_assets
,
model
,
size_factors
,
dtype
:
str
,
max_model_len
:
int
,
max_tokens
:
int
,
num_logprobs
:
int
,
)
->
None
:
images
=
[
asset
.
pil_image
for
asset
in
image_assets
]
inputs_per_image
=
[
(
[
prompt
for
_
in
size_factors
],
[
rescale_image_size
(
image
,
factor
)
for
factor
in
size_factors
],
None
,
)
for
image
,
prompt
in
zip
(
images
,
HF_IMAGE_PROMPTS
)
]
run_test
(
hf_runner
,
vllm_runner
,
inputs_per_image
,
model
,
dtype
=
dtype
,
max_model_len
=
max_model_len
,
max_tokens
=
max_tokens
,
num_logprobs
=
num_logprobs
,
mm_limit
=
1
,
tensor_parallel_size
=
1
,
)
@
large_gpu_test
(
min_gb
=
48
)
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"size_factors"
,
[
# No image
# [],
# Single-scale
[
1.0
],
# Single-scale, batched
[
1.0
,
1.0
,
1.0
],
# Multi-scale
[
0.25
,
0.5
,
1.0
],
],
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
target_dtype
])
@
pytest
.
mark
.
parametrize
(
"max_model_len"
,
[
25600
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
10
])
def
test_multi_images_models
(
hf_runner
,
vllm_runner
,
image_assets
,
model
,
size_factors
,
dtype
:
str
,
max_model_len
:
int
,
max_tokens
:
int
,
num_logprobs
:
int
,
)
->
None
:
images
=
[
asset
.
pil_image
for
asset
in
image_assets
]
inputs_per_case
=
[
(
[
HF_MULTIIMAGE_IMAGE_PROMPT
for
_
in
size_factors
],
[
[
rescale_image_size
(
image
,
factor
)
for
image
in
images
]
for
factor
in
size_factors
],
None
,
),
]
run_test
(
hf_runner
,
vllm_runner
,
inputs_per_case
,
model
,
dtype
=
dtype
,
max_model_len
=
max_model_len
,
max_tokens
=
max_tokens
,
num_logprobs
=
num_logprobs
,
mm_limit
=
2
,
tensor_parallel_size
=
1
,
)
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
target_dtype
])
@
pytest
.
mark
.
parametrize
(
"max_model_len"
,
[
12800
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
10
])
def
test_vision_speech_models
(
hf_runner
,
vllm_runner
,
model
,
dtype
:
str
,
max_model_len
:
int
,
max_tokens
:
int
,
num_logprobs
:
int
,
)
->
None
:
# use the example speech question so that the model outputs are reasonable
audio
=
librosa
.
load
(
speech_question
,
sr
=
16000
)
image
=
ImageAsset
(
"cherry_blossom"
).
pil_image
.
convert
(
"RGB"
)
inputs_vision_speech
=
[
(
[
"<|user|><|image|><|audio|><|end|><|assistant|>"
],
[
image
],
[
audio
],
),
]
run_test
(
hf_runner
,
vllm_runner
,
inputs_vision_speech
,
model
,
dtype
=
dtype
,
max_model_len
=
max_model_len
,
max_tokens
=
max_tokens
,
num_logprobs
=
num_logprobs
,
mm_limit
=
1
,
tensor_parallel_size
=
1
,
)
tests/models/multimodal/generation/test_pixtral.py
View file @
8d75f22e
...
@@ -15,6 +15,7 @@ from transformers import AutoProcessor
...
@@ -15,6 +15,7 @@ from transformers import AutoProcessor
from
vllm
import
SamplingParams
,
TextPrompt
,
TokensPrompt
from
vllm
import
SamplingParams
,
TextPrompt
,
TokensPrompt
from
vllm.logprobs
import
Logprob
,
SampleLogprobs
from
vllm.logprobs
import
Logprob
,
SampleLogprobs
from
vllm.multimodal
import
MultiModalDataBuiltins
from
vllm.multimodal
import
MultiModalDataBuiltins
from
vllm.platforms
import
current_platform
from
....utils
import
VLLM_PATH
,
large_gpu_test
from
....utils
import
VLLM_PATH
,
large_gpu_test
from
...utils
import
check_logprobs_close
from
...utils
import
check_logprobs_close
...
@@ -165,6 +166,15 @@ def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
...
@@ -165,6 +166,15 @@ def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
def
test_chat
(
def
test_chat
(
vllm_runner
,
max_model_len
:
int
,
model
:
str
,
dtype
:
str
,
local_asset_server
vllm_runner
,
max_model_len
:
int
,
model
:
str
,
dtype
:
str
,
local_asset_server
)
->
None
:
)
->
None
:
if
(
model
==
MISTRAL_SMALL_3_1_ID
and
max_model_len
==
65536
and
current_platform
.
is_rocm
()
):
pytest
.
skip
(
"OOM on ROCm: 24B model with 65536 context length exceeds GPU memory"
)
EXPECTED_CHAT_LOGPROBS
=
load_outputs_w_logprobs
(
FIXTURE_LOGPROBS_CHAT
[
model
])
EXPECTED_CHAT_LOGPROBS
=
load_outputs_w_logprobs
(
FIXTURE_LOGPROBS_CHAT
[
model
])
with
vllm_runner
(
with
vllm_runner
(
model
,
model
,
...
...
tests/models/multimodal/generation/vlm_utils/case_filtering.py
View file @
8d75f22e
...
@@ -62,6 +62,65 @@ def get_filtered_test_settings(
...
@@ -62,6 +62,65 @@ def get_filtered_test_settings(
return
matching_tests
return
matching_tests
def
get_model_type_cases
(
model_type
:
str
,
test_info
:
VLMTestInfo
,
test_type
:
VLMTestType
,
):
# Ensure that something is wrapped as an iterable it's not already
ensure_wrapped
=
lambda
e
:
e
if
isinstance
(
e
,
(
list
,
tuple
))
else
(
e
,)
# This is essentially the same as nesting a bunch of mark.parametrize
# decorators, but we do it programmatically to allow overrides for on
# a per-model basis, while still being able to execute each of these
# as individual test cases in pytest.
iter_kwargs
=
OrderedDict
(
[
(
"model"
,
ensure_wrapped
(
test_info
.
models
)),
(
"max_tokens"
,
ensure_wrapped
(
test_info
.
max_tokens
)),
(
"num_logprobs"
,
ensure_wrapped
(
test_info
.
num_logprobs
)),
(
"dtype"
,
ensure_wrapped
(
test_info
.
dtype
)),
(
"distributed_executor_backend"
,
ensure_wrapped
(
test_info
.
distributed_executor_backend
),
),
]
)
# num_frames is video only
if
test_type
==
VLMTestType
.
VIDEO
:
iter_kwargs
[
"num_video_frames"
]
=
ensure_wrapped
(
test_info
.
num_video_frames
)
iter_kwargs
[
"needs_video_metadata"
]
=
ensure_wrapped
(
test_info
.
needs_video_metadata
)
# No sizes passed for custom inputs, since inputs are directly provided
if
test_type
not
in
(
VLMTestType
.
CUSTOM_INPUTS
,
VLMTestType
.
AUDIO
,
):
wrapped_sizes
=
get_wrapped_test_sizes
(
test_info
,
test_type
)
if
wrapped_sizes
is
None
:
raise
ValueError
(
f
"Sizes must be set for test type
{
test_type
}
"
)
iter_kwargs
[
"size_wrapper"
]
=
wrapped_sizes
# Otherwise expand the custom test options instead
elif
test_type
==
VLMTestType
.
CUSTOM_INPUTS
:
if
test_info
.
custom_test_opts
is
None
:
raise
ValueError
(
"Test has type CUSTOM_INPUTS, but none given"
)
iter_kwargs
[
"custom_test_opts"
]
=
test_info
.
custom_test_opts
# Wrap all model cases in a pytest parameter & pass marks through
return
[
pytest
.
param
(
model_type
,
ExpandableVLMTestArgs
(
**
{
k
:
v
for
k
,
v
in
zip
(
iter_kwargs
.
keys
(),
case
)}),
marks
=
test_info
.
marks
if
test_info
.
marks
is
not
None
else
[],
)
for
case
in
list
(
itertools
.
product
(
*
iter_kwargs
.
values
()))
]
def
get_parametrized_options
(
def
get_parametrized_options
(
test_settings
:
dict
[
str
,
VLMTestInfo
],
test_settings
:
dict
[
str
,
VLMTestInfo
],
test_type
:
VLMTestType
,
test_type
:
VLMTestType
,
...
@@ -76,64 +135,11 @@ def get_parametrized_options(
...
@@ -76,64 +135,11 @@ def get_parametrized_options(
test_settings
,
test_type
,
create_new_process_for_each_test
test_settings
,
test_type
,
create_new_process_for_each_test
)
)
# Ensure that something is wrapped as an iterable it's not already
ensure_wrapped
=
lambda
e
:
e
if
isinstance
(
e
,
(
list
,
tuple
))
else
(
e
,)
def
get_model_type_cases
(
model_type
:
str
,
test_info
:
VLMTestInfo
):
# This is essentially the same as nesting a bunch of mark.parametrize
# decorators, but we do it programmatically to allow overrides for on
# a per-model basis, while still being able to execute each of these
# as individual test cases in pytest.
iter_kwargs
=
OrderedDict
(
[
(
"model"
,
ensure_wrapped
(
test_info
.
models
)),
(
"max_tokens"
,
ensure_wrapped
(
test_info
.
max_tokens
)),
(
"num_logprobs"
,
ensure_wrapped
(
test_info
.
num_logprobs
)),
(
"dtype"
,
ensure_wrapped
(
test_info
.
dtype
)),
(
"distributed_executor_backend"
,
ensure_wrapped
(
test_info
.
distributed_executor_backend
),
),
]
)
# num_frames is video only
if
test_type
==
VLMTestType
.
VIDEO
:
iter_kwargs
[
"num_video_frames"
]
=
ensure_wrapped
(
test_info
.
num_video_frames
)
iter_kwargs
[
"needs_video_metadata"
]
=
ensure_wrapped
(
test_info
.
needs_video_metadata
)
# No sizes passed for custom inputs, since inputs are directly provided
if
test_type
not
in
(
VLMTestType
.
CUSTOM_INPUTS
,
VLMTestType
.
AUDIO
):
wrapped_sizes
=
get_wrapped_test_sizes
(
test_info
,
test_type
)
if
wrapped_sizes
is
None
:
raise
ValueError
(
f
"Sizes must be set for test type
{
test_type
}
"
)
iter_kwargs
[
"size_wrapper"
]
=
wrapped_sizes
# Otherwise expand the custom test options instead
elif
test_type
==
VLMTestType
.
CUSTOM_INPUTS
:
if
test_info
.
custom_test_opts
is
None
:
raise
ValueError
(
"Test has type CUSTOM_INPUTS, but none given"
)
iter_kwargs
[
"custom_test_opts"
]
=
test_info
.
custom_test_opts
# Wrap all model cases in a pytest parameter & pass marks through
return
[
pytest
.
param
(
model_type
,
ExpandableVLMTestArgs
(
**
{
k
:
v
for
k
,
v
in
zip
(
iter_kwargs
.
keys
(),
case
)}
),
marks
=
test_info
.
marks
if
test_info
.
marks
is
not
None
else
[],
)
for
case
in
list
(
itertools
.
product
(
*
iter_kwargs
.
values
()))
]
# Get a list per model type, where each entry contains a tuple of all of
# Get a list per model type, where each entry contains a tuple of all of
# that model type's cases, then flatten them into the top level so that
# that model type's cases, then flatten them into the top level so that
# we can consume them in one mark.parametrize call.
# we can consume them in one mark.parametrize call.
cases_by_model_type
=
[
cases_by_model_type
=
[
get_model_type_cases
(
model_type
,
test_info
)
get_model_type_cases
(
model_type
,
test_info
,
test_type
)
for
model_type
,
test_info
in
matching_tests
.
items
()
for
model_type
,
test_info
in
matching_tests
.
items
()
]
]
return
list
(
itertools
.
chain
(
*
cases_by_model_type
))
return
list
(
itertools
.
chain
(
*
cases_by_model_type
))
...
...
tests/models/multimodal/generation/vlm_utils/custom_inputs.py
View file @
8d75f22e
...
@@ -140,7 +140,7 @@ def video_with_metadata_glm4_1v():
...
@@ -140,7 +140,7 @@ def video_with_metadata_glm4_1v():
metadata
=
VIDEO_ASSETS
[
0
].
metadata
metadata
=
VIDEO_ASSETS
[
0
].
metadata
question
=
"Describe the video."
question
=
"Describe the video."
video_prompt
=
"<|begin_of_video|><|video|><|end_of_video|>"
video_prompt
=
"<|begin_of_video|><|video|><|end_of_video|>"
formatted_prompt
=
f
"<|user|>
\n
{
video_prompt
}{
question
}
<|assistant|>
\n
"
formatted_prompt
=
f
"
[gMASK]
<|user|>
\n
{
video_prompt
}{
question
}
<|assistant|>
\n
"
scales
=
[
0.1
,
0.2
,
0.25
]
scales
=
[
0.1
,
0.2
,
0.25
]
video_input
=
[
video_input
=
[
...
...
tests/models/multimodal/generation/vlm_utils/model_utils.py
View file @
8d75f22e
...
@@ -25,6 +25,7 @@ from transformers import (
...
@@ -25,6 +25,7 @@ from transformers import (
from
transformers.video_utils
import
VideoMetadata
from
transformers.video_utils
import
VideoMetadata
from
vllm.logprobs
import
SampleLogprobs
from
vllm.logprobs
import
SampleLogprobs
from
vllm.platforms
import
current_platform
from
vllm.utils.collection_utils
import
is_list_of
from
vllm.utils.collection_utils
import
is_list_of
from
.....conftest
import
HfRunner
,
ImageAsset
,
ImageTestAssets
from
.....conftest
import
HfRunner
,
ImageAsset
,
ImageTestAssets
...
@@ -366,6 +367,40 @@ def gemma3_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOut
...
@@ -366,6 +367,40 @@ def gemma3_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOut
def
glm4v_patch_hf_runner
(
hf_model
:
HfRunner
)
->
HfRunner
:
def
glm4v_patch_hf_runner
(
hf_model
:
HfRunner
)
->
HfRunner
:
"""Patches and returns an instance of the HfRunner to use for GLM4V."""
"""Patches and returns an instance of the HfRunner to use for GLM4V."""
if
current_platform
.
is_rocm
():
import
types
config
=
hf_model
.
model
.
config
if
hasattr
(
config
,
"num_layers"
)
and
not
hasattr
(
config
,
"num_hidden_layers"
):
config
.
num_hidden_layers
=
config
.
num_layers
config
.
output_hidden_states
=
True
def
patched_prepare_cache
(
self
,
generation_config
,
model_kwargs
,
*
args
,
**
kwargs
):
model_kwargs
[
"past_key_values"
]
=
None
model_kwargs
[
"use_cache"
]
=
False
return
model_kwargs
hf_model
.
model
.
_prepare_cache_for_generation
=
types
.
MethodType
(
patched_prepare_cache
,
hf_model
.
model
)
original_generate
=
hf_model
.
model
.
generate
def
patched_generate
(
*
args
,
**
kwargs
):
kwargs
[
"output_hidden_states"
]
=
True
kwargs
[
"return_dict_in_generate"
]
=
True
return
original_generate
(
*
args
,
**
kwargs
)
hf_model
.
model
.
generate
=
patched_generate
original_forward
=
hf_model
.
model
.
forward
def
patched_forward
(
*
args
,
**
kwargs
):
kwargs
[
"output_hidden_states"
]
=
True
return
original_forward
(
*
args
,
**
kwargs
)
hf_model
.
model
.
forward
=
patched_forward
hf_processor
=
hf_model
.
processor
hf_processor
=
hf_model
.
processor
def
processor
(
*
args
,
text
=
""
,
images
=
None
,
**
kwargs
):
def
processor
(
*
args
,
text
=
""
,
images
=
None
,
**
kwargs
):
...
@@ -406,7 +441,15 @@ def glm4_1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
...
@@ -406,7 +441,15 @@ def glm4_1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
if
videos
is
not
None
and
is_list_of
(
videos
,
tuple
):
if
videos
is
not
None
and
is_list_of
(
videos
,
tuple
):
# If videos is a list of tuples, we assume each tuple contains
# If videos is a list of tuples, we assume each tuple contains
# (video_array, metadata) as in the case of GLM4.1V.
# (video_array, metadata) as in the case of GLM4.1V.
video_metadata
=
[[
VideoMetadata
(
**
video
[
1
])]
for
video
in
videos
]
# Filter out 'do_sample_frames' as it's not a valid VideoMetadata arg
video_metadata
=
[
[
VideoMetadata
(
**
{
k
:
v
for
k
,
v
in
video
[
1
].
items
()
if
k
!=
"do_sample_frames"
}
)
]
for
video
in
videos
]
videos
=
[[
video
[
0
]]
for
video
in
videos
]
videos
=
[[
video
[
0
]]
for
video
in
videos
]
else
:
else
:
video_metadata
=
None
video_metadata
=
None
...
...
tests/models/multimodal/generation/vlm_utils/types.py
View file @
8d75f22e
...
@@ -50,8 +50,8 @@ MULTI_IMAGE_BASE_PROMPT = f"Image-1: {TEST_IMG_PLACEHOLDER}Image-2: {TEST_IMG_PL
...
@@ -50,8 +50,8 @@ MULTI_IMAGE_BASE_PROMPT = f"Image-1: {TEST_IMG_PLACEHOLDER}Image-2: {TEST_IMG_PL
VIDEO_BASE_PROMPT
=
f
"
{
TEST_VIDEO_PLACEHOLDER
}
Why is this video funny?"
VIDEO_BASE_PROMPT
=
f
"
{
TEST_VIDEO_PLACEHOLDER
}
Why is this video funny?"
IMAGE_SIZE_FACTORS
=
[
(),
(
1.0
,),
(
1.0
,
1.0
,
1.0
),
(
0.25
,
0.5
,
1.0
)]
IMAGE_SIZE_FACTORS
=
[(
1.0
,),
(
1.0
,
1.0
,
1.0
),
(
0.25
,
0.5
,
1.0
)]
EMBEDDING_SIZE_FACTORS
=
[
(),
(
1.0
,),
(
1.0
,
1.0
,
1.0
)]
EMBEDDING_SIZE_FACTORS
=
[(
1.0
,),
(
1.0
,
1.0
,
1.0
)]
RunnerOutput
=
tuple
[
list
[
int
],
str
,
SampleLogprobs
|
None
]
RunnerOutput
=
tuple
[
list
[
int
],
str
,
SampleLogprobs
|
None
]
...
...
tests/models/multimodal/pooling/conftest.py
0 → 100644
View file @
8d75f22e
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Pytest configuration for vLLM pooling tests."""
import
os
import
warnings
from
vllm.platforms
import
current_platform
def
pytest_collection_modifyitems
(
config
,
items
):
"""Set FLEX_ATTENTION backend for SigLIP tests on ROCm."""
if
not
current_platform
.
is_rocm
():
return
siglip_tests
=
[
item
for
item
in
items
if
"test_siglip"
in
item
.
nodeid
]
if
siglip_tests
:
os
.
environ
[
"VLLM_ATTENTION_BACKEND"
]
=
"FLEX_ATTENTION"
warnings
.
warn
(
"ROCm: Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION for SigLIP tests"
,
UserWarning
,
stacklevel
=
1
,
)
tests/models/multimodal/pooling/test_siglip.py
View file @
8d75f22e
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
typing
import
Any
import
pytest
import
pytest
from
transformers
import
SiglipModel
from
transformers
import
SiglipModel
...
@@ -35,7 +37,11 @@ def _run_test(
...
@@ -35,7 +37,11 @@ def _run_test(
model
:
str
,
model
:
str
,
*
,
*
,
dtype
:
str
,
dtype
:
str
,
tokenization_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
)
->
None
:
)
->
None
:
if
tokenization_kwargs
is
None
:
tokenization_kwargs
=
{}
with
vllm_runner
(
with
vllm_runner
(
model
,
model
,
runner
=
"pooling"
,
runner
=
"pooling"
,
...
@@ -44,10 +50,14 @@ def _run_test(
...
@@ -44,10 +50,14 @@ def _run_test(
max_model_len
=
64
,
max_model_len
=
64
,
gpu_memory_utilization
=
0.7
,
gpu_memory_utilization
=
0.7
,
)
as
vllm_model
:
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
embed
(
input_texts
,
images
=
input_images
)
vllm_outputs
=
vllm_model
.
embed
(
input_texts
,
images
=
input_images
,
tokenization_kwargs
=
tokenization_kwargs
)
with
hf_runner
(
model
,
dtype
=
dtype
,
auto_cls
=
SiglipModel
)
as
hf_model
:
with
hf_runner
(
model
,
dtype
=
dtype
,
auto_cls
=
SiglipModel
)
as
hf_model
:
all_inputs
=
hf_model
.
get_inputs
(
input_texts
,
images
=
input_images
)
all_inputs
=
hf_model
.
get_inputs
(
input_texts
,
images
=
input_images
,
tokenization_kwargs
=
tokenization_kwargs
)
all_outputs
=
[]
all_outputs
=
[]
for
inputs
in
all_inputs
:
for
inputs
in
all_inputs
:
...
@@ -94,6 +104,10 @@ def test_models_text(
...
@@ -94,6 +104,10 @@ def test_models_text(
input_images
,
# type: ignore
input_images
,
# type: ignore
model
,
model
,
dtype
=
dtype
,
dtype
=
dtype
,
tokenization_kwargs
=
{
"padding"
:
"max_length"
,
"max_length"
:
64
,
},
# siglip2 was trained with this padding setting.
)
)
...
...
tests/models/multimodal/processing/test_common.py
View file @
8d75f22e
...
@@ -20,7 +20,7 @@ from vllm.config.multimodal import (
...
@@ -20,7 +20,7 @@ from vllm.config.multimodal import (
)
)
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModalDataDict
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModalDataDict
from
vllm.multimodal.cache
import
MultiModalProcessorOnlyCache
from
vllm.multimodal.cache
import
MultiModalProcessorOnlyCache
from
vllm.multimodal.inputs
import
MultiModalInputs
from
vllm.multimodal.inputs
import
MultiModalInputs
,
batched_tensors_equal
from
vllm.multimodal.processing
import
BaseMultiModalProcessor
,
InputProcessingContext
from
vllm.multimodal.processing
import
BaseMultiModalProcessor
,
InputProcessingContext
from
vllm.tokenizers
import
(
from
vllm.tokenizers
import
(
MistralTokenizer
,
MistralTokenizer
,
...
@@ -396,28 +396,6 @@ def test_processing_correctness(
...
@@ -396,28 +396,6 @@ def test_processing_correctness(
)
)
# Phi4MultimodalForCausalLM share same model repo with original format
# Phi4MMForCausalLM, so we add it as a separate test case
# Remove this test after conversion PR merged:
# https://huggingface.co/microsoft/Phi-4-multimodal-instruct/discussions/70
@
pytest
.
mark
.
parametrize
(
"model_arch"
,
[
"Phi4MultimodalForCausalLM"
])
@
pytest
.
mark
.
parametrize
(
"hit_rate"
,
[
0.3
,
0.5
,
1.0
])
@
pytest
.
mark
.
parametrize
(
"num_batches"
,
[
32
])
@
pytest
.
mark
.
parametrize
(
"simplify_rate"
,
[
1.0
])
def
test_processing_correctness_phi4_multimodal
(
model_arch
:
str
,
hit_rate
:
float
,
num_batches
:
int
,
simplify_rate
:
float
,
):
_test_processing_correctness
(
model_arch
,
hit_rate
=
hit_rate
,
num_batches
=
num_batches
,
simplify_rate
=
simplify_rate
,
)
def
_assert_inputs_equal
(
def
_assert_inputs_equal
(
a
:
MultiModalInputs
,
a
:
MultiModalInputs
,
b
:
MultiModalInputs
,
b
:
MultiModalInputs
,
...
@@ -440,4 +418,4 @@ def _assert_inputs_equal(
...
@@ -440,4 +418,4 @@ def _assert_inputs_equal(
a_data
.
pop
(
key
,
None
)
a_data
.
pop
(
key
,
None
)
b_data
.
pop
(
key
,
None
)
b_data
.
pop
(
key
,
None
)
assert
a_data
==
b_data
,
msg
assert
batched_tensors_equal
(
a_data
,
b_data
)
,
msg
tests/models/multimodal/processing/test_glm4_1v.py
View file @
8d75f22e
...
@@ -5,6 +5,7 @@ import pytest
...
@@ -5,6 +5,7 @@ import pytest
from
vllm.assets.video
import
VideoAsset
from
vllm.assets.video
import
VideoAsset
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
batched_tensors_equal
from
vllm.multimodal.video
import
OpenCVDynamicVideoBackend
,
OpenCVVideoBackend
from
vllm.multimodal.video
import
OpenCVDynamicVideoBackend
,
OpenCVVideoBackend
from
...utils
import
build_model_context
from
...utils
import
build_model_context
...
@@ -103,7 +104,7 @@ def test_video_loader_consistency(
...
@@ -103,7 +104,7 @@ def test_video_loader_consistency(
dynamic_outputs
=
processor
.
apply
(
prompt
,
dynamic_mm_data
,
hf_processor_mm_kwargs
)
dynamic_outputs
=
processor
.
apply
(
prompt
,
dynamic_mm_data
,
hf_processor_mm_kwargs
)
assert
static_outputs
[
"prompt_token_ids"
]
==
dynamic_outputs
[
"prompt_token_ids"
]
assert
static_outputs
[
"prompt_token_ids"
]
==
dynamic_outputs
[
"prompt_token_ids"
]
assert
(
assert
batched_tensors_equal
(
static_outputs
[
"mm_kwargs"
].
get_data
()
static_outputs
[
"mm_kwargs"
].
get_data
()
,
==
dynamic_outputs
[
"mm_kwargs"
].
get_data
()
dynamic_outputs
[
"mm_kwargs"
].
get_data
()
,
)
)
tests/models/multimodal/processing/test_tensor_schema.py
View file @
8d75f22e
...
@@ -130,10 +130,9 @@ def create_batched_mm_kwargs(
...
@@ -130,10 +130,9 @@ def create_batched_mm_kwargs(
hf_processor_mm_kwargs
=
processor_inputs
.
hf_processor_mm_kwargs
,
hf_processor_mm_kwargs
=
processor_inputs
.
hf_processor_mm_kwargs
,
tokenization_kwargs
=
processor_inputs
.
tokenization_kwargs
,
tokenization_kwargs
=
processor_inputs
.
tokenization_kwargs
,
)[
"mm_kwargs"
].
require_data
()
)[
"mm_kwargs"
].
require_data
()
items
=
[
item
for
modality
in
supported_mm_limits
for
item
in
mm_kwargs
[
modality
]]
return
group_mm_kwargs_by_modality
(
return
group_mm_kwargs_by_modality
(
items
,
[
item
for
modality
in
supported_mm_limits
for
item
in
mm_kwargs
[
modality
]]
merge_by_field_config
=
model_cls
.
merge_by_field_config
,
)
)
...
...
tests/models/quantization/test_gguf.py
View file @
8d75f22e
...
@@ -47,6 +47,12 @@ QWEN2_CONFIG = GGUFTestConfig(
...
@@ -47,6 +47,12 @@ QWEN2_CONFIG = GGUFTestConfig(
gguf_filename
=
"qwen2.5-1.5b-instruct-q6_k.gguf"
,
gguf_filename
=
"qwen2.5-1.5b-instruct-q6_k.gguf"
,
)
)
QWEN3_CONFIG
=
GGUFTestConfig
(
original_model
=
"Qwen/Qwen3-0.6B"
,
gguf_repo
=
"unsloth/Qwen3-0.6B-GGUF"
,
gguf_filename
=
"Qwen3-0.6B-BF16.gguf"
,
)
PHI3_CONFIG
=
GGUFTestConfig
(
PHI3_CONFIG
=
GGUFTestConfig
(
original_model
=
"microsoft/Phi-3.5-mini-instruct"
,
original_model
=
"microsoft/Phi-3.5-mini-instruct"
,
gguf_repo
=
"bartowski/Phi-3.5-mini-instruct-GGUF"
,
gguf_repo
=
"bartowski/Phi-3.5-mini-instruct-GGUF"
,
...
@@ -87,6 +93,7 @@ GEMMA3_CONFIG = GGUFTestConfig(
...
@@ -87,6 +93,7 @@ GEMMA3_CONFIG = GGUFTestConfig(
MODELS
=
[
MODELS
=
[
# LLAMA_CONFIG, # broken: https://github.com/vllm-project/vllm/issues/19458
# LLAMA_CONFIG, # broken: https://github.com/vllm-project/vllm/issues/19458
QWEN2_CONFIG
,
QWEN2_CONFIG
,
QWEN3_CONFIG
,
PHI3_CONFIG
,
PHI3_CONFIG
,
GPT2_CONFIG
,
GPT2_CONFIG
,
STABLELM_CONFIG
,
STABLELM_CONFIG
,
...
...
tests/models/registry.py
View file @
8d75f22e
...
@@ -211,10 +211,10 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
...
@@ -211,10 +211,10 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code
=
True
,
trust_remote_code
=
True
,
),
),
"CohereForCausalLM"
:
_HfExamplesInfo
(
"CohereForCausalLM"
:
_HfExamplesInfo
(
"Cohere
ForAI
/c4ai-command-r-v01"
,
trust_remote_code
=
True
"Cohere
Labs
/c4ai-command-r-v01"
,
trust_remote_code
=
True
),
),
"Cohere2ForCausalLM"
:
_HfExamplesInfo
(
"Cohere2ForCausalLM"
:
_HfExamplesInfo
(
"Cohere
ForAI
/c4ai-command-r7b-12-2024"
,
"Cohere
Labs
/c4ai-command-r7b-12-2024"
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
),
),
"CwmForCausalLM"
:
_HfExamplesInfo
(
"facebook/cwm"
,
min_transformers_version
=
"4.58"
),
"CwmForCausalLM"
:
_HfExamplesInfo
(
"facebook/cwm"
,
min_transformers_version
=
"4.58"
),
...
@@ -416,7 +416,11 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
...
@@ -416,7 +416,11 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code
=
True
,
trust_remote_code
=
True
,
),
),
"Qwen2ForCausalLM"
:
_HfExamplesInfo
(
"Qwen2ForCausalLM"
:
_HfExamplesInfo
(
"Qwen/Qwen2-0.5B-Instruct"
,
extras
=
{
"2.5"
:
"Qwen/Qwen2.5-0.5B-Instruct"
}
"Qwen/Qwen2-0.5B-Instruct"
,
extras
=
{
"2.5"
:
"Qwen/Qwen2.5-0.5B-Instruct"
,
"2.5-1.5B"
:
"Qwen/Qwen2.5-1.5B-Instruct"
,
},
),
),
"Qwen2MoeForCausalLM"
:
_HfExamplesInfo
(
"Qwen/Qwen1.5-MoE-A2.7B-Chat"
),
"Qwen2MoeForCausalLM"
:
_HfExamplesInfo
(
"Qwen/Qwen1.5-MoE-A2.7B-Chat"
),
"Qwen3ForCausalLM"
:
_HfExamplesInfo
(
"Qwen/Qwen3-8B"
),
"Qwen3ForCausalLM"
:
_HfExamplesInfo
(
"Qwen/Qwen3-8B"
),
...
@@ -577,7 +581,7 @@ _AUTOMATIC_CONVERTED_MODELS = {
...
@@ -577,7 +581,7 @@ _AUTOMATIC_CONVERTED_MODELS = {
_MULTIMODAL_EXAMPLE_MODELS
=
{
_MULTIMODAL_EXAMPLE_MODELS
=
{
# [Decoder-only]
# [Decoder-only]
"AriaForConditionalGeneration"
:
_HfExamplesInfo
(
"rhymes-ai/Aria"
),
"AriaForConditionalGeneration"
:
_HfExamplesInfo
(
"rhymes-ai/Aria"
),
"AyaVisionForConditionalGeneration"
:
_HfExamplesInfo
(
"Cohere
ForAI
/aya-vision-8b"
),
"AyaVisionForConditionalGeneration"
:
_HfExamplesInfo
(
"Cohere
Labs
/aya-vision-8b"
),
"BeeForConditionalGeneration"
:
_HfExamplesInfo
(
"BeeForConditionalGeneration"
:
_HfExamplesInfo
(
"Open-Bee/Bee-8B-RL"
,
"Open-Bee/Bee-8B-RL"
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
...
@@ -667,6 +671,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -667,6 +671,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"moonshotai/Kimi-VL-A3B-Instruct"
,
"moonshotai/Kimi-VL-A3B-Instruct"
,
extras
=
{
"thinking"
:
"moonshotai/Kimi-VL-A3B-Thinking"
},
extras
=
{
"thinking"
:
"moonshotai/Kimi-VL-A3B-Thinking"
},
trust_remote_code
=
True
,
trust_remote_code
=
True
,
max_transformers_version
=
"4.53.3"
,
transformers_version_reason
=
"HF model uses deprecated transformers API "
"(PytorchGELUTanh, DynamicCache.seen_tokens, and more). See: "
"https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/discussions/31"
,
),
),
"LightOnOCRForConditionalGeneration"
:
_HfExamplesInfo
(
"LightOnOCRForConditionalGeneration"
:
_HfExamplesInfo
(
"lightonai/LightOnOCR-1B"
,
"lightonai/LightOnOCR-1B"
,
...
@@ -767,10 +775,6 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -767,10 +775,6 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"Phi4MMForCausalLM"
:
_HfExamplesInfo
(
"Phi4MMForCausalLM"
:
_HfExamplesInfo
(
"microsoft/Phi-4-multimodal-instruct"
,
trust_remote_code
=
True
"microsoft/Phi-4-multimodal-instruct"
,
trust_remote_code
=
True
),
),
"Phi4MultimodalForCausalLM"
:
_HfExamplesInfo
(
"microsoft/Phi-4-multimodal-instruct"
,
revision
=
"refs/pr/70"
,
),
"PixtralForConditionalGeneration"
:
_HfExamplesInfo
(
"PixtralForConditionalGeneration"
:
_HfExamplesInfo
(
"mistralai/Pixtral-12B-2409"
,
"mistralai/Pixtral-12B-2409"
,
extras
=
{
extras
=
{
...
@@ -831,7 +835,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -831,7 +835,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"TarsierForConditionalGeneration"
:
_HfExamplesInfo
(
"omni-research/Tarsier-7b"
),
"TarsierForConditionalGeneration"
:
_HfExamplesInfo
(
"omni-research/Tarsier-7b"
),
"Tarsier2ForConditionalGeneration"
:
_HfExamplesInfo
(
"Tarsier2ForConditionalGeneration"
:
_HfExamplesInfo
(
"omni-research/Tarsier2-Recap-7b"
,
"omni-research/Tarsier2-Recap-7b"
,
hf_overrides
=
{
"architectures"
:
[
"Tarsier2ForConditionalGeneration"
]},
hf_overrides
=
{
"architectures"
:
[
"Tarsier2ForConditionalGeneration"
],
"model_type"
:
"tarsier2"
,
},
),
),
"VoxtralForConditionalGeneration"
:
_HfExamplesInfo
(
"VoxtralForConditionalGeneration"
:
_HfExamplesInfo
(
"mistralai/Voxtral-Mini-3B-2507"
,
"mistralai/Voxtral-Mini-3B-2507"
,
...
...
tests/models/test_gguf_download.py
View file @
8d75f22e
...
@@ -203,7 +203,7 @@ class TestGGUFModelLoader:
...
@@ -203,7 +203,7 @@ class TestGGUFModelLoader:
@
patch
(
"vllm.config.model.get_hf_image_processor_config"
,
return_value
=
None
)
@
patch
(
"vllm.config.model.get_hf_image_processor_config"
,
return_value
=
None
)
@
patch
(
"vllm.config.model.get_config"
)
@
patch
(
"vllm.config.model.get_config"
)
@
patch
(
"vllm.config.model.is_gguf"
,
return_value
=
False
)
@
patch
(
"vllm.config.model.is_gguf"
,
return_value
=
False
)
@
patch
(
"vllm.transformers_utils.utils.check_gguf_file"
,
return_value
=
False
)
@
patch
(
"vllm.transformers_utils.
gguf_
utils.check_gguf_file"
,
return_value
=
False
)
@
patch
(
"os.path.isfile"
,
return_value
=
False
)
@
patch
(
"os.path.isfile"
,
return_value
=
False
)
def
test_prepare_weights_invalid_format
(
def
test_prepare_weights_invalid_format
(
self
,
self
,
...
...
tests/models/test_registry.py
View file @
8d75f22e
...
@@ -13,7 +13,6 @@ from vllm.model_executor.models import (
...
@@ -13,7 +13,6 @@ from vllm.model_executor.models import (
)
)
from
vllm.model_executor.models.adapters
import
(
from
vllm.model_executor.models.adapters
import
(
as_embedding_model
,
as_embedding_model
,
as_reward_model
,
as_seq_cls_model
,
as_seq_cls_model
,
)
)
from
vllm.model_executor.models.registry
import
(
from
vllm.model_executor.models.registry
import
(
...
@@ -46,7 +45,6 @@ def test_registry_imports(model_arch):
...
@@ -46,7 +45,6 @@ def test_registry_imports(model_arch):
# All vLLM models should be convertible to a pooling model
# All vLLM models should be convertible to a pooling model
assert
is_pooling_model
(
as_seq_cls_model
(
model_cls
))
assert
is_pooling_model
(
as_seq_cls_model
(
model_cls
))
assert
is_pooling_model
(
as_embedding_model
(
model_cls
))
assert
is_pooling_model
(
as_embedding_model
(
model_cls
))
assert
is_pooling_model
(
as_reward_model
(
model_cls
))
if
model_arch
in
_MULTIMODAL_MODELS
:
if
model_arch
in
_MULTIMODAL_MODELS
:
assert
supports_multimodal
(
model_cls
)
assert
supports_multimodal
(
model_cls
)
...
...
tests/multimodal/test_cache.py
View file @
8d75f22e
...
@@ -51,7 +51,7 @@ def _dummy_elem(
...
@@ -51,7 +51,7 @@ def _dummy_elem(
modality
=
modality
,
modality
=
modality
,
key
=
key
,
key
=
key
,
data
=
data
,
data
=
data
,
field
=
MultiModalSharedField
(
1
),
field
=
MultiModalSharedField
(
batch_size
=
1
),
)
)
...
@@ -85,12 +85,6 @@ def _dummy_items(
...
@@ -85,12 +85,6 @@ def _dummy_items(
(
_dummy_item
(
"a"
,
{
"a1"
:
100
}),
100
),
(
_dummy_item
(
"a"
,
{
"a1"
:
100
}),
100
),
(
_dummy_item
(
"a"
,
{
"a1"
:
100
,
"a2"
:
110
}),
210
),
(
_dummy_item
(
"a"
,
{
"a1"
:
100
,
"a2"
:
110
}),
210
),
(
_dummy_items
({
"a"
:
{
"a1"
:
100
,
"a2"
:
110
},
"b"
:
{
"b1"
:
120
,
"b2"
:
130
}}),
460
),
# noqa: E501
(
_dummy_items
({
"a"
:
{
"a1"
:
100
,
"a2"
:
110
},
"b"
:
{
"b1"
:
120
,
"b2"
:
130
}}),
460
),
# noqa: E501
(
_dummy_items
(
{
"a"
:
{
"a1"
:
100
,
"a2"
:
110
},
"b"
:
{
"b1"
:
120
,
"b2"
:
130
}}
).
get_data
(),
460
,
),
# noqa: E501
],
],
)
)
def
test_cache_item_size
(
item
,
expected_size
):
def
test_cache_item_size
(
item
,
expected_size
):
...
@@ -107,6 +101,9 @@ def test_cache_item_size(item, expected_size):
...
@@ -107,6 +101,9 @@ def test_cache_item_size(item, expected_size):
cache
[
""
]
=
MultiModalProcessorCacheItemMetadata
(
item
,
[
prompt_update
])
cache
[
""
]
=
MultiModalProcessorCacheItemMetadata
(
item
,
[
prompt_update
])
assert
cache
.
currsize
==
expected_size
assert
cache
.
currsize
==
expected_size
cache
[
""
]
=
item
.
get_data
()
assert
cache
.
currsize
==
expected_size
def
_create_vllm_config
(
def
_create_vllm_config
(
*
,
*
,
...
...
tests/multimodal/test_inputs.py
deleted
100644 → 0
View file @
ce888aa4
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
torch
from
vllm.multimodal.inputs
import
MultiModalKwargs
,
NestedTensors
pytestmark
=
pytest
.
mark
.
cpu_test
def
assert_nested_tensors_equal
(
expected
:
NestedTensors
,
actual
:
NestedTensors
):
assert
type
(
expected
)
==
type
(
actual
)
# noqa: E721
if
isinstance
(
expected
,
torch
.
Tensor
):
assert
torch
.
equal
(
expected
,
actual
)
else
:
for
expected_item
,
actual_item
in
zip
(
expected
,
actual
):
assert_nested_tensors_equal
(
expected_item
,
actual_item
)
def
assert_multimodal_inputs_equal
(
expected
:
MultiModalKwargs
,
actual
:
MultiModalKwargs
):
assert
set
(
expected
.
keys
())
==
set
(
actual
.
keys
())
for
key
in
expected
:
assert_nested_tensors_equal
(
expected
[
key
],
actual
[
key
])
def
test_multimodal_input_batch_single_tensor
():
t
=
torch
.
rand
([
1
,
2
])
result
=
MultiModalKwargs
.
batch
([{
"image"
:
t
}])
assert_multimodal_inputs_equal
(
result
,
{
"image"
:
t
.
unsqueeze
(
0
)})
def
test_multimodal_input_batch_multiple_tensors
():
a
=
torch
.
rand
([
1
,
1
,
2
])
b
=
torch
.
rand
([
1
,
1
,
2
])
c
=
torch
.
rand
([
1
,
1
,
2
])
result
=
MultiModalKwargs
.
batch
([{
"image"
:
a
},
{
"image"
:
b
},
{
"image"
:
c
}])
assert_multimodal_inputs_equal
(
result
,
{
"image"
:
torch
.
stack
([
a
,
b
,
c
])})
def
test_multimodal_input_batch_multiple_heterogeneous_tensors
():
a
=
torch
.
rand
([
1
,
2
,
2
])
b
=
torch
.
rand
([
1
,
3
,
2
])
c
=
torch
.
rand
([
1
,
4
,
2
])
result
=
MultiModalKwargs
.
batch
([{
"image"
:
a
},
{
"image"
:
b
},
{
"image"
:
c
}])
assert_multimodal_inputs_equal
(
result
,
{
"image"
:
[
a
,
b
,
c
]})
def
test_multimodal_input_batch_nested_tensors
():
a
=
torch
.
rand
([
2
,
3
])
b
=
torch
.
rand
([
2
,
3
])
c
=
torch
.
rand
([
2
,
3
])
result
=
MultiModalKwargs
.
batch
([{
"image"
:
[
a
]},
{
"image"
:
[
b
]},
{
"image"
:
[
c
]}])
assert_multimodal_inputs_equal
(
result
,
{
"image"
:
torch
.
stack
([
a
.
unsqueeze
(
0
),
b
.
unsqueeze
(
0
),
c
.
unsqueeze
(
0
)])}
)
def
test_multimodal_input_batch_heterogeneous_lists
():
a
=
torch
.
rand
([
1
,
2
,
3
])
b
=
torch
.
rand
([
1
,
2
,
3
])
c
=
torch
.
rand
([
1
,
2
,
3
])
result
=
MultiModalKwargs
.
batch
([{
"image"
:
[
a
,
b
]},
{
"image"
:
[
c
]}])
assert_multimodal_inputs_equal
(
result
,
{
"image"
:
[
torch
.
stack
([
a
,
b
]),
c
.
unsqueeze
(
0
)]}
)
def
test_multimodal_input_batch_multiple_batchable_lists
():
a
=
torch
.
rand
([
1
,
2
,
3
])
b
=
torch
.
rand
([
1
,
2
,
3
])
c
=
torch
.
rand
([
1
,
2
,
3
])
d
=
torch
.
rand
([
1
,
2
,
3
])
result
=
MultiModalKwargs
.
batch
([{
"image"
:
[
a
,
b
]},
{
"image"
:
[
c
,
d
]}])
assert_multimodal_inputs_equal
(
result
,
{
"image"
:
torch
.
stack
([
torch
.
stack
([
a
,
b
]),
torch
.
stack
([
c
,
d
])])}
)
def
test_multimodal_input_batch_mixed_stacking_depths
():
a
=
torch
.
rand
([
1
,
2
,
3
])
b
=
torch
.
rand
([
1
,
3
,
3
])
c
=
torch
.
rand
([
1
,
4
,
3
])
result
=
MultiModalKwargs
.
batch
([{
"image"
:
[
a
,
b
]},
{
"image"
:
[
c
]}])
assert_multimodal_inputs_equal
(
result
,
{
"image"
:
[[
a
,
b
],
c
.
unsqueeze
(
0
)]})
result
=
MultiModalKwargs
.
batch
([{
"image"
:
[
a
]},
{
"image"
:
[
b
,
c
]}])
assert_multimodal_inputs_equal
(
result
,
{
"image"
:
[
a
.
unsqueeze
(
0
),
[
b
,
c
]]})
tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
View file @
8d75f22e
...
@@ -30,5 +30,6 @@ class DummyPlatform(Platform):
...
@@ -30,5 +30,6 @@ class DummyPlatform(Platform):
use_mla
,
use_mla
,
has_sink
,
has_sink
,
use_sparse
,
use_sparse
,
use_mm_prefix
,
):
):
return
"vllm_add_dummy_platform.dummy_attention_backend.DummyAttentionBackend"
# noqa E501
return
"vllm_add_dummy_platform.dummy_attention_backend.DummyAttentionBackend"
# noqa E501
tests/quantization/test_fp8.py
View file @
8d75f22e
...
@@ -10,10 +10,14 @@ import torch
...
@@ -10,10 +10,14 @@ import torch
from
tests.quantization.utils
import
is_quant_method_supported
from
tests.quantization.utils
import
is_quant_method_supported
from
vllm
import
_custom_ops
as
ops
from
vllm
import
_custom_ops
as
ops
from
vllm.model_executor.layers.fused_moe
import
FusedMoE
from
vllm.model_executor.layers.quantization.fp8
import
(
from
vllm.model_executor.layers.quantization.fp8
import
(
Fp8Config
,
Fp8KVCacheMethod
,
Fp8KVCacheMethod
,
Fp8LinearMethod
,
Fp8LinearMethod
,
Fp8MoEMethod
,
)
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
MODELS
=
[
MODELS
=
[
...
@@ -261,3 +265,87 @@ def test_scaled_fp8_quant(dtype) -> None:
...
@@ -261,3 +265,87 @@ def test_scaled_fp8_quant(dtype) -> None:
torch
.
narrow
(
y_nc_pad
,
0
,
0
,
x_nc
.
shape
[
0
]),
inv_scale_nc
,
dtype
torch
.
narrow
(
y_nc_pad
,
0
,
0
,
x_nc
.
shape
[
0
]),
inv_scale_nc
,
dtype
),
),
)
)
@
pytest
.
mark
.
parametrize
(
"method_cls"
,
[
Fp8LinearMethod
,
Fp8MoEMethod
])
# FP8 weight reloading does not support online quantization
@
pytest
.
mark
.
parametrize
(
"is_checkpoint_fp8_serialized"
,
[
True
])
# skip False
@
pytest
.
mark
.
parametrize
(
"weight_block_size"
,
[
None
,
[
1
,
1
]])
# any postprocessing that is applied to the weights such as padding and repacking
# (excluding device sharding) must also be applied to the reloaded weights
#
# this is the case for marlin as well as per-tensor Fp8MoEMethod
@
pytest
.
mark
.
parametrize
(
"use_marlin"
,
[
False
])
# skip True
def
test_fp8_reloading
(
method_cls
,
is_checkpoint_fp8_serialized
,
weight_block_size
,
use_marlin
,
dist_init
):
if
is_checkpoint_fp8_serialized
is
False
:
pytest
.
skip
(
"FP8 weight reloading does not support online quantization"
)
if
method_cls
is
Fp8MoEMethod
and
weight_block_size
is
None
:
pytest
.
skip
(
"FP8 Tensor weight reloading does not support fusing w13_weight_scale. "
"If this is your use case, consider using a restore function like #26327"
)
with
torch
.
device
(
"cuda:0"
):
config
=
Fp8Config
(
is_checkpoint_fp8_serialized
=
is_checkpoint_fp8_serialized
,
weight_block_size
=
weight_block_size
,
)
if
method_cls
is
Fp8LinearMethod
:
layer
=
torch
.
nn
.
Linear
(
1
,
1
)
method
=
method_cls
(
config
)
method
.
create_weights
(
layer
=
layer
,
input_size_per_partition
=
1
,
output_partition_sizes
=
[
1
],
input_size
=
1
,
output_size
=
1
,
params_dtype
=
torch
.
bfloat16
,
weight_loader
=
default_weight_loader
,
)
else
:
layer
=
FusedMoE
(
num_experts
=
1
,
top_k
=
1
,
hidden_size
=
1
,
intermediate_size
=
1
,
)
method
=
method_cls
(
config
,
layer
)
method
.
create_weights
(
layer
=
layer
,
num_experts
=
1
,
hidden_size
=
1
,
intermediate_size_per_partition
=
1
,
params_dtype
=
torch
.
bfloat16
,
weight_loader
=
default_weight_loader
,
)
method
.
use_marlin
=
use_marlin
# capture weights format during loading
original_metadata
=
[
(
name
,
param
.
shape
,
getattr
(
param
,
"weight_loader"
,
default_weight_loader
))
for
name
,
param
in
layer
.
named_parameters
()
]
# test loading
for
name
,
shape
,
_
in
original_metadata
:
param
=
getattr
(
layer
,
name
)
weight_loader
=
getattr
(
param
,
"weight_loader"
,
default_weight_loader
)
weight_loader
(
param
,
torch
.
zeros
(
shape
))
# cannot use empty
method
.
process_weights_after_loading
(
layer
)
# test reloading works after loading
# assuming that no reshaping occurred
for
name
,
shape
,
original_weight_loader
in
original_metadata
:
param
=
getattr
(
layer
,
name
)
weight_loader
=
getattr
(
param
,
"weight_loader"
,
default_weight_loader
)
assert
weight_loader
is
original_weight_loader
weight_loader
(
param
,
torch
.
zeros
(
shape
))
# cannot use empty
method
.
process_weights_after_loading
(
layer
)
Prev
1
…
10
11
12
13
14
15
16
17
18
…
33
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment