Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
99324e25
Commit
99324e25
authored
Jul 12, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.9.2' into v0.9.2-ori
parents
cc7f22a8
a5dd03c1
Changes
475
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
287 additions
and
54 deletions
+287
-54
tests/models/multimodal/pooling/test_llava_next.py
tests/models/multimodal/pooling/test_llava_next.py
+1
-1
tests/models/multimodal/pooling/test_phi3v.py
tests/models/multimodal/pooling/test_phi3v.py
+1
-1
tests/models/multimodal/processing/test_common.py
tests/models/multimodal/processing/test_common.py
+25
-0
tests/models/multimodal/test_mapping.py
tests/models/multimodal/test_mapping.py
+86
-0
tests/models/quantization/test_gguf.py
tests/models/quantization/test_gguf.py
+2
-2
tests/models/registry.py
tests/models/registry.py
+48
-24
tests/models/test_initialization.py
tests/models/test_initialization.py
+14
-3
tests/models/test_oot_registration.py
tests/models/test_oot_registration.py
+3
-1
tests/models/test_registry.py
tests/models/test_registry.py
+4
-4
tests/models/utils.py
tests/models/utils.py
+7
-0
tests/mq_llm_engine/test_error_handling.py
tests/mq_llm_engine/test_error_handling.py
+5
-5
tests/multi_step/test_correctness_llm.py
tests/multi_step/test_correctness_llm.py
+10
-0
tests/multimodal/test_hasher.py
tests/multimodal/test_hasher.py
+12
-0
tests/multimodal/test_processing.py
tests/multimodal/test_processing.py
+1
-0
tests/multimodal/test_utils.py
tests/multimodal/test_utils.py
+7
-4
tests/multimodal/test_video.py
tests/multimodal/test_video.py
+47
-1
tests/neuron/1_core/test_prefix_prefill.py
tests/neuron/1_core/test_prefix_prefill.py
+4
-5
tests/plugins/vllm_add_dummy_platform/setup.py
tests/plugins/vllm_add_dummy_platform/setup.py
+3
-1
tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py
...lm_add_dummy_platform/vllm_add_dummy_platform/__init__.py
+4
-0
tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py
...atform/vllm_add_dummy_platform/dummy_attention_backend.py
+3
-2
No files found.
Too many changes to show.
To preserve performance only
475 of 475+
files are displayed.
Plain diff
Email patch
tests/models/multimodal/pooling/test_llava_next.py
View file @
99324e25
...
@@ -68,7 +68,7 @@ def _run_test(
...
@@ -68,7 +68,7 @@ def _run_test(
dtype
=
dtype
,
dtype
=
dtype
,
max_model_len
=
4096
,
max_model_len
=
4096
,
enforce_eager
=
True
)
as
vllm_model
:
enforce_eager
=
True
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
e
ncode
(
input_texts
,
images
=
input_images
)
vllm_outputs
=
vllm_model
.
e
mbed
(
input_texts
,
images
=
input_images
)
with
hf_runner
(
model
,
dtype
=
dtype
,
with
hf_runner
(
model
,
dtype
=
dtype
,
auto_cls
=
AutoModelForImageTextToText
)
as
hf_model
:
auto_cls
=
AutoModelForImageTextToText
)
as
hf_model
:
...
...
tests/models/multimodal/pooling/test_phi3v.py
View file @
99324e25
...
@@ -46,7 +46,7 @@ def _run_test(
...
@@ -46,7 +46,7 @@ def _run_test(
# will hurt multiprocessing backend with fork method (the default method).
# will hurt multiprocessing backend with fork method (the default method).
with
vllm_runner
(
model
,
task
=
"embed"
,
dtype
=
dtype
,
with
vllm_runner
(
model
,
task
=
"embed"
,
dtype
=
dtype
,
enforce_eager
=
True
)
as
vllm_model
:
enforce_eager
=
True
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
e
ncode
(
input_texts
,
images
=
input_images
)
vllm_outputs
=
vllm_model
.
e
mbed
(
input_texts
,
images
=
input_images
)
# use eager mode for hf runner, since phi3_v didn't work with flash_attn
# use eager mode for hf runner, since phi3_v didn't work with flash_attn
hf_model_kwargs
=
{
"_attn_implementation"
:
"eager"
}
hf_model_kwargs
=
{
"_attn_implementation"
:
"eager"
}
...
...
tests/models/multimodal/processing/test_common.py
View file @
99324e25
...
@@ -24,6 +24,22 @@ from ....multimodal.utils import random_audio, random_image, random_video
...
@@ -24,6 +24,22 @@ from ....multimodal.utils import random_audio, random_image, random_video
from
...registry
import
HF_EXAMPLE_MODELS
from
...registry
import
HF_EXAMPLE_MODELS
def glm4_1v_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
    """
    Patch the multimodal data for the GLM4.1V model.

    If a "video" entry is present, it is replaced in place by a
    ``(video, metadata)`` tuple; the same mapping object is returned.
    Data without a "video" entry is returned untouched.
    """
    if "video" not in mm_data:
        return mm_data

    video_input = mm_data["video"]
    video_len = len(video_input)
    # Both the frame count and the fps are taken from len(video); the
    # duration is fixed at 1.
    video_metadata = {
        "total_num_frames": video_len,
        "fps": video_len,
        "duration": 1,
        "video_backend": "opencv",
    }
    mm_data["video"] = (video_input, video_metadata)
    return mm_data
def
_test_processing_correctness
(
def
_test_processing_correctness
(
model_id
:
str
,
model_id
:
str
,
hit_rate
:
float
,
hit_rate
:
float
,
...
@@ -154,6 +170,11 @@ _IGNORE_MM_KEYS = {
...
@@ -154,6 +170,11 @@ _IGNORE_MM_KEYS = {
"ultravox"
:
{
"audio_features"
},
"ultravox"
:
{
"audio_features"
},
}
}
MM_DATA_PATCHES
=
{
# GLM4.1V requires video metadata to be included in the input
"glm4v"
:
glm4_1v_patch_mm_data
,
}
def
_test_processing_correctness_one
(
def
_test_processing_correctness_one
(
model_config
:
ModelConfig
,
model_config
:
ModelConfig
,
...
@@ -166,6 +187,8 @@ def _test_processing_correctness_one(
...
@@ -166,6 +187,8 @@ def _test_processing_correctness_one(
):
):
model_type
=
model_config
.
hf_config
.
model_type
model_type
=
model_config
.
hf_config
.
model_type
ignore_mm_keys
=
_IGNORE_MM_KEYS
.
get
(
model_type
,
set
[
str
]())
ignore_mm_keys
=
_IGNORE_MM_KEYS
.
get
(
model_type
,
set
[
str
]())
if
model_type
in
MM_DATA_PATCHES
:
mm_data
=
MM_DATA_PATCHES
[
model_type
](
mm_data
)
if
isinstance
(
prompt
,
str
):
if
isinstance
(
prompt
,
str
):
text_prompt
=
prompt
text_prompt
=
prompt
...
@@ -245,6 +268,7 @@ def _test_processing_correctness_one(
...
@@ -245,6 +268,7 @@ def _test_processing_correctness_one(
"adept/fuyu-8b"
,
"adept/fuyu-8b"
,
"google/gemma-3-4b-it"
,
"google/gemma-3-4b-it"
,
"THUDM/glm-4v-9b"
,
"THUDM/glm-4v-9b"
,
"THUDM/GLM-4.1V-9B-Thinking"
,
"ibm-granite/granite-speech-3.3-2b"
,
"ibm-granite/granite-speech-3.3-2b"
,
"h2oai/h2ovl-mississippi-800m"
,
"h2oai/h2ovl-mississippi-800m"
,
"OpenGVLab/InternVL2-1B"
,
"OpenGVLab/InternVL2-1B"
,
...
@@ -284,6 +308,7 @@ def _test_processing_correctness_one(
...
@@ -284,6 +308,7 @@ def _test_processing_correctness_one(
"fixie-ai/ultravox-v0_5-llama-3_2-1b"
,
"fixie-ai/ultravox-v0_5-llama-3_2-1b"
,
"openai/whisper-large-v3"
,
"openai/whisper-large-v3"
,
"omni-research/Tarsier-7b"
,
"omni-research/Tarsier-7b"
,
"omni-research/Tarsier2-Recap-7b"
])
])
@
pytest
.
mark
.
parametrize
(
"hit_rate"
,
[
0.3
,
0.5
,
1.0
])
@
pytest
.
mark
.
parametrize
(
"hit_rate"
,
[
0.3
,
0.5
,
1.0
])
@
pytest
.
mark
.
parametrize
(
"num_batches"
,
[
32
])
@
pytest
.
mark
.
parametrize
(
"num_batches"
,
[
32
])
...
...
tests/models/multimodal/test_mapping.py
0 → 100644
View file @
99324e25
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
collections.abc
import
Iterable
import
pytest
import
torch
import
transformers
from
transformers
import
AutoConfig
,
PreTrainedModel
from
vllm.config
import
ModelConfig
from
vllm.model_executor.models.utils
import
WeightsMapper
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.transformers_utils.config
import
try_get_safetensors_metadata
from
..registry
import
_MULTIMODAL_EXAMPLE_MODELS
,
HF_EXAMPLE_MODELS
def create_repo_dummy_weights(repo: str) -> Iterable[tuple[str, torch.Tensor]]:
    """Create dummy weights from safetensors checkpoint metadata.

    Args:
        repo: Repository identifier handed to
            ``try_get_safetensors_metadata``.

    Returns:
        One ``(name, tensor)`` pair per key of the metadata's ``weight_map``,
        where every tensor is an empty placeholder created while the meta
        device context is active.
    """
    metadata = try_get_safetensors_metadata(repo)
    # NOTE(review): the "try_" prefix suggests this may return None when no
    # metadata is available -- confirm and handle if so.
    weight_names = list(metadata.weight_map.keys())
    with torch.device("meta"):
        # Build the pairs eagerly. A generator expression returned from here
        # would only execute after the context manager has exited, so the
        # placeholder tensors would be allocated on the default device
        # instead of the meta device.
        return [(name, torch.empty(0)) for name in weight_names]
def create_model_dummy_weights(
    repo: str,
    model_arch: str,
) -> Iterable[tuple[str, torch.Tensor]]:
    """
    Return the named parameters of a dummy HF model.

    The class named ``model_arch`` is looked up on the ``transformers``
    module and instantiated from the config of ``repo`` while the meta
    device context is active.
    """
    hf_model_cls: type[PreTrainedModel] = getattr(transformers, model_arch)
    hf_config = AutoConfig.from_pretrained(repo)
    with torch.device("meta"):
        hf_model: PreTrainedModel = hf_model_cls._from_config(hf_config)
    return hf_model.named_parameters()
def model_architectures_for_test() -> list[str]:
    """
    Collect the multimodal example architectures to parametrize over.

    An architecture is kept when its example entry does not set
    ``trust_remote_code``, ``transformers`` exposes an attribute with that
    name, and that attribute has a truthy
    ``_checkpoint_conversion_mapping``.
    """

    def _has_conversion_mapping(arch: str) -> bool:
        # Only called once hasattr(transformers, arch) has succeeded.
        hf_cls = getattr(transformers, arch)
        return bool(getattr(hf_cls, "_checkpoint_conversion_mapping", None))

    return [
        arch
        for arch, example in _MULTIMODAL_EXAMPLE_MODELS.items()
        if not example.trust_remote_code
        and hasattr(transformers, arch)
        and _has_conversion_mapping(arch)
    ]
@pytest.mark.core_model
@pytest.mark.parametrize("model_arch", model_architectures_for_test())
def test_hf_model_weights_mapper(model_arch: str):
    """
    Check that the model's ``hf_to_vllm_mapper`` yields the same set of
    weight names for the checkpoint's weight names as for the parameter
    names of the HF model built from config.
    """
    info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
    info.check_available_online(on_fail="skip")
    info.check_transformers_version(on_fail="skip")

    model_id = info.default

    config = ModelConfig(
        model_id,
        task="auto",
        tokenizer=info.tokenizer or model_id,
        tokenizer_mode=info.tokenizer_mode,
        trust_remote_code=info.trust_remote_code,
        seed=0,
        dtype="auto",
        revision=None,
        hf_overrides=info.hf_overrides,
    )
    vllm_model_cls = MULTIMODAL_REGISTRY._get_model_cls(config)

    # Reference names come from the checkpoint metadata; candidate names come
    # from the HF model instantiated from config.
    checkpoint_weights = create_repo_dummy_weights(model_id)
    hf_model_weights = create_model_dummy_weights(model_id, model_arch)

    mapper: WeightsMapper = vllm_model_cls.hf_to_vllm_mapper
    mapped_checkpoint_weights = mapper.apply(checkpoint_weights)
    mapped_hf_model_weights = mapper.apply(hf_model_weights)

    ref_weight_names = {entry[0] for entry in mapped_checkpoint_weights}
    weight_names = {entry[0] for entry in mapped_hf_model_weights}

    weights_missing = ref_weight_names - weight_names
    weights_unmapped = weight_names - ref_weight_names
    assert not (weights_missing or weights_unmapped), (
        f"Following weights are not mapped correctly: {weights_unmapped}, "
        f"Missing expected weights: {weights_missing}.")
tests/models/quantization/test_gguf.py
View file @
99324e25
...
@@ -79,11 +79,11 @@ DOLPHIN_CONFIG = GGUFTestConfig(
...
@@ -79,11 +79,11 @@ DOLPHIN_CONFIG = GGUFTestConfig(
)
)
MODELS
=
[
MODELS
=
[
LLAMA_CONFIG
,
#
LLAMA_CONFIG,
# broken: https://github.com/vllm-project/vllm/issues/19458
QWEN2_CONFIG
,
QWEN2_CONFIG
,
PHI3_CONFIG
,
PHI3_CONFIG
,
GPT2_CONFIG
,
GPT2_CONFIG
,
#
STABLELM_CONFIG,
# enable this when v1 support head_size=80
STABLELM_CONFIG
,
DOLPHIN_CONFIG
,
DOLPHIN_CONFIG
,
# STARCODER_CONFIG, # broken
# STARCODER_CONFIG, # broken
]
]
...
...
tests/models/registry.py
View file @
99324e25
...
@@ -70,6 +70,12 @@ class _HfExamplesInfo:
...
@@ -70,6 +70,12 @@ class _HfExamplesInfo:
length that is too large to fit into memory in CI.
length that is too large to fit into memory in CI.
"""
"""
revision
:
Optional
[
str
]
=
None
"""
The specific revision (commit hash, tag, or branch) to use for the model.
If not specified, the default revision will be used.
"""
def
check_transformers_version
(
def
check_transformers_version
(
self
,
self
,
*
,
*
,
...
@@ -156,14 +162,20 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
...
@@ -156,14 +162,20 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code
=
True
),
trust_remote_code
=
True
),
"DeepseekV3ForCausalLM"
:
_HfExamplesInfo
(
"deepseek-ai/DeepSeek-V3"
,
# noqa: E501
"DeepseekV3ForCausalLM"
:
_HfExamplesInfo
(
"deepseek-ai/DeepSeek-V3"
,
# noqa: E501
trust_remote_code
=
True
),
trust_remote_code
=
True
),
"Ernie4_5_ForCausalLM"
:
_HfExamplesInfo
(
"baidu/ERNIE-4.5-0.3B-PT"
,
trust_remote_code
=
True
),
"Ernie4_5_MoeForCausalLM"
:
_HfExamplesInfo
(
"baidu/ERNIE-4.5-21B-A3B-PT"
,
trust_remote_code
=
True
),
"ExaoneForCausalLM"
:
_HfExamplesInfo
(
"LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"
),
# noqa: E501
"ExaoneForCausalLM"
:
_HfExamplesInfo
(
"LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"
),
# noqa: E501
"Fairseq2LlamaForCausalLM"
:
_HfExamplesInfo
(
"mgleize/fairseq2-dummy-Llama-3.2-1B"
),
# noqa: E501
"Fairseq2LlamaForCausalLM"
:
_HfExamplesInfo
(
"mgleize/fairseq2-dummy-Llama-3.2-1B"
),
# noqa: E501
"FalconForCausalLM"
:
_HfExamplesInfo
(
"tiiuae/falcon-7b"
),
"FalconForCausalLM"
:
_HfExamplesInfo
(
"tiiuae/falcon-7b"
),
"FalconH1ForCausalLM"
:
_HfExamplesInfo
(
"tiiuae/Falcon-H1-
1
.5B-
Instruct
"
,
"FalconH1ForCausalLM"
:
_HfExamplesInfo
(
"tiiuae/Falcon-H1-
0
.5B-
Base
"
,
min_transformers_version
=
"4.53"
),
min_transformers_version
=
"4.53"
),
"GemmaForCausalLM"
:
_HfExamplesInfo
(
"google/gemma-1.1-2b-it"
),
"GemmaForCausalLM"
:
_HfExamplesInfo
(
"google/gemma-1.1-2b-it"
),
"Gemma2ForCausalLM"
:
_HfExamplesInfo
(
"google/gemma-2-9b"
),
"Gemma2ForCausalLM"
:
_HfExamplesInfo
(
"google/gemma-2-9b"
),
"Gemma3ForCausalLM"
:
_HfExamplesInfo
(
"google/gemma-3-1b-it"
),
"Gemma3ForCausalLM"
:
_HfExamplesInfo
(
"google/gemma-3-1b-it"
),
"Gemma3nForConditionalGeneration"
:
_HfExamplesInfo
(
"google/gemma-3n-E2B-it"
,
# noqa: E501
min_transformers_version
=
"4.53"
),
"GlmForCausalLM"
:
_HfExamplesInfo
(
"THUDM/glm-4-9b-chat-hf"
),
"GlmForCausalLM"
:
_HfExamplesInfo
(
"THUDM/glm-4-9b-chat-hf"
),
"Glm4ForCausalLM"
:
_HfExamplesInfo
(
"THUDM/GLM-4-9B-0414"
),
"Glm4ForCausalLM"
:
_HfExamplesInfo
(
"THUDM/GLM-4-9B-0414"
),
"GPT2LMHeadModel"
:
_HfExamplesInfo
(
"openai-community/gpt2"
,
"GPT2LMHeadModel"
:
_HfExamplesInfo
(
"openai-community/gpt2"
,
...
@@ -180,6 +192,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
...
@@ -180,6 +192,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"GraniteMoeSharedForCausalLM"
:
_HfExamplesInfo
(
"ibm-research/moe-7b-1b-active-shared-experts"
),
# noqa: E501
"GraniteMoeSharedForCausalLM"
:
_HfExamplesInfo
(
"ibm-research/moe-7b-1b-active-shared-experts"
),
# noqa: E501
"Grok1ModelForCausalLM"
:
_HfExamplesInfo
(
"hpcai-tech/grok-1"
,
"Grok1ModelForCausalLM"
:
_HfExamplesInfo
(
"hpcai-tech/grok-1"
,
trust_remote_code
=
True
),
trust_remote_code
=
True
),
"HunYuanMoEV1ForCausalLM"
:
_HfExamplesInfo
(
"tencent/Hunyuan-A13B-Instruct"
,
trust_remote_code
=
True
),
"InternLMForCausalLM"
:
_HfExamplesInfo
(
"internlm/internlm-chat-7b"
,
"InternLMForCausalLM"
:
_HfExamplesInfo
(
"internlm/internlm-chat-7b"
,
trust_remote_code
=
True
),
trust_remote_code
=
True
),
"InternLM2ForCausalLM"
:
_HfExamplesInfo
(
"internlm/internlm2-chat-7b"
,
"InternLM2ForCausalLM"
:
_HfExamplesInfo
(
"internlm/internlm2-chat-7b"
,
...
@@ -193,7 +207,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
...
@@ -193,7 +207,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
extras
=
{
"tiny"
:
"ai21labs/Jamba-tiny-dev"
}),
# noqa: E501
extras
=
{
"tiny"
:
"ai21labs/Jamba-tiny-dev"
}),
# noqa: E501
"LlamaForCausalLM"
:
_HfExamplesInfo
(
"meta-llama/Llama-3.2-1B-Instruct"
,
"LlamaForCausalLM"
:
_HfExamplesInfo
(
"meta-llama/Llama-3.2-1B-Instruct"
,
extras
=
{
"guard"
:
"meta-llama/Llama-Guard-3-1B"
,
# noqa: E501
extras
=
{
"guard"
:
"meta-llama/Llama-Guard-3-1B"
,
# noqa: E501
"hermes"
:
"NousResearch/Hermes-3-Llama-3.1-8B"
}),
# noqa: E501
"hermes"
:
"NousResearch/Hermes-3-Llama-3.1-8B"
,
# noqa: E501
"fp8"
:
"RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"
}),
# noqa: E501
"LLaMAForCausalLM"
:
_HfExamplesInfo
(
"decapoda-research/llama-7b-hf"
,
"LLaMAForCausalLM"
:
_HfExamplesInfo
(
"decapoda-research/llama-7b-hf"
,
is_available_online
=
False
),
is_available_online
=
False
),
"MambaForCausalLM"
:
_HfExamplesInfo
(
"state-spaces/mamba-130m-hf"
),
"MambaForCausalLM"
:
_HfExamplesInfo
(
"state-spaces/mamba-130m-hf"
),
...
@@ -204,7 +219,10 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
...
@@ -204,7 +219,10 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"MiniCPM3ForCausalLM"
:
_HfExamplesInfo
(
"openbmb/MiniCPM3-4B"
,
"MiniCPM3ForCausalLM"
:
_HfExamplesInfo
(
"openbmb/MiniCPM3-4B"
,
trust_remote_code
=
True
),
trust_remote_code
=
True
),
"MiniMaxText01ForCausalLM"
:
_HfExamplesInfo
(
"MiniMaxAI/MiniMax-Text-01"
,
"MiniMaxText01ForCausalLM"
:
_HfExamplesInfo
(
"MiniMaxAI/MiniMax-Text-01"
,
trust_remote_code
=
True
),
trust_remote_code
=
True
,
revision
=
"a59aa9cbc53b9fb8742ca4e9e1531b9802b6fdc3"
),
# noqa: E501
"MiniMaxM1ForCausalLM"
:
_HfExamplesInfo
(
"MiniMaxAI/MiniMax-M1-40k"
,
trust_remote_code
=
True
),
"MistralForCausalLM"
:
_HfExamplesInfo
(
"mistralai/Mistral-7B-Instruct-v0.1"
),
"MistralForCausalLM"
:
_HfExamplesInfo
(
"mistralai/Mistral-7B-Instruct-v0.1"
),
"MixtralForCausalLM"
:
_HfExamplesInfo
(
"mistralai/Mixtral-8x7B-Instruct-v0.1"
,
# noqa: E501
"MixtralForCausalLM"
:
_HfExamplesInfo
(
"mistralai/Mixtral-8x7B-Instruct-v0.1"
,
# noqa: E501
{
"tiny"
:
"TitanML/tiny-mixtral"
}),
# noqa: E501
{
"tiny"
:
"TitanML/tiny-mixtral"
}),
# noqa: E501
...
@@ -222,8 +240,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
...
@@ -222,8 +240,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"OrionForCausalLM"
:
_HfExamplesInfo
(
"OrionStarAI/Orion-14B-Chat"
,
"OrionForCausalLM"
:
_HfExamplesInfo
(
"OrionStarAI/Orion-14B-Chat"
,
trust_remote_code
=
True
),
trust_remote_code
=
True
),
"PersimmonForCausalLM"
:
_HfExamplesInfo
(
"adept/persimmon-8b-chat"
),
"PersimmonForCausalLM"
:
_HfExamplesInfo
(
"adept/persimmon-8b-chat"
),
"PhiForCausalLM"
:
_HfExamplesInfo
(
"microsoft/phi-2"
,
v0_only
=
True
),
"PhiForCausalLM"
:
_HfExamplesInfo
(
"microsoft/phi-2"
),
"Phi3ForCausalLM"
:
_HfExamplesInfo
(
"microsoft/Phi-3-mini-4k-instruct"
),
"Phi3ForCausalLM"
:
_HfExamplesInfo
(
"microsoft/Phi-3-mini-4k-instruct"
),
# Blocksparse attention not supported in V1 yet
"Phi3SmallForCausalLM"
:
_HfExamplesInfo
(
"microsoft/Phi-3-small-8k-instruct"
,
"Phi3SmallForCausalLM"
:
_HfExamplesInfo
(
"microsoft/Phi-3-small-8k-instruct"
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
v0_only
=
True
),
v0_only
=
True
),
...
@@ -238,11 +257,10 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
...
@@ -238,11 +257,10 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"Qwen2MoeForCausalLM"
:
_HfExamplesInfo
(
"Qwen/Qwen1.5-MoE-A2.7B-Chat"
),
"Qwen2MoeForCausalLM"
:
_HfExamplesInfo
(
"Qwen/Qwen1.5-MoE-A2.7B-Chat"
),
"Qwen3ForCausalLM"
:
_HfExamplesInfo
(
"Qwen/Qwen3-8B"
),
"Qwen3ForCausalLM"
:
_HfExamplesInfo
(
"Qwen/Qwen3-8B"
),
"Qwen3MoeForCausalLM"
:
_HfExamplesInfo
(
"Qwen/Qwen3-30B-A3B"
),
"Qwen3MoeForCausalLM"
:
_HfExamplesInfo
(
"Qwen/Qwen3-30B-A3B"
),
"Qwen3ForSequenceClassification"
:
_HfExamplesInfo
(
"tomaarsen/Qwen3-Reranker-0.6B-seq-cls"
),
# noqa: E501
"RWForCausalLM"
:
_HfExamplesInfo
(
"tiiuae/falcon-40b"
),
"RWForCausalLM"
:
_HfExamplesInfo
(
"tiiuae/falcon-40b"
),
"StableLMEpochForCausalLM"
:
_HfExamplesInfo
(
"stabilityai/stablelm-zephyr-3b"
,
# noqa: E501
"StableLMEpochForCausalLM"
:
_HfExamplesInfo
(
"stabilityai/stablelm-zephyr-3b"
),
# noqa: E501
v0_only
=
True
),
"StableLmForCausalLM"
:
_HfExamplesInfo
(
"stabilityai/stablelm-3b-4e1t"
),
"StableLmForCausalLM"
:
_HfExamplesInfo
(
"stabilityai/stablelm-3b-4e1t"
,
v0_only
=
True
),
"Starcoder2ForCausalLM"
:
_HfExamplesInfo
(
"bigcode/starcoder2-3b"
),
"Starcoder2ForCausalLM"
:
_HfExamplesInfo
(
"bigcode/starcoder2-3b"
),
"SolarForCausalLM"
:
_HfExamplesInfo
(
"upstage/solar-pro-preview-instruct"
),
"SolarForCausalLM"
:
_HfExamplesInfo
(
"upstage/solar-pro-preview-instruct"
),
"TeleChat2ForCausalLM"
:
_HfExamplesInfo
(
"Tele-AI/TeleChat2-3B"
,
"TeleChat2ForCausalLM"
:
_HfExamplesInfo
(
"Tele-AI/TeleChat2-3B"
,
...
@@ -255,6 +273,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
...
@@ -255,6 +273,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"Zamba2ForCausalLM"
:
_HfExamplesInfo
(
"Zyphra/Zamba2-7B-instruct"
),
"Zamba2ForCausalLM"
:
_HfExamplesInfo
(
"Zyphra/Zamba2-7B-instruct"
),
"MiMoForCausalLM"
:
_HfExamplesInfo
(
"XiaomiMiMo/MiMo-7B-RL"
,
"MiMoForCausalLM"
:
_HfExamplesInfo
(
"XiaomiMiMo/MiMo-7B-RL"
,
trust_remote_code
=
True
),
trust_remote_code
=
True
),
"Dots1ForCausalLM"
:
_HfExamplesInfo
(
"rednote-hilab/dots.llm1.inst"
,
min_transformers_version
=
"4.53"
),
# [Encoder-decoder]
# [Encoder-decoder]
"BartModel"
:
_HfExamplesInfo
(
"facebook/bart-base"
),
"BartModel"
:
_HfExamplesInfo
(
"facebook/bart-base"
),
"BartForConditionalGeneration"
:
_HfExamplesInfo
(
"facebook/bart-large-cnn"
),
"BartForConditionalGeneration"
:
_HfExamplesInfo
(
"facebook/bart-large-cnn"
),
...
@@ -262,8 +282,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
...
@@ -262,8 +282,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
_EMBEDDING_EXAMPLE_MODELS
=
{
_EMBEDDING_EXAMPLE_MODELS
=
{
# [Text-only]
# [Text-only]
"BertModel"
:
_HfExamplesInfo
(
"BAAI/bge-base-en-v1.5"
),
"BertModel"
:
_HfExamplesInfo
(
"BAAI/bge-base-en-v1.5"
,
v0_only
=
True
),
"Gemma2Model"
:
_HfExamplesInfo
(
"BAAI/bge-multilingual-gemma2"
),
"Gemma2Model"
:
_HfExamplesInfo
(
"BAAI/bge-multilingual-gemma2"
,
v0_only
=
True
),
# noqa: E501
"GPT2ForSequenceClassification"
:
_HfExamplesInfo
(
"nie3e/sentiment-polish-gpt2-small"
),
# noqa: E501
"GritLM"
:
_HfExamplesInfo
(
"parasail-ai/GritLM-7B-vllm"
),
"GritLM"
:
_HfExamplesInfo
(
"parasail-ai/GritLM-7B-vllm"
),
"GteModel"
:
_HfExamplesInfo
(
"Snowflake/snowflake-arctic-embed-m-v2.0"
,
"GteModel"
:
_HfExamplesInfo
(
"Snowflake/snowflake-arctic-embed-m-v2.0"
,
trust_remote_code
=
True
),
trust_remote_code
=
True
),
...
@@ -276,16 +297,16 @@ _EMBEDDING_EXAMPLE_MODELS = {
...
@@ -276,16 +297,16 @@ _EMBEDDING_EXAMPLE_MODELS = {
"LlamaModel"
:
_HfExamplesInfo
(
"llama"
,
is_available_online
=
False
),
"LlamaModel"
:
_HfExamplesInfo
(
"llama"
,
is_available_online
=
False
),
"MistralModel"
:
_HfExamplesInfo
(
"intfloat/e5-mistral-7b-instruct"
),
"MistralModel"
:
_HfExamplesInfo
(
"intfloat/e5-mistral-7b-instruct"
),
"ModernBertModel"
:
_HfExamplesInfo
(
"Alibaba-NLP/gte-modernbert-base"
,
"ModernBertModel"
:
_HfExamplesInfo
(
"Alibaba-NLP/gte-modernbert-base"
,
trust_remote_code
=
True
),
trust_remote_code
=
True
,
v0_only
=
True
),
"NomicBertModel"
:
_HfExamplesInfo
(
"nomic-ai/nomic-embed-text-v2-moe"
,
"NomicBertModel"
:
_HfExamplesInfo
(
"nomic-ai/nomic-embed-text-v2-moe"
,
trust_remote_code
=
True
)
,
trust_remote_code
=
True
,
v0_only
=
True
),
# noqa: E501
"Qwen2Model"
:
_HfExamplesInfo
(
"ssmits/Qwen2-7B-Instruct-embed-base"
),
"Qwen2Model"
:
_HfExamplesInfo
(
"ssmits/Qwen2-7B-Instruct-embed-base"
),
"Qwen2ForRewardModel"
:
_HfExamplesInfo
(
"Qwen/Qwen2.5-Math-RM-72B"
),
"Qwen2ForRewardModel"
:
_HfExamplesInfo
(
"Qwen/Qwen2.5-Math-RM-72B"
),
"Qwen2ForProcessRewardModel"
:
_HfExamplesInfo
(
"Qwen/Qwen2.5-Math-PRM-7B"
),
"Qwen2ForProcessRewardModel"
:
_HfExamplesInfo
(
"Qwen/Qwen2.5-Math-PRM-7B"
),
"Qwen2ForSequenceClassification"
:
_HfExamplesInfo
(
"jason9693/Qwen2.5-1.5B-apeach"
),
# noqa: E501
"Qwen2ForSequenceClassification"
:
_HfExamplesInfo
(
"jason9693/Qwen2.5-1.5B-apeach"
),
# noqa: E501
"RobertaModel"
:
_HfExamplesInfo
(
"sentence-transformers/stsb-roberta-base-v2"
),
# noqa: E501
"RobertaModel"
:
_HfExamplesInfo
(
"sentence-transformers/stsb-roberta-base-v2"
,
v0_only
=
True
),
# noqa: E501
"RobertaForMaskedLM"
:
_HfExamplesInfo
(
"sentence-transformers/all-roberta-large-v1"
),
# noqa: E501
"RobertaForMaskedLM"
:
_HfExamplesInfo
(
"sentence-transformers/all-roberta-large-v1"
,
v0_only
=
True
),
# noqa: E501
"XLMRobertaModel"
:
_HfExamplesInfo
(
"intfloat/multilingual-e5-small"
)
,
"XLMRobertaModel"
:
_HfExamplesInfo
(
"intfloat/multilingual-e5-small"
,
v0_only
=
True
),
# noqa: E501
# [Multimodal]
# [Multimodal]
"LlavaNextForConditionalGeneration"
:
_HfExamplesInfo
(
"royokong/e5-v"
),
"LlavaNextForConditionalGeneration"
:
_HfExamplesInfo
(
"royokong/e5-v"
),
"Phi3VForCausalLM"
:
_HfExamplesInfo
(
"TIGER-Lab/VLM2Vec-Full"
,
"Phi3VForCausalLM"
:
_HfExamplesInfo
(
"TIGER-Lab/VLM2Vec-Full"
,
...
@@ -297,10 +318,10 @@ _EMBEDDING_EXAMPLE_MODELS = {
...
@@ -297,10 +318,10 @@ _EMBEDDING_EXAMPLE_MODELS = {
_CROSS_ENCODER_EXAMPLE_MODELS
=
{
_CROSS_ENCODER_EXAMPLE_MODELS
=
{
# [Text-only]
# [Text-only]
"BertForSequenceClassification"
:
_HfExamplesInfo
(
"cross-encoder/ms-marco-MiniLM-L-6-v2"
),
# noqa: E501
"BertForSequenceClassification"
:
_HfExamplesInfo
(
"cross-encoder/ms-marco-MiniLM-L-6-v2"
,
v0_only
=
True
),
# noqa: E501
"RobertaForSequenceClassification"
:
_HfExamplesInfo
(
"cross-encoder/quora-roberta-base"
),
# noqa: E501
"RobertaForSequenceClassification"
:
_HfExamplesInfo
(
"cross-encoder/quora-roberta-base"
,
v0_only
=
True
),
# noqa: E501
"XLMRobertaForSequenceClassification"
:
_HfExamplesInfo
(
"BAAI/bge-reranker-v2-m3"
),
# noqa: E501
"XLMRobertaForSequenceClassification"
:
_HfExamplesInfo
(
"BAAI/bge-reranker-v2-m3"
,
v0_only
=
True
),
# noqa: E501
"ModernBertForSequenceClassification"
:
_HfExamplesInfo
(
"Alibaba-NLP/gte-reranker-modernbert-base"
),
# noqa: E501
"ModernBertForSequenceClassification"
:
_HfExamplesInfo
(
"Alibaba-NLP/gte-reranker-modernbert-base"
,
v0_only
=
True
),
# noqa: E501
}
}
_MULTIMODAL_EXAMPLE_MODELS
=
{
_MULTIMODAL_EXAMPLE_MODELS
=
{
...
@@ -308,8 +329,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -308,8 +329,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"AriaForConditionalGeneration"
:
_HfExamplesInfo
(
"rhymes-ai/Aria"
),
"AriaForConditionalGeneration"
:
_HfExamplesInfo
(
"rhymes-ai/Aria"
),
"AyaVisionForConditionalGeneration"
:
_HfExamplesInfo
(
"CohereForAI/aya-vision-8b"
),
# noqa: E501
"AyaVisionForConditionalGeneration"
:
_HfExamplesInfo
(
"CohereForAI/aya-vision-8b"
),
# noqa: E501
"Blip2ForConditionalGeneration"
:
_HfExamplesInfo
(
"Salesforce/blip2-opt-2.7b"
,
# noqa: E501
"Blip2ForConditionalGeneration"
:
_HfExamplesInfo
(
"Salesforce/blip2-opt-2.7b"
,
# noqa: E501
extras
=
{
"6b"
:
"Salesforce/blip2-opt-6.7b"
},
# noqa: E501
extras
=
{
"6b"
:
"Salesforce/blip2-opt-6.7b"
}),
# noqa: E501
v0_only
=
True
),
"ChameleonForConditionalGeneration"
:
_HfExamplesInfo
(
"facebook/chameleon-7b"
),
# noqa: E501
"ChameleonForConditionalGeneration"
:
_HfExamplesInfo
(
"facebook/chameleon-7b"
),
# noqa: E501
"DeepseekVLV2ForCausalLM"
:
_HfExamplesInfo
(
"deepseek-ai/deepseek-vl2-tiny"
,
# noqa: E501
"DeepseekVLV2ForCausalLM"
:
_HfExamplesInfo
(
"deepseek-ai/deepseek-vl2-tiny"
,
# noqa: E501
extras
=
{
"fork"
:
"Isotr0py/deepseek-vl2-tiny"
},
# noqa: E501
extras
=
{
"fork"
:
"Isotr0py/deepseek-vl2-tiny"
},
# noqa: E501
...
@@ -322,6 +342,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -322,6 +342,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"GLM4VForCausalLM"
:
_HfExamplesInfo
(
"THUDM/glm-4v-9b"
,
"GLM4VForCausalLM"
:
_HfExamplesInfo
(
"THUDM/glm-4v-9b"
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
hf_overrides
=
{
"architectures"
:
[
"GLM4VForCausalLM"
]}),
# noqa: E501
hf_overrides
=
{
"architectures"
:
[
"GLM4VForCausalLM"
]}),
# noqa: E501
"Glm4vForConditionalGeneration"
:
_HfExamplesInfo
(
"THUDM/GLM-4.1V-9B-Thinking"
,
min_transformers_version
=
"4.53"
),
# noqa: E501
"H2OVLChatModel"
:
_HfExamplesInfo
(
"h2oai/h2ovl-mississippi-800m"
,
"H2OVLChatModel"
:
_HfExamplesInfo
(
"h2oai/h2ovl-mississippi-800m"
,
extras
=
{
"2b"
:
"h2oai/h2ovl-mississippi-2b"
},
# noqa: E501
extras
=
{
"2b"
:
"h2oai/h2ovl-mississippi-2b"
},
# noqa: E501
max_transformers_version
=
"4.48"
,
# noqa: E501
max_transformers_version
=
"4.48"
,
# noqa: E501
...
@@ -332,10 +353,11 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -332,10 +353,11 @@ _MULTIMODAL_EXAMPLE_MODELS = {
trust_remote_code
=
True
),
trust_remote_code
=
True
),
"Idefics3ForConditionalGeneration"
:
_HfExamplesInfo
(
"HuggingFaceM4/Idefics3-8B-Llama3"
,
# noqa: E501
"Idefics3ForConditionalGeneration"
:
_HfExamplesInfo
(
"HuggingFaceM4/Idefics3-8B-Llama3"
,
# noqa: E501
{
"tiny"
:
"HuggingFaceTB/SmolVLM-256M-Instruct"
}),
# noqa: E501
{
"tiny"
:
"HuggingFaceTB/SmolVLM-256M-Instruct"
}),
# noqa: E501
"KeyeForConditionalGeneration"
:
_HfExamplesInfo
(
"Kwai-Keye/Keye-VL-8B-Preview"
,
# noqa: E501
trust_remote_code
=
True
),
"KimiVLForConditionalGeneration"
:
_HfExamplesInfo
(
"moonshotai/Kimi-VL-A3B-Instruct"
,
# noqa: E501
"KimiVLForConditionalGeneration"
:
_HfExamplesInfo
(
"moonshotai/Kimi-VL-A3B-Instruct"
,
# noqa: E501
extras
=
{
"thinking"
:
"moonshotai/Kimi-VL-A3B-Thinking"
},
# noqa: E501
extras
=
{
"thinking"
:
"moonshotai/Kimi-VL-A3B-Thinking"
},
# noqa: E501
trust_remote_code
=
True
,
trust_remote_code
=
True
),
v0_only
=
True
),
"Llama4ForConditionalGeneration"
:
_HfExamplesInfo
(
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
,
# noqa: E501
"Llama4ForConditionalGeneration"
:
_HfExamplesInfo
(
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
,
# noqa: E501
max_model_len
=
10240
),
max_model_len
=
10240
),
"LlavaForConditionalGeneration"
:
_HfExamplesInfo
(
"llava-hf/llava-1.5-7b-hf"
,
"LlavaForConditionalGeneration"
:
_HfExamplesInfo
(
"llava-hf/llava-1.5-7b-hf"
,
...
@@ -394,6 +416,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -394,6 +416,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
trust_remote_code
=
True
),
trust_remote_code
=
True
),
"TarsierForConditionalGeneration"
:
_HfExamplesInfo
(
"omni-research/Tarsier-7b"
,
# noqa: E501
"TarsierForConditionalGeneration"
:
_HfExamplesInfo
(
"omni-research/Tarsier-7b"
,
# noqa: E501
hf_overrides
=
{
"architectures"
:
[
"TarsierForConditionalGeneration"
]}),
# noqa: E501
hf_overrides
=
{
"architectures"
:
[
"TarsierForConditionalGeneration"
]}),
# noqa: E501
"Tarsier2ForConditionalGeneration"
:
_HfExamplesInfo
(
"omni-research/Tarsier2-Recap-7b"
,
# noqa: E501
hf_overrides
=
{
"architectures"
:
[
"Tarsier2ForConditionalGeneration"
]}),
# noqa: E501
# [Encoder-decoder]
# [Encoder-decoder]
# Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
# Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
# Therefore, we borrow the BartTokenizer from the original Bart model
# Therefore, we borrow the BartTokenizer from the original Bart model
...
@@ -472,4 +496,4 @@ class HfExampleModels:
...
@@ -472,4 +496,4 @@ class HfExampleModels:
raise
ValueError
(
f
"No example model defined for
{
model_id
}
"
)
raise
ValueError
(
f
"No example model defined for
{
model_id
}
"
)
HF_EXAMPLE_MODELS
=
HfExampleModels
(
_EXAMPLE_MODELS
)
HF_EXAMPLE_MODELS
=
HfExampleModels
(
_EXAMPLE_MODELS
)
\ No newline at end of file
tests/models/test_initialization.py
View file @
99324e25
...
@@ -22,7 +22,8 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
...
@@ -22,7 +22,8 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
model_info
.
check_transformers_version
(
on_fail
=
"skip"
)
model_info
.
check_transformers_version
(
on_fail
=
"skip"
)
# FIXME: Possible memory leak in the previous tests?
# FIXME: Possible memory leak in the previous tests?
if
model_arch
==
"GraniteSpeechForConditionalGeneration"
:
if
model_arch
in
(
"GraniteSpeechForConditionalGeneration"
,
"KimiVLForConditionalGeneration"
):
pytest
.
skip
(
"Avoid OOM"
)
pytest
.
skip
(
"Avoid OOM"
)
# Avoid OOM and reduce initialization time by only using 1 layer
# Avoid OOM and reduce initialization time by only using 1 layer
...
@@ -31,12 +32,21 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
...
@@ -31,12 +32,21 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
text_config
=
hf_config
.
get_text_config
()
text_config
=
hf_config
.
get_text_config
()
# Ensure at least 2 experts per group
# Since `grouped_topk` assumes top-2
n_group
=
getattr
(
text_config
,
'n_group'
,
None
)
num_experts
=
n_group
*
2
if
n_group
is
not
None
else
2
text_config
.
update
({
text_config
.
update
({
"num_layers"
:
1
,
"num_layers"
:
1
,
"num_hidden_layers"
:
1
,
"num_hidden_layers"
:
1
,
"num_experts"
:
2
,
"num_experts"
:
num_experts
,
"num_experts_per_tok"
:
2
,
"num_experts_per_tok"
:
2
,
"num_local_experts"
:
2
,
"num_local_experts"
:
num_experts
,
# Otherwise there will not be any expert layers
"first_k_dense_replace"
:
0
,
# To avoid OOM on DeepSeek-V3
"n_routed_experts"
:
num_experts
,
})
})
if
hasattr
(
hf_config
,
"vision_config"
):
if
hasattr
(
hf_config
,
"vision_config"
):
...
@@ -80,6 +90,7 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
...
@@ -80,6 +90,7 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
model_info
.
default
,
model_info
.
default
,
tokenizer
=
model_info
.
tokenizer
,
tokenizer
=
model_info
.
tokenizer
,
tokenizer_mode
=
model_info
.
tokenizer_mode
,
tokenizer_mode
=
model_info
.
tokenizer_mode
,
revision
=
model_info
.
revision
,
speculative_config
=
{
speculative_config
=
{
"model"
:
model_info
.
speculative_model
,
"model"
:
model_info
.
speculative_model
,
"num_speculative_tokens"
:
1
,
"num_speculative_tokens"
:
1
,
...
...
tests/models/test_oot_registration.py
View file @
99324e25
...
@@ -53,7 +53,9 @@ def test_oot_registration_embedding(
...
@@ -53,7 +53,9 @@ def test_oot_registration_embedding(
with
monkeypatch
.
context
()
as
m
:
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_PLUGINS"
,
"register_dummy_model"
)
m
.
setenv
(
"VLLM_PLUGINS"
,
"register_dummy_model"
)
prompts
=
[
"Hello, my name is"
,
"The text does not matter"
]
prompts
=
[
"Hello, my name is"
,
"The text does not matter"
]
llm
=
LLM
(
model
=
dummy_gemma2_embedding_path
,
load_format
=
"dummy"
)
llm
=
LLM
(
model
=
dummy_gemma2_embedding_path
,
load_format
=
"dummy"
,
max_model_len
=
2048
)
outputs
=
llm
.
embed
(
prompts
)
outputs
=
llm
.
embed
(
prompts
)
for
output
in
outputs
:
for
output
in
outputs
:
...
...
tests/models/test_registry.py
View file @
99324e25
...
@@ -9,9 +9,9 @@ import torch.cuda
...
@@ -9,9 +9,9 @@ import torch.cuda
from
vllm.model_executor.models
import
(
is_pooling_model
,
from
vllm.model_executor.models
import
(
is_pooling_model
,
is_text_generation_model
,
is_text_generation_model
,
supports_multimodal
)
supports_multimodal
)
from
vllm.model_executor.models.adapters
import
(
as_
classification
_model
,
from
vllm.model_executor.models.adapters
import
(
as_
embedding
_model
,
as_
embedding
_model
,
as_
reward
_model
,
as_
reward
_model
)
as_
seq_cls
_model
)
from
vllm.model_executor.models.registry
import
(
_MULTIMODAL_MODELS
,
from
vllm.model_executor.models.registry
import
(
_MULTIMODAL_MODELS
,
_SPECULATIVE_DECODING_MODELS
,
_SPECULATIVE_DECODING_MODELS
,
_TEXT_GENERATION_MODELS
,
_TEXT_GENERATION_MODELS
,
...
@@ -43,7 +43,7 @@ def test_registry_imports(model_arch):
...
@@ -43,7 +43,7 @@ def test_registry_imports(model_arch):
assert
is_text_generation_model
(
model_cls
)
assert
is_text_generation_model
(
model_cls
)
# All vLLM models should be convertible to a pooling model
# All vLLM models should be convertible to a pooling model
assert
is_pooling_model
(
as_
classification
_model
(
model_cls
))
assert
is_pooling_model
(
as_
seq_cls
_model
(
model_cls
))
assert
is_pooling_model
(
as_embedding_model
(
model_cls
))
assert
is_pooling_model
(
as_embedding_model
(
model_cls
))
assert
is_pooling_model
(
as_reward_model
(
model_cls
))
assert
is_pooling_model
(
as_reward_model
(
model_cls
))
...
...
tests/models/utils.py
View file @
99324e25
...
@@ -336,3 +336,10 @@ class EmbedModelInfo(NamedTuple):
...
@@ -336,3 +336,10 @@ class EmbedModelInfo(NamedTuple):
architecture
:
str
=
""
architecture
:
str
=
""
dtype
:
str
=
"auto"
dtype
:
str
=
"auto"
enable_test
:
bool
=
True
enable_test
:
bool
=
True
class
RerankModelInfo
(
NamedTuple
):
name
:
str
architecture
:
str
=
""
dtype
:
str
=
"auto"
enable_test
:
bool
=
True
tests/mq_llm_engine/test_error_handling.py
View file @
99324e25
...
@@ -66,7 +66,7 @@ async def test_evil_forward(tmp_socket):
...
@@ -66,7 +66,7 @@ async def test_evil_forward(tmp_socket):
with
pytest
.
raises
(
MQEngineDeadError
):
with
pytest
.
raises
(
MQEngineDeadError
):
async
for
_
in
client
.
generate
(
prompt
=
"Hello my name is"
,
async
for
_
in
client
.
generate
(
prompt
=
"Hello my name is"
,
sampling_params
=
SamplingParams
(),
sampling_params
=
SamplingParams
(),
request_id
=
uuid
.
uuid4
()):
request_id
=
str
(
uuid
.
uuid4
())
)
:
pass
pass
assert
client
.
errored
assert
client
.
errored
...
@@ -115,7 +115,7 @@ async def test_failed_health_check(tmp_socket):
...
@@ -115,7 +115,7 @@ async def test_failed_health_check(tmp_socket):
with
pytest
.
raises
(
MQEngineDeadError
):
with
pytest
.
raises
(
MQEngineDeadError
):
async
for
_
in
client
.
generate
(
prompt
=
"Hello my name is"
,
async
for
_
in
client
.
generate
(
prompt
=
"Hello my name is"
,
sampling_params
=
SamplingParams
(),
sampling_params
=
SamplingParams
(),
request_id
=
uuid
.
uuid4
()):
request_id
=
str
(
uuid
.
uuid4
())
)
:
pass
pass
client
.
close
()
client
.
close
()
...
@@ -157,7 +157,7 @@ async def test_failed_abort(tmp_socket):
...
@@ -157,7 +157,7 @@ async def test_failed_abort(tmp_socket):
async
for
_
in
client
.
generate
(
async
for
_
in
client
.
generate
(
prompt
=
"Hello my name is"
,
prompt
=
"Hello my name is"
,
sampling_params
=
SamplingParams
(
max_tokens
=
10
),
sampling_params
=
SamplingParams
(
max_tokens
=
10
),
request_id
=
uuid
.
uuid4
()):
request_id
=
str
(
uuid
.
uuid4
())
)
:
pass
pass
assert
"KeyError"
in
repr
(
execinfo
.
value
)
assert
"KeyError"
in
repr
(
execinfo
.
value
)
assert
client
.
errored
assert
client
.
errored
...
@@ -189,7 +189,7 @@ async def test_batch_error(tmp_socket):
...
@@ -189,7 +189,7 @@ async def test_batch_error(tmp_socket):
params
=
SamplingParams
(
min_tokens
=
2048
,
max_tokens
=
2048
)
params
=
SamplingParams
(
min_tokens
=
2048
,
max_tokens
=
2048
)
async
for
_
in
client
.
generate
(
prompt
=
"Hello my name is"
,
async
for
_
in
client
.
generate
(
prompt
=
"Hello my name is"
,
sampling_params
=
params
,
sampling_params
=
params
,
request_id
=
uuid
.
uuid4
()):
request_id
=
str
(
uuid
.
uuid4
())
)
:
pass
pass
tasks
=
[
asyncio
.
create_task
(
do_generate
(
client
))
for
_
in
range
(
10
)]
tasks
=
[
asyncio
.
create_task
(
do_generate
(
client
))
for
_
in
range
(
10
)]
...
@@ -289,7 +289,7 @@ async def test_engine_process_death(tmp_socket):
...
@@ -289,7 +289,7 @@ async def test_engine_process_death(tmp_socket):
with
pytest
.
raises
(
MQEngineDeadError
):
with
pytest
.
raises
(
MQEngineDeadError
):
async
for
_
in
client
.
generate
(
prompt
=
"Hello my name is"
,
async
for
_
in
client
.
generate
(
prompt
=
"Hello my name is"
,
sampling_params
=
SamplingParams
(),
sampling_params
=
SamplingParams
(),
request_id
=
uuid
.
uuid4
()):
request_id
=
str
(
uuid
.
uuid4
())
)
:
pass
pass
# And the health check should show the engine is dead
# And the health check should show the engine is dead
...
...
tests/multi_step/test_correctness_llm.py
View file @
99324e25
...
@@ -8,6 +8,7 @@ from typing import Optional
...
@@ -8,6 +8,7 @@ from typing import Optional
import
pytest
import
pytest
from
vllm.platforms
import
current_platform
from
vllm.utils
import
STR_BACKEND_ENV_VAR
from
vllm.utils
import
STR_BACKEND_ENV_VAR
from
..models.utils
import
check_logprobs_close
,
check_outputs_equal
from
..models.utils
import
check_logprobs_close
,
check_outputs_equal
...
@@ -71,6 +72,12 @@ def test_multi_step_llm(
...
@@ -71,6 +72,12 @@ def test_multi_step_llm(
num_logprobs: corresponds to the `logprobs` argument to the OpenAI
num_logprobs: corresponds to the `logprobs` argument to the OpenAI
completions endpoint; `None` -> 1 logprob returned.
completions endpoint; `None` -> 1 logprob returned.
"""
"""
if
current_platform
.
is_rocm
()
and
\
(
attention_backend
==
"FLASHINFER"
or
enable_chunked_prefill
):
pytest
.
skip
(
"Multi-Step with FLASHINFER or Chunked-Prefill is not supported"
"on ROCm"
)
with
monkeypatch
.
context
()
as
m
:
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
STR_BACKEND_ENV_VAR
,
attention_backend
)
m
.
setenv
(
STR_BACKEND_ENV_VAR
,
attention_backend
)
...
@@ -221,6 +228,9 @@ def test_multi_step_llm_w_prompt_logprobs(
...
@@ -221,6 +228,9 @@ def test_multi_step_llm_w_prompt_logprobs(
@
pytest
.
mark
.
parametrize
(
"num_prompts"
,
NUM_PROMPTS
)
@
pytest
.
mark
.
parametrize
(
"num_prompts"
,
NUM_PROMPTS
)
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
None
,
5
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
None
,
5
])
@
pytest
.
mark
.
parametrize
(
"attention_backend"
,
[
"FLASH_ATTN"
])
@
pytest
.
mark
.
parametrize
(
"attention_backend"
,
[
"FLASH_ATTN"
])
@
pytest
.
mark
.
skipif
(
current_platform
.
is_rocm
(),
reason
=
"Multi-Step + Chunked-Prefill not supported on ROCm"
)
def
test_multi_step_llm_chunked_prefill_prefix_cache
(
def
test_multi_step_llm_chunked_prefill_prefix_cache
(
vllm_runner
,
vllm_runner
,
example_prompts
,
example_prompts
,
...
...
tests/multimodal/test_hasher.py
View file @
99324e25
...
@@ -60,3 +60,15 @@ def test_hash_collision_array_shape():
...
@@ -60,3 +60,15 @@ def test_hash_collision_array_shape():
hasher
=
MultiModalHasher
hasher
=
MultiModalHasher
assert
hasher
.
hash_kwargs
(
data
=
arr1
)
!=
hasher
.
hash_kwargs
(
data
=
arr2
)
assert
hasher
.
hash_kwargs
(
data
=
arr1
)
!=
hasher
.
hash_kwargs
(
data
=
arr2
)
def
test_hash_non_contiguous_array
():
arr
=
np
.
arange
(
24
).
reshape
(
4
,
6
).
T
assert
not
arr
.
flags
.
c_contiguous
arr_c
=
np
.
ascontiguousarray
(
arr
)
assert
arr_c
.
flags
.
c_contiguous
hasher
=
MultiModalHasher
# Both should be hashable and produce the same hashes
assert
hasher
.
hash_kwargs
(
data
=
arr
)
==
hasher
.
hash_kwargs
(
data
=
arr_c
)
tests/multimodal/test_processing.py
View file @
99324e25
...
@@ -1086,6 +1086,7 @@ def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs):
...
@@ -1086,6 +1086,7 @@ def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs):
prompt
=
""
,
prompt
=
""
,
mm_data
=
{},
mm_data
=
{},
mm_kwargs
=
call_kwargs
,
mm_kwargs
=
call_kwargs
,
tok_kwargs
=
{},
)
)
assert
out_kwargs
==
expected_kwargs
assert
out_kwargs
==
expected_kwargs
tests/multimodal/test_utils.py
View file @
99324e25
...
@@ -167,12 +167,15 @@ async def test_fetch_image_error_conversion():
...
@@ -167,12 +167,15 @@ async def test_fetch_image_error_conversion():
@
pytest
.
mark
.
parametrize
(
"video_url"
,
TEST_VIDEO_URLS
)
@
pytest
.
mark
.
parametrize
(
"video_url"
,
TEST_VIDEO_URLS
)
@
pytest
.
mark
.
parametrize
(
"num_frames"
,
[
-
1
,
32
,
1800
])
@
pytest
.
mark
.
parametrize
(
"num_frames"
,
[
-
1
,
32
,
1800
])
async
def
test_fetch_video_http
(
video_url
:
str
,
num_frames
:
int
):
async
def
test_fetch_video_http
(
video_url
:
str
,
num_frames
:
int
):
connector
=
MediaConnector
()
connector
=
MediaConnector
(
media_io_kwargs
=
{
"video"
:
{
"num_frames"
:
num_frames
,
}})
video_sync
=
connector
.
fetch_video
(
video_url
,
num_frames
=
num_frames
)
video_sync
,
metadata_sync
=
connector
.
fetch_video
(
video_url
)
video_async
=
await
connector
.
fetch_video_async
(
video_url
,
video_async
,
metadata_async
=
await
connector
.
fetch_video_async
(
video_url
)
num_frames
=
num_frames
)
assert
np
.
array_equal
(
video_sync
,
video_async
)
assert
np
.
array_equal
(
video_sync
,
video_async
)
assert
metadata_sync
==
metadata_async
# Used for the next two tests related to `merge_and_sort_multimodal_metadata`.
# Used for the next two tests related to `merge_and_sort_multimodal_metadata`.
...
...
tests/multimodal/test_video.py
View file @
99324e25
...
@@ -4,7 +4,10 @@ import numpy as np
...
@@ -4,7 +4,10 @@ import numpy as np
import
numpy.typing
as
npt
import
numpy.typing
as
npt
import
pytest
import
pytest
from
vllm.multimodal.video
import
VIDEO_LOADER_REGISTRY
,
VideoLoader
from
vllm
import
envs
from
vllm.multimodal.image
import
ImageMediaIO
from
vllm.multimodal.video
import
(
VIDEO_LOADER_REGISTRY
,
VideoLoader
,
VideoMediaIO
)
NUM_FRAMES
=
10
NUM_FRAMES
=
10
FAKE_OUTPUT_1
=
np
.
random
.
rand
(
NUM_FRAMES
,
1280
,
720
,
3
)
FAKE_OUTPUT_1
=
np
.
random
.
rand
(
NUM_FRAMES
,
1280
,
720
,
3
)
...
@@ -40,3 +43,46 @@ def test_video_loader_registry():
...
@@ -40,3 +43,46 @@ def test_video_loader_registry():
def
test_video_loader_type_doesnt_exist
():
def
test_video_loader_type_doesnt_exist
():
with
pytest
.
raises
(
AssertionError
):
with
pytest
.
raises
(
AssertionError
):
VIDEO_LOADER_REGISTRY
.
load
(
"non_existing_video_loader"
)
VIDEO_LOADER_REGISTRY
.
load
(
"non_existing_video_loader"
)
@
VIDEO_LOADER_REGISTRY
.
register
(
"assert_10_frames_1_fps"
)
class
Assert10Frames1FPSVideoLoader
(
VideoLoader
):
@
classmethod
def
load_bytes
(
cls
,
data
:
bytes
,
num_frames
:
int
=
-
1
,
fps
:
float
=
-
1.0
,
**
kwargs
)
->
npt
.
NDArray
:
assert
num_frames
==
10
,
"bad num_frames"
assert
fps
==
1.0
,
"bad fps"
return
FAKE_OUTPUT_2
def
test_video_media_io_kwargs
():
envs
.
VLLM_VIDEO_LOADER_BACKEND
=
"assert_10_frames_1_fps"
imageio
=
ImageMediaIO
()
# Verify that different args pass/fail assertions as expected.
videoio
=
VideoMediaIO
(
imageio
,
**
{
"num_frames"
:
10
,
"fps"
:
1.0
})
_
=
videoio
.
load_bytes
(
b
"test"
)
videoio
=
VideoMediaIO
(
imageio
,
**
{
"num_frames"
:
10
,
"fps"
:
1.0
,
"not_used"
:
"not_used"
})
_
=
videoio
.
load_bytes
(
b
"test"
)
with
pytest
.
raises
(
AssertionError
,
match
=
"bad num_frames"
):
videoio
=
VideoMediaIO
(
imageio
,
**
{})
_
=
videoio
.
load_bytes
(
b
"test"
)
with
pytest
.
raises
(
AssertionError
,
match
=
"bad num_frames"
):
videoio
=
VideoMediaIO
(
imageio
,
**
{
"num_frames"
:
9
,
"fps"
:
1.0
})
_
=
videoio
.
load_bytes
(
b
"test"
)
with
pytest
.
raises
(
AssertionError
,
match
=
"bad fps"
):
videoio
=
VideoMediaIO
(
imageio
,
**
{
"num_frames"
:
10
,
"fps"
:
2.0
})
_
=
videoio
.
load_bytes
(
b
"test"
)
tests/neuron/1_core/test_prefix_prefill.py
View file @
99324e25
...
@@ -7,6 +7,8 @@ import pytest
...
@@ -7,6 +7,8 @@ import pytest
import
torch
import
torch
import
torch.nn.functional
as
F
import
torch.nn.functional
as
F
from
vllm.utils
import
cdiv
class
BlockDiagonalCausalFromBottomRightMask
:
class
BlockDiagonalCausalFromBottomRightMask
:
...
@@ -398,11 +400,8 @@ def test_contexted_kv_attention(
...
@@ -398,11 +400,8 @@ def test_contexted_kv_attention(
assert
(
large_tile_size
>=
B_P_SIZE
assert
(
large_tile_size
>=
B_P_SIZE
),
f
"Expect
{
large_tile_size
=
}
to be larger than
{
B_P_SIZE
=
}
"
),
f
"Expect
{
large_tile_size
=
}
to be larger than
{
B_P_SIZE
=
}
"
def
ceil_div
(
a
,
b
):
return
(
a
+
b
-
1
)
//
b
def
pad_to_multiple
(
a
,
b
):
def
pad_to_multiple
(
a
,
b
):
return
c
eil_
div
(
a
,
b
)
*
b
return
cdiv
(
a
,
b
)
*
b
def
pad_to_next_power_of_2
(
a
):
def
pad_to_next_power_of_2
(
a
):
assert
a
>
0
assert
a
>
0
...
@@ -411,7 +410,7 @@ def test_contexted_kv_attention(
...
@@ -411,7 +410,7 @@ def test_contexted_kv_attention(
# calculate input shapes
# calculate input shapes
max_num_queries
=
pad_to_next_power_of_2
(
sum
(
query_lens
))
max_num_queries
=
pad_to_next_power_of_2
(
sum
(
query_lens
))
context_lens
=
torch
.
tensor
(
seq_lens
)
-
torch
.
tensor
(
query_lens
)
context_lens
=
torch
.
tensor
(
seq_lens
)
-
torch
.
tensor
(
query_lens
)
num_active_blocks
=
c
eil_
div
(
context_lens
,
block_size
).
sum
().
item
()
num_active_blocks
=
cdiv
(
context_lens
,
block_size
).
sum
().
item
()
num_active_blocks
=
pad_to_multiple
(
num_active_blocks
,
num_active_blocks
=
pad_to_multiple
(
num_active_blocks
,
large_tile_size
//
block_size
)
large_tile_size
//
block_size
)
context_kv_len
=
num_active_blocks
*
block_size
context_kv_len
=
num_active_blocks
*
block_size
...
...
tests/plugins/vllm_add_dummy_platform/setup.py
View file @
99324e25
...
@@ -10,5 +10,7 @@ setup(
...
@@ -10,5 +10,7 @@ setup(
entry_points
=
{
entry_points
=
{
'vllm.platform_plugins'
:
[
'vllm.platform_plugins'
:
[
"dummy_platform_plugin = vllm_add_dummy_platform:dummy_platform_plugin"
# noqa
"dummy_platform_plugin = vllm_add_dummy_platform:dummy_platform_plugin"
# noqa
]
],
"vllm.general_plugins"
:
[
"dummy_custom_ops = vllm_add_dummy_platform:register_ops"
],
})
})
tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py
View file @
99324e25
...
@@ -6,3 +6,7 @@ from typing import Optional
...
@@ -6,3 +6,7 @@ from typing import Optional
def
dummy_platform_plugin
()
->
Optional
[
str
]:
def
dummy_platform_plugin
()
->
Optional
[
str
]:
return
"vllm_add_dummy_platform.dummy_platform.DummyPlatform"
return
"vllm_add_dummy_platform.dummy_platform.DummyPlatform"
def
register_ops
():
import
vllm_add_dummy_platform.dummy_custom_ops
# noqa
tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py
View file @
99324e25
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
vllm.attention.backends.flash_attn
import
FlashAttentionBackend
from
vllm.attention.backends.placeholder_attn
import
(
PlaceholderAttentionBackend
)
class
DummyAttentionBackend
(
F
la
sh
AttentionBackend
):
class
DummyAttentionBackend
(
P
la
ceholder
AttentionBackend
):
@
staticmethod
@
staticmethod
def
get_name
()
->
str
:
def
get_name
()
->
str
:
...
...
Prev
1
…
15
16
17
18
19
20
21
22
23
24
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment