Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
7a985548
Commit
7a985548
authored
May 22, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.9.0' into v0.9.0-ori
parents
45d3785c
dc1440cf
Changes
486
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
248 additions
and
68 deletions
+248
-68
tests/models/multimodal/processing/test_qwen2_vl.py
tests/models/multimodal/processing/test_qwen2_vl.py
+2
-2
tests/models/multimodal/processing/test_smolvlm.py
tests/models/multimodal/processing/test_smolvlm.py
+2
-2
tests/models/quantization/__init__.py
tests/models/quantization/__init__.py
+0
-0
tests/models/quantization/test_aqlm.py
tests/models/quantization/test_aqlm.py
+4
-7
tests/models/quantization/test_awq.py
tests/models/quantization/test_awq.py
+3
-4
tests/models/quantization/test_bitblas.py
tests/models/quantization/test_bitblas.py
+1
-3
tests/models/quantization/test_fp8.py
tests/models/quantization/test_fp8.py
+9
-6
tests/models/quantization/test_gguf.py
tests/models/quantization/test_gguf.py
+3
-4
tests/models/quantization/test_gptq_bitblas.py
tests/models/quantization/test_gptq_bitblas.py
+1
-3
tests/models/quantization/test_gptq_marlin.py
tests/models/quantization/test_gptq_marlin.py
+7
-6
tests/models/quantization/test_gptq_marlin_24.py
tests/models/quantization/test_gptq_marlin_24.py
+5
-5
tests/models/quantization/test_modelopt.py
tests/models/quantization/test_modelopt.py
+0
-1
tests/models/quantization/test_mxfp4.py
tests/models/quantization/test_mxfp4.py
+40
-0
tests/models/quantization/test_nvfp4.py
tests/models/quantization/test_nvfp4.py
+0
-1
tests/models/registry.py
tests/models/registry.py
+35
-19
tests/models/test_transformers.py
tests/models/test_transformers.py
+9
-4
tests/models/utils.py
tests/models/utils.py
+66
-1
tests/multimodal/assets/image1.png
tests/multimodal/assets/image1.png
+0
-0
tests/multimodal/assets/image2.png
tests/multimodal/assets/image2.png
+0
-0
tests/multimodal/test_hasher.py
tests/multimodal/test_hasher.py
+61
-0
No files found.
Too many changes to show.
To preserve performance only
486 of 486+
files are displayed.
Plain diff
Email patch
tests/models/multimodal/processing/test_qwen2_vl.py
View file @
7a985548
...
...
@@ -4,7 +4,7 @@ import pytest
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
....conftest
import
_
ImageAssets
from
....conftest
import
Image
Test
Assets
from
...utils
import
build_model_context
...
...
@@ -19,7 +19,7 @@ from ...utils import build_model_context
@
pytest
.
mark
.
parametrize
(
"num_imgs"
,
[
1
,
2
])
@
pytest
.
mark
.
parametrize
(
"kwargs_on_init"
,
[
True
,
False
])
def
test_processor_override
(
image_assets
:
_
ImageAssets
,
image_assets
:
Image
Test
Assets
,
model_id
:
str
,
mm_processor_kwargs
:
dict
[
str
,
object
],
expected_toks_per_img
:
int
,
...
...
tests/models/multimodal/processing/test_smolvlm.py
View file @
7a985548
...
...
@@ -5,7 +5,7 @@ from transformers import SmolVLMConfig
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
....conftest
import
_
ImageAssets
from
....conftest
import
Image
Test
Assets
from
...utils
import
build_model_context
...
...
@@ -21,7 +21,7 @@ from ...utils import build_model_context
@
pytest
.
mark
.
parametrize
(
"num_imgs"
,
[
1
,
2
])
@
pytest
.
mark
.
parametrize
(
"kwargs_on_init"
,
[
True
,
False
])
def
test_processor_override
(
image_assets
:
_
ImageAssets
,
image_assets
:
Image
Test
Assets
,
model_id
:
str
,
mm_processor_kwargs
:
dict
[
str
,
object
],
expected_toks_per_img
:
int
,
...
...
tests/models/
embedding/language
/__init__.py
→
tests/models/
quantization
/__init__.py
View file @
7a985548
File moved
tests/models/
decoder_only/language
/test_aqlm.py
→
tests/models/
quantization
/test_aqlm.py
View file @
7a985548
# SPDX-License-Identifier: Apache-2.0
"""Compare the outputs of a AQLM model between vLLM and HF Transformers
Run `pytest tests/models/test_aqlm.py`.
"""
import
pytest
from
tests.quantization.utils
import
is_quant_method_supported
from
vllm.platforms
import
current_platform
# These ground truth generations were generated using `transformers==4.38.1
# aqlm==1.1.0 torch==2.2.0`
...
...
@@ -39,8 +35,9 @@ ground_truth_generations = [
]
@
pytest
.
mark
.
quant_model
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"aqlm"
),
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"aqlm"
)
or
current_platform
.
is_rocm
()
or
not
current_platform
.
is_cuda
(),
reason
=
"AQLM is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
...
...
tests/models/
decoder_only/vision_language
/test_awq.py
→
tests/models/
quantization
/test_awq.py
View file @
7a985548
...
...
@@ -7,8 +7,8 @@ import torch
from
vllm.multimodal.image
import
rescale_image_size
from
...
.
conftest
import
IMAGE_ASSETS
,
VllmRunner
,
_
ImageAssets
from
..
.
utils
import
check_logprobs_close
from
...conftest
import
IMAGE_ASSETS
,
Image
Test
Assets
,
VllmRunner
from
..utils
import
check_logprobs_close
HF_IMAGE_PROMPTS
=
IMAGE_ASSETS
.
prompts
({
"stop_sign"
:
...
...
@@ -20,7 +20,7 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
def
run_awq_test
(
vllm_runner
:
type
[
VllmRunner
],
image_assets
:
_
ImageAssets
,
image_assets
:
Image
Test
Assets
,
source_model
:
str
,
quant_model
:
str
,
*
,
...
...
@@ -85,7 +85,6 @@ def run_awq_test(
)
@
pytest
.
mark
.
quant_model
@
pytest
.
mark
.
parametrize
(
(
"source_model"
,
"quant_model"
),
[(
"OpenGVLab/InternVL2-2B"
,
"OpenGVLab/InternVL2-2B-AWQ"
)],
...
...
tests/models/test_bitblas.py
→
tests/models/
quantization/
test_bitblas.py
View file @
7a985548
...
...
@@ -8,14 +8,12 @@ bitblas/GPTQ models are in the top 3 selections of each other.
Note: bitblas internally uses locks to synchronize the threads. This can
result in very slight nondeterminism for bitblas. As a result, we re-run the
test up to 3 times to see if we pass.
Run `pytest tests/models/test_bitblas.py`.
"""
from
dataclasses
import
dataclass
import
pytest
from
.utils
import
check_logprobs_close
from
.
.utils
import
check_logprobs_close
@
dataclass
...
...
tests/models/
decoder_only/language
/test_fp8.py
→
tests/models/
quantization
/test_fp8.py
View file @
7a985548
...
...
@@ -4,20 +4,15 @@
"""Tests fp8 models against ground truth generation
Note: these tests will only pass on L4 GPU.
"""
import
os
from
typing
import
Optional
import
pytest
from
tests.kernels.utils
import
override_backend_env_variable
from
tests.quantization.utils
import
is_quant_method_supported
from
vllm.platforms
import
current_platform
from
vllm.utils
import
STR_BACKEND_ENV_VAR
from
..
.
utils
import
check_logprobs_close
from
..utils
import
check_logprobs_close
@
pytest
.
mark
.
quant_model
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"fp8"
),
reason
=
"fp8 is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
...
...
@@ -60,6 +55,14 @@ def test_models(
Only checks log probs match to cover the discrepancy in
numerical sensitive kernels.
"""
if
backend
==
"FLASHINFER"
and
current_platform
.
is_rocm
():
pytest
.
skip
(
"Flashinfer does not support ROCm/HIP."
)
if
kv_cache_dtype
==
"fp8_e5m2"
and
current_platform
.
is_rocm
():
pytest
.
skip
(
f
"
{
kv_cache_dtype
}
is currently not supported on ROCm/HIP."
)
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"TOKENIZERS_PARALLELISM"
,
'true'
)
m
.
setenv
(
STR_BACKEND_ENV_VAR
,
backend
)
...
...
tests/models/
decoder_only/language
/test_gguf.py
→
tests/models/
quantization
/test_gguf.py
View file @
7a985548
...
...
@@ -14,9 +14,9 @@ from transformers import AutoTokenizer
from
tests.quantization.utils
import
is_quant_method_supported
from
...
.
conftest
import
VllmRunner
from
...
.
utils
import
multi_gpu_test
from
..
.
utils
import
check_logprobs_close
from
...conftest
import
VllmRunner
from
...utils
import
multi_gpu_test
from
..utils
import
check_logprobs_close
os
.
environ
[
"TOKENIZERS_PARALLELISM"
]
=
"true"
...
...
@@ -38,7 +38,6 @@ LLAMA_CONFIG = GGUFTestConfig(
original_model
=
"meta-llama/Llama-3.2-1B-Instruct"
,
gguf_repo
=
"bartowski/Llama-3.2-1B-Instruct-GGUF"
,
gguf_filename
=
"Llama-3.2-1B-Instruct-IQ4_XS.gguf"
,
marks
=
[
pytest
.
mark
.
quant_model
],
)
QWEN2_CONFIG
=
GGUFTestConfig
(
...
...
tests/models/test_gptq_bitblas.py
→
tests/models/
quantization/
test_gptq_bitblas.py
View file @
7a985548
...
...
@@ -8,14 +8,12 @@ bitblas/GPTQ models are in the top 3 selections of each other.
Note: bitblas internally uses locks to synchronize the threads. This can
result in very slight nondeterminism for bitblas. As a result, we re-run the
test up to 3 times to see if we pass.
Run `pytest tests/models/test_bitblas.py`.
"""
from
dataclasses
import
dataclass
import
pytest
from
.utils
import
check_logprobs_close
from
.
.utils
import
check_logprobs_close
@
dataclass
...
...
tests/models/
decoder_only/language
/test_gptq_marlin.py
→
tests/models/
quantization
/test_gptq_marlin.py
View file @
7a985548
# SPDX-License-Identifier: Apache-2.0
"""Compares the outputs of gptq vs gptq_marlin
"""Compares the outputs of gptq vs gptq_marlin.
Note: GPTQ and Marlin do not have bitwise correctness.
As a result, in this test, we just confirm that the top selected tokens of the
Marlin/GPTQ models are in the top 5 selections of each other.
Note: Marlin internally uses locks to synchronize the threads. This can
result in very slight nondeterminism for Marlin. As a result, we re-run the test
up to 3 times to see if we pass.
Run `pytest tests/models/test_gptq_marlin.py`.
"""
import
os
...
...
@@ -15,8 +14,9 @@ import pytest
from
tests.quantization.utils
import
is_quant_method_supported
from
vllm.model_executor.layers.rotary_embedding
import
_ROPE_DICT
from
vllm.platforms
import
current_platform
from
..
.
utils
import
check_logprobs_close
from
..utils
import
check_logprobs_close
os
.
environ
[
"TOKENIZERS_PARALLELISM"
]
=
"true"
...
...
@@ -34,9 +34,10 @@ MODELS = [
]
@
pytest
.
mark
.
quant_model
@
pytest
.
mark
.
flaky
(
reruns
=
3
)
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"gptq_marlin"
),
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"gptq_marlin"
)
or
current_platform
.
is_rocm
()
or
not
current_platform
.
is_cuda
(),
reason
=
"gptq_marlin is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
,
"bfloat16"
])
...
...
tests/models/
decoder_only/language
/test_gptq_marlin_24.py
→
tests/models/
quantization
/test_gptq_marlin_24.py
View file @
7a985548
...
...
@@ -4,16 +4,15 @@
Note: GPTQ and Marlin_24 do not have bitwise correctness.
As a result, in this test, we just confirm that the top selected tokens of the
Marlin/GPTQ models are in the top 3 selections of each other.
Run `pytest tests/models/test_marlin_24.py`.
"""
from
dataclasses
import
dataclass
import
pytest
from
tests.quantization.utils
import
is_quant_method_supported
from
vllm.platforms
import
current_platform
from
..
.
utils
import
check_logprobs_close
from
..utils
import
check_logprobs_close
@
dataclass
...
...
@@ -39,9 +38,10 @@ model_pairs = [
]
@
pytest
.
mark
.
quant_model
@
pytest
.
mark
.
flaky
(
reruns
=
2
)
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"gptq_marlin_24"
),
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"gptq_marlin_24"
)
or
current_platform
.
is_rocm
()
or
not
current_platform
.
is_cuda
(),
reason
=
"Marlin24 is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"model_pair"
,
model_pairs
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
...
...
tests/models/
decoder_only/language
/test_modelopt.py
→
tests/models/
quantization
/test_modelopt.py
View file @
7a985548
...
...
@@ -40,7 +40,6 @@ EXPECTED_STRS_MAP = {
@
pytest
.
mark
.
skip
(
reason
=
"Prevent unstable test based on golden strings from breaking the build."
)
@
pytest
.
mark
.
quant_model
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"fp8"
),
reason
=
"fp8 is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"model_name"
,
MODELS
)
...
...
tests/models/quantization/test_mxfp4.py
0 → 100644
View file @
7a985548
# SPDX-License-Identifier: Apache-2.0
# flake8: noqa
"""Tests Quark mxfp4 models against ground truth generation
"""
import
pytest
from
vllm
import
LLM
,
SamplingParams
# Model identifiers used to parametrize the test in this module.
MODELS = ["amd/Llama-2-7b-chat-hf-wmxfp4-amxfp4-kvfp8-scale-uint8"]

# Golden generations keyed by model identifier. The list is indexed by prompt
# position, so its order must match the order of the prompts fed to the model.
EXPECTED_STRS_MAP = {
    "amd/Llama-2-7b-chat-hf-wmxfp4-amxfp4-kvfp8-scale-uint8": [
        '\n### Key Features\n\n* **High-throughput Inference**: vLL',
        '\nArtificial intelligence (AI) has evolved significantly since its inception in the 1',
        'Artificial intelligence (AI) and human intelligence (HI) are two distinct concepts that have been',
        'A neural network is a machine learning model inspired by the structure of the human brain. It consists of',
        '\nTitle: The Dreaming Robot\n\nAs the sun set on the bustling metropol',
        '\nThe COVID-19 pandemic has had a profound impact on global economic structures and business',
        'The Mona Lisa painting, created by Leonardo da Vinci in the early 16th',
        " everybody knows this proverbial saying, but did you know that it's not entirely accurate?",
    ]
}
@pytest.mark.skip(reason="Model to be released in the future")
@pytest.mark.quant_model
@pytest.mark.parametrize("model_name", MODELS)
def test_models(example_prompts, model_name) -> None:
    """Compare generations of a Quark mxfp4 model with golden strings.

    The model is loaded with ``quantization="quark"`` and an fp8 KV cache,
    then asked for at most 20 tokens per prompt at temperature 0. Each
    generated text must equal the entry of ``EXPECTED_STRS_MAP[model_name]``
    at the same position.
    """
    engine = LLM(
        model=model_name,
        kv_cache_dtype="fp8",
        quantization="quark",
    )
    params = SamplingParams(max_tokens=20, temperature=0)

    results = engine.generate(example_prompts, params)
    for idx, result in enumerate(results):
        # Only the first candidate of every request is checked.
        generated = result.outputs[0].text
        golden = EXPECTED_STRS_MAP[model_name][idx]
        assert golden == generated, (
            f"Expected: {golden!r}\nvLLM: {generated!r}")
tests/models/
decoder_only/language
/test_nvfp4.py
→
tests/models/
quantization
/test_nvfp4.py
View file @
7a985548
...
...
@@ -41,7 +41,6 @@ EXPECTED_STRS_MAP = {
reason
=
"Prevent unstable test based on golden strings from breaking the build "
" and test input model being too large and hanging the system."
)
@
pytest
.
mark
.
quant_model
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"nvfp4"
),
reason
=
"nvfp4 is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"model_name"
,
MODELS
)
...
...
tests/models/registry.py
View file @
7a985548
...
...
@@ -72,12 +72,15 @@ class _HfExamplesInfo:
return
current_version
=
TRANSFORMERS_VERSION
cur_base_version
=
Version
(
current_version
).
base_version
min_version
=
self
.
min_transformers_version
max_version
=
self
.
max_transformers_version
msg
=
f
"`transformers==
{
current_version
}
` installed, but `transformers"
if
min_version
and
Version
(
current_version
)
<
Version
(
min_version
):
# Only check the base version for the min/max version, otherwise preview
# models cannot be run because `x.yy.0.dev0`<`x.yy.0`
if
min_version
and
Version
(
cur_base_version
)
<
Version
(
min_version
):
msg
+=
f
">=
{
min_version
}
` is required to run this model."
elif
max_version
and
Version
(
cur
rent
_version
)
>
Version
(
max_version
):
elif
max_version
and
Version
(
cur
_base
_version
)
>
Version
(
max_version
):
msg
+=
f
"<=
{
max_version
}
` is required to run this model."
else
:
return
...
...
@@ -120,7 +123,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code
=
True
),
"BaichuanForCausalLM"
:
_HfExamplesInfo
(
"baichuan-inc/Baichuan2-7B-chat"
,
trust_remote_code
=
True
),
"BambaForCausalLM"
:
_HfExamplesInfo
(
"ibm-ai-platform/Bamba-9B"
),
"BambaForCausalLM"
:
_HfExamplesInfo
(
"ibm-ai-platform/Bamba-9B"
,
extras
=
{
"tiny"
:
"hmellor/bamba-tiny-random"
}),
# noqa: E501
"BloomForCausalLM"
:
_HfExamplesInfo
(
"bigscience/bloom-560m"
,
{
"1b"
:
"bigscience/bloomz-1b1"
}),
"ChatGLMModel"
:
_HfExamplesInfo
(
"THUDM/chatglm3-6b"
,
...
...
@@ -162,6 +166,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
{
"1b"
:
"EleutherAI/pythia-1.4b"
}),
"GraniteForCausalLM"
:
_HfExamplesInfo
(
"ibm/PowerLM-3b"
),
"GraniteMoeForCausalLM"
:
_HfExamplesInfo
(
"ibm/PowerMoE-3b"
),
"GraniteMoeHybridForCausalLM"
:
_HfExamplesInfo
(
"ibm-granite/granite-4.0-tiny-preview"
,
# noqa: E501
min_transformers_version
=
"4.52.0"
),
# noqa: E501
"GraniteMoeSharedForCausalLM"
:
_HfExamplesInfo
(
"ibm-research/moe-7b-1b-active-shared-experts"
),
# noqa: E501
"Grok1ModelForCausalLM"
:
_HfExamplesInfo
(
"hpcai-tech/grok-1"
,
trust_remote_code
=
True
),
...
...
@@ -176,7 +182,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"JAISLMHeadModel"
:
_HfExamplesInfo
(
"inceptionai/jais-13b-chat"
),
"JambaForCausalLM"
:
_HfExamplesInfo
(
"ai21labs/AI21-Jamba-1.5-Mini"
,
extras
=
{
"tiny"
:
"ai21labs/Jamba-tiny-dev"
}),
# noqa: E501
"LlamaForCausalLM"
:
_HfExamplesInfo
(
"meta-llama/Llama-3.2-1B-Instruct"
),
"LlamaForCausalLM"
:
_HfExamplesInfo
(
"meta-llama/Llama-3.2-1B-Instruct"
,
extras
=
{
"guard"
:
"meta-llama/Llama-Guard-3-1B"
,
# noqa: E501
"hermes"
:
"NousResearch/Hermes-3-Llama-3.1-8B"
}),
# noqa: E501
"LLaMAForCausalLM"
:
_HfExamplesInfo
(
"decapoda-research/llama-7b-hf"
,
is_available_online
=
False
),
"MambaForCausalLM"
:
_HfExamplesInfo
(
"state-spaces/mamba-130m-hf"
),
...
...
@@ -191,13 +199,13 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code
=
True
),
"MistralForCausalLM"
:
_HfExamplesInfo
(
"mistralai/Mistral-7B-Instruct-v0.1"
),
"MixtralForCausalLM"
:
_HfExamplesInfo
(
"mistralai/Mixtral-8x7B-Instruct-v0.1"
,
# noqa: E501
{
"
falcon3"
:
"ehristoforu/Falcon3-MoE-2x7B-Insruct
"
}),
# noqa: E501
{
"
tiny"
:
"TitanML/tiny-mixtral
"
}),
# noqa: E501
"QuantMixtralForCausalLM"
:
_HfExamplesInfo
(
"mistral-community/Mixtral-8x22B-v0.1-AWQ"
),
# noqa: E501
"MptForCausalLM"
:
_HfExamplesInfo
(
"mpt"
,
is_available_online
=
False
),
"MPTForCausalLM"
:
_HfExamplesInfo
(
"mosaicml/mpt-7b"
),
"NemotronForCausalLM"
:
_HfExamplesInfo
(
"nvidia/Minitron-8B-Base"
),
"OlmoForCausalLM"
:
_HfExamplesInfo
(
"allenai/OLMo-1B-hf"
),
"Olmo2ForCausalLM"
:
_HfExamplesInfo
(
"
shanearora/OLMo-7B-1124-hf
"
),
"Olmo2ForCausalLM"
:
_HfExamplesInfo
(
"
allenai/OLMo-2-0425-1B
"
),
"OlmoeForCausalLM"
:
_HfExamplesInfo
(
"allenai/OLMoE-1B-7B-0924-Instruct"
),
"OPTForCausalLM"
:
_HfExamplesInfo
(
"facebook/opt-125m"
,
{
"1b"
:
"facebook/opt-iml-max-1.3b"
}),
...
...
@@ -217,16 +225,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"Qwen2ForCausalLM"
:
_HfExamplesInfo
(
"Qwen/Qwen2-0.5B-Instruct"
,
extras
=
{
"2.5"
:
"Qwen/Qwen2.5-0.5B-Instruct"
}),
# noqa: E501
"Qwen2MoeForCausalLM"
:
_HfExamplesInfo
(
"Qwen/Qwen1.5-MoE-A2.7B-Chat"
),
"Qwen3ForCausalLM"
:
_HfExamplesInfo
(
"Qwen/Qwen3-8B"
,
is_available_online
=
False
,
min_transformers_version
=
"4.51"
),
"Qwen3MoeForCausalLM"
:
_HfExamplesInfo
(
"Qwen/Qwen3-MoE-15B-A2B"
,
is_available_online
=
False
,
min_transformers_version
=
"4.51"
),
"Qwen3ForCausalLM"
:
_HfExamplesInfo
(
"Qwen/Qwen3-8B"
),
"Qwen3MoeForCausalLM"
:
_HfExamplesInfo
(
"Qwen/Qwen3-30B-A3B"
),
"RWForCausalLM"
:
_HfExamplesInfo
(
"tiiuae/falcon-40b"
,
is_available_online
=
False
),
"StableLMEpochForCausalLM"
:
_HfExamplesInfo
(
"stabilityai/stablelm-zephyr-3b"
,
# noqa: E501
...
...
@@ -242,6 +242,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
is_available_online
=
False
,
trust_remote_code
=
True
),
"Zamba2ForCausalLM"
:
_HfExamplesInfo
(
"Zyphra/Zamba2-7B-instruct"
),
"MiMoForCausalLM"
:
_HfExamplesInfo
(
"XiaomiMiMo/MiMo-7B-RL"
,
trust_remote_code
=
True
),
# [Encoder-decoder]
"BartModel"
:
_HfExamplesInfo
(
"facebook/bart-base"
),
"BartForConditionalGeneration"
:
_HfExamplesInfo
(
"facebook/bart-large-cnn"
),
...
...
@@ -254,11 +256,17 @@ _EMBEDDING_EXAMPLE_MODELS = {
"GritLM"
:
_HfExamplesInfo
(
"parasail-ai/GritLM-7B-vllm"
),
"GteModel"
:
_HfExamplesInfo
(
"Snowflake/snowflake-arctic-embed-m-v2.0"
,
trust_remote_code
=
True
),
"GteNewModel"
:
_HfExamplesInfo
(
"Alibaba-NLP/gte-base-en-v1.5"
,
trust_remote_code
=
True
,
hf_overrides
=
{
"architectures"
:
[
"GteNewModel"
]}),
"InternLM2ForRewardModel"
:
_HfExamplesInfo
(
"internlm/internlm2-1_8b-reward"
,
trust_remote_code
=
True
),
"JambaForSequenceClassification"
:
_HfExamplesInfo
(
"ai21labs/Jamba-tiny-reward-dev"
),
# noqa: E501
"LlamaModel"
:
_HfExamplesInfo
(
"llama"
,
is_available_online
=
False
),
"MistralModel"
:
_HfExamplesInfo
(
"intfloat/e5-mistral-7b-instruct"
),
"ModernBertModel"
:
_HfExamplesInfo
(
"Alibaba-NLP/gte-modernbert-base"
,
trust_remote_code
=
True
),
"NomicBertModel"
:
_HfExamplesInfo
(
"Snowflake/snowflake-arctic-embed-m-long"
,
# noqa: E501
trust_remote_code
=
True
),
"Qwen2Model"
:
_HfExamplesInfo
(
"ssmits/Qwen2-7B-Instruct-embed-base"
),
...
...
@@ -337,6 +345,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"MiniCPMV"
:
_HfExamplesInfo
(
"openbmb/MiniCPM-Llama3-V-2_5"
,
extras
=
{
"2.6"
:
"openbmb/MiniCPM-V-2_6"
},
# noqa: E501
trust_remote_code
=
True
),
"MiniMaxVL01ForConditionalGeneration"
:
_HfExamplesInfo
(
"MiniMaxAI/MiniMax-VL-01"
,
# noqa: E501
trust_remote_code
=
True
),
"Mistral3ForConditionalGeneration"
:
_HfExamplesInfo
(
"mistralai/Mistral-Small-3.1-24B-Instruct-2503"
,
# noqa: E501
extras
=
{
"fp8"
:
"nm-testing/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic"
}),
# noqa: E501
"MolmoForCausalLM"
:
_HfExamplesInfo
(
"allenai/Molmo-7B-D-0924"
,
...
...
@@ -353,6 +363,9 @@ _MULTIMODAL_EXAMPLE_MODELS = {
max_transformers_version
=
"4.48"
,
transformers_version_reason
=
"Use of deprecated imports which have been removed."
,
# noqa: E501
extras
=
{
"phi3.5"
:
"microsoft/Phi-3.5-vision-instruct"
}),
# noqa: E501
"Ovis"
:
_HfExamplesInfo
(
"AIDC-AI/Ovis2-1B"
,
trust_remote_code
=
True
,
extras
=
{
"1.6-llama"
:
"AIDC-AI/Ovis1.6-Llama3.2-3B"
,
"1.6-gemma"
:
"AIDC-AI/Ovis1.6-Gemma2-9B"
}),
# noqa: E501
"Phi4MMForCausalLM"
:
_HfExamplesInfo
(
"microsoft/Phi-4-multimodal-instruct"
,
trust_remote_code
=
True
),
"PixtralForConditionalGeneration"
:
_HfExamplesInfo
(
"mistralai/Pixtral-12B-2409"
,
# noqa: E501
...
...
@@ -364,8 +377,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"Qwen2AudioForConditionalGeneration"
:
_HfExamplesInfo
(
"Qwen/Qwen2-Audio-7B-Instruct"
),
# noqa: E501
"Qwen2VLForConditionalGeneration"
:
_HfExamplesInfo
(
"Qwen/Qwen2-VL-2B-Instruct"
),
# noqa: E501
"Qwen2_5_VLForConditionalGeneration"
:
_HfExamplesInfo
(
"Qwen/Qwen2.5-VL-3B-Instruct"
),
# noqa: E501
"Qwen2_5OmniModel"
:
_HfExamplesInfo
(
"Qwen/Qwen2.5-Omni-
7
B"
,
# noqa: E501
min_transformers_version
=
"4.52"
),
# noqa: E501
"Qwen2_5OmniModel"
:
_HfExamplesInfo
(
"Qwen/Qwen2.5-Omni-
3
B"
,
min_transformers_version
=
"4.52"
),
"SkyworkR1VChatModel"
:
_HfExamplesInfo
(
"Skywork/Skywork-R1V-38B"
),
"SmolVLMForConditionalGeneration"
:
_HfExamplesInfo
(
"HuggingFaceTB/SmolVLM2-2.2B-Instruct"
),
# noqa: E501
"UltravoxModel"
:
_HfExamplesInfo
(
"fixie-ai/ultravox-v0_5-llama-3_2-1b"
,
# noqa: E501
...
...
@@ -375,7 +388,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
# Therefore, we borrow the BartTokenizer from the original Bart model
"Florence2ForConditionalGeneration"
:
_HfExamplesInfo
(
"microsoft/Florence-2-base"
,
# noqa: E501
tokenizer
=
"Isotr0py/Florence-2-tokenizer"
,
trust_remote_code
=
True
),
# noqa: E501
trust_remote_code
=
True
,
),
# noqa: E501
"MllamaForConditionalGeneration"
:
_HfExamplesInfo
(
"meta-llama/Llama-3.2-11B-Vision-Instruct"
),
# noqa: E501
"Llama4ForConditionalGeneration"
:
_HfExamplesInfo
(
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
),
# noqa: E501
"WhisperForConditionalGeneration"
:
_HfExamplesInfo
(
"openai/whisper-large-v3"
),
# noqa: E501
...
...
@@ -399,6 +412,9 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
trust_remote_code
=
True
,
speculative_model
=
"yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
,
tokenizer
=
"meta-llama/Llama-3.1-8B-Instruct"
),
"MiMoMTPModel"
:
_HfExamplesInfo
(
"XiaomiMiMo/MiMo-7B-RL"
,
trust_remote_code
=
True
,
speculative_model
=
"XiaomiMiMo/MiMo-7B-RL"
)
}
_TRANSFORMERS_MODELS
=
{
...
...
tests/models/test_transformers.py
View file @
7a985548
# SPDX-License-Identifier: Apache-2.0
"""Test the functionality of the Transformers backend.
Run `pytest tests/models/test_transformers.py`.
"""
"""Test the functionality of the Transformers backend."""
import
pytest
from
vllm.platforms
import
current_platform
from
..conftest
import
HfRunner
,
VllmRunner
from
..utils
import
multi_gpu_test
from
.utils
import
check_logprobs_close
...
...
@@ -36,6 +35,9 @@ def check_implementation(
)
@
pytest
.
mark
.
skipif
(
current_platform
.
is_rocm
(),
reason
=
"Llama-3.2-1B-Instruct, Ilama-3.2-1B produce memory access fault."
)
@
pytest
.
mark
.
parametrize
(
"model,model_impl"
,
[
...
...
@@ -67,6 +69,9 @@ def test_distributed(
"meta-llama/Llama-3.2-1B-Instruct"
,
**
kwargs
)
@
pytest
.
mark
.
skipif
(
current_platform
.
is_rocm
(),
reason
=
"bitsandbytes quantization is currently not supported in rocm."
)
@
pytest
.
mark
.
parametrize
(
"model, quantization_kwargs"
,
[
(
"meta-llama/Llama-3.2-1B-Instruct"
,
...
...
tests/models/utils.py
View file @
7a985548
...
...
@@ -2,9 +2,10 @@
import
warnings
from
collections.abc
import
Sequence
from
typing
import
Any
,
Optional
,
Union
from
typing
import
TYPE_CHECKING
,
Any
,
NamedTuple
,
Optional
,
Union
import
torch
import
torch.nn.functional
as
F
from
vllm.config
import
ModelConfig
,
TaskOption
from
vllm.inputs
import
InputContext
...
...
@@ -12,6 +13,9 @@ from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs
from
.registry
import
HF_EXAMPLE_MODELS
if
TYPE_CHECKING
:
from
..conftest
import
HfRunner
TokensText
=
tuple
[
list
[
int
],
str
]
...
...
@@ -291,3 +295,64 @@ def build_model_context(
**
model_config_kwargs
,
)
return
InputContext
(
model_config
)
def check_embeddings_close(
    *,
    embeddings_0_lst: Sequence[list[float]],
    embeddings_1_lst: Sequence[list[float]],
    name_0: str,
    name_1: str,
    tol: float = 1e-3,
) -> None:
    """Assert that two lists of embeddings are pairwise close.

    Args:
        embeddings_0_lst: Embeddings produced by the first implementation,
            one vector per prompt.
        embeddings_1_lst: Embeddings produced by the second implementation,
            in the same prompt order.
        name_0: Label for the first implementation in failure messages.
        name_1: Label for the second implementation in failure messages.
        tol: Largest allowed shortfall of the cosine similarity from 1.

    Raises:
        AssertionError: If the lists differ in length, if a pair of vectors
            differs in length, or if a pair's cosine similarity is below
            ``1 - tol``.
    """
    assert len(embeddings_0_lst) == len(embeddings_1_lst)

    pairs = zip(embeddings_0_lst, embeddings_1_lst)
    for prompt_idx, (vec_0, vec_1) in enumerate(pairs):
        assert len(vec_0) == len(vec_1), (
            f"Length mismatch: {len(vec_0)} vs. {len(vec_1)}")

        lhs = torch.tensor(vec_0)
        rhs = torch.tensor(vec_1)
        sim = F.cosine_similarity(lhs, rhs, dim=0)

        # Only the first 16 components are shown to keep failures readable.
        fail_msg = (f"Test{prompt_idx}:"
                    f"\n{name_0}:\t{vec_0[:16]!r}"
                    f"\n{name_1}:\t{vec_1[:16]!r}")

        assert sim >= 1 - tol, fail_msg
def matryoshka_fy(tensor: torch.Tensor, dimensions: int) -> torch.Tensor:
    """Truncate embeddings to ``dimensions`` entries and L2-normalize them.

    Args:
        tensor: Embeddings to shrink. Anything ``torch.as_tensor`` accepts
            works (a tensor or nested lists of floats). Normalization runs
            along dim 1, so the input needs at least two dimensions.
        dimensions: Number of leading entries of the last axis to keep.

    Returns:
        A new tensor holding the truncated embeddings, normalized to unit
        L2 norm along dim 1. The input is not modified.
    """
    # torch.as_tensor avoids the copy-construct UserWarning that
    # torch.tensor() emits when the caller already passes a Tensor. Sharing
    # storage is safe here: slicing only creates a view and F.normalize
    # returns a fresh tensor.
    truncated = torch.as_tensor(tensor)[..., :dimensions]
    return F.normalize(truncated, p=2, dim=1)
class EmbedModelInfo(NamedTuple):
    """Immutable record describing an embedding model used by the tests.

    Only ``name`` is required; every other field has a default.
    """
    # Model identifier (presumably a Hugging Face repo id; confirm with
    # callers).
    name: str
    # Whether the model is flagged as a matryoshka model.
    is_matryoshka: bool = False
    # Dimension values associated with a matryoshka model, if any.
    matryoshka_dimensions: Optional[list[int]] = None
    # Architecture name; empty string when not specified.
    architecture: str = ""
    # Dtype setting for the model; "auto" by default.
    dtype: str = "auto"
    # Flag consumed by the test suites (presumably to switch a model's tests
    # on or off; verify against callers).
    enable_test: bool = True
def run_embedding_correctness_test(
    hf_model: "HfRunner",
    inputs: list[str],
    vllm_outputs: Sequence[list[float]],
    dimensions: Optional[int] = None,
):
    """Check vLLM embeddings against a reference computed by ``hf_model``.

    Args:
        hf_model: Reference runner; its ``encode`` method is called on
            ``inputs``.
        inputs: Prompts to embed with the reference runner.
        vllm_outputs: Embeddings to validate, one per prompt.
        dimensions: When truthy, the reference embeddings are first passed
            through ``matryoshka_fy`` with this size.

    Raises:
        AssertionError: Propagated from ``check_embeddings_close`` (called
            with ``tol=1e-2``) when the two sets of embeddings disagree.
    """
    reference = hf_model.encode(inputs)
    if dimensions:
        reference = matryoshka_fy(reference, dimensions)

    comparison = dict(
        embeddings_0_lst=reference,
        embeddings_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
        tol=1e-2,
    )
    check_embeddings_close(**comparison)
tests/multimodal/assets/image1.png
0 → 100644
View file @
7a985548
1.79 KB
tests/multimodal/assets/image2.png
0 → 100644
View file @
7a985548
1.79 KB
tests/multimodal/test_hasher.py
0 → 100644
View file @
7a985548
# SPDX-License-Identifier: Apache-2.0
from
pathlib
import
Path
import
numpy
as
np
import
pytest
import
torch
from
PIL
import
Image
,
ImageDraw
from
vllm.multimodal.hasher
import
MultiModalHasher
# Directory named "assets" located next to this test module.
ASSETS_DIR = Path(__file__).parent / "assets"
# Fail at import time if the directory is missing.
assert ASSETS_DIR.exists()
# NOTE: Images that are the same visually are allowed to have the same hash
@pytest.mark.parametrize("mode_pair", [("1", "L"), ("RGBA", "CMYK")])
def test_hash_collision_image_mode(mode_pair):
    """Images of equal size and color but different modes hash differently."""
    first_mode, second_mode = mode_pair
    first = Image.new(first_mode, size=(10, 10), color=1)
    second = Image.new(second_mode, size=(10, 10), color=1)

    hasher = MultiModalHasher
    first_hash = hasher.hash_kwargs(image=first)
    second_hash = hasher.hash_kwargs(image=second)
    assert first_hash != second_hash
def test_hash_collision_image_palette():
    """Two asset images that differ only in their palette hash differently."""
    # These images differ only in Image.palette._palette
    first = Image.open(ASSETS_DIR / "image1.png")
    second = Image.open(ASSETS_DIR / "image2.png")

    hasher = MultiModalHasher
    first_hash = hasher.hash_kwargs(image=first)
    second_hash = hasher.hash_kwargs(image=second)
    assert first_hash != second_hash
def test_hash_collision_image_transpose():
    """A 10x20 image and a 20x10 image with a line drawn hash differently."""
    tall = Image.new("1", size=(10, 20))
    tall_canvas = ImageDraw.Draw(tall)
    tall_canvas.line([(0, 0), (10, 0)])

    wide = Image.new("1", size=(20, 10))
    wide_canvas = ImageDraw.Draw(wide)
    wide_canvas.line([(0, 0), (0, 10)])

    hasher = MultiModalHasher
    tall_hash = hasher.hash_kwargs(image=tall)
    wide_hash = hasher.hash_kwargs(image=wide)
    assert tall_hash != wide_hash
def test_hash_collision_tensor_shape():
    """All-zero tensors of different shapes must not share a hash."""
    # The hash should be different though the data is the same when flattened
    first = torch.zeros((5, 10, 20, 3))
    second = torch.zeros((10, 20, 5, 3))

    hasher = MultiModalHasher
    first_hash = hasher.hash_kwargs(data=first)
    second_hash = hasher.hash_kwargs(data=second)
    assert first_hash != second_hash
def test_hash_collision_array_shape():
    """All-zero NumPy arrays of different shapes must not share a hash."""
    # The hash should be different though the data is the same when flattened
    first = np.zeros((5, 10, 20, 3))
    second = np.zeros((10, 20, 5, 3))

    hasher = MultiModalHasher
    first_hash = hasher.hash_kwargs(data=first)
    second_hash = hasher.hash_kwargs(data=second)
    assert first_hash != second_hash
Prev
1
…
18
19
20
21
22
23
24
25
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment