Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
38d80967
Commit
38d80967
authored
Sep 12, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.10.2rc2' into v0.10.2rc2-ori
parents
33650733
880c741b
Changes
544
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
233 additions
and
104 deletions
+233
-104
tests/models/language/generation_ppl_test/test_gpt.py
tests/models/language/generation_ppl_test/test_gpt.py
+14
-0
tests/models/language/generation_ppl_test/test_qwen.py
tests/models/language/generation_ppl_test/test_qwen.py
+21
-0
tests/models/language/pooling/embed_utils.py
tests/models/language/pooling/embed_utils.py
+2
-5
tests/models/language/pooling/test_embedding.py
tests/models/language/pooling/test_embedding.py
+9
-6
tests/models/language/pooling_mteb_test/__init__.py
tests/models/language/pooling_mteb_test/__init__.py
+0
-0
tests/models/language/pooling_mteb_test/mteb_utils.py
tests/models/language/pooling_mteb_test/mteb_utils.py
+76
-22
tests/models/language/pooling_mteb_test/test_baai.py
tests/models/language/pooling_mteb_test/test_baai.py
+10
-4
tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py
.../language/pooling_mteb_test/test_bge_reranker_v2_gemma.py
+5
-5
tests/models/language/pooling_mteb_test/test_cross_encoder.py
...s/models/language/pooling_mteb_test/test_cross_encoder.py
+5
-2
tests/models/language/pooling_mteb_test/test_gte.py
tests/models/language/pooling_mteb_test/test_gte.py
+20
-14
tests/models/language/pooling_mteb_test/test_intfloat.py
tests/models/language/pooling_mteb_test/test_intfloat.py
+7
-3
tests/models/language/pooling_mteb_test/test_jina.py
tests/models/language/pooling_mteb_test/test_jina.py
+7
-4
tests/models/language/pooling_mteb_test/test_mxbai_rerank.py
tests/models/language/pooling_mteb_test/test_mxbai_rerank.py
+2
-1
tests/models/language/pooling_mteb_test/test_nomic.py
tests/models/language/pooling_mteb_test/test_nomic.py
+6
-2
tests/models/language/pooling_mteb_test/test_qwen3_reranker.py
.../models/language/pooling_mteb_test/test_qwen3_reranker.py
+2
-1
tests/models/language/pooling_mteb_test/test_snowflake_arctic_embed.py
...language/pooling_mteb_test/test_snowflake_arctic_embed.py
+10
-3
tests/models/language/pooling_mteb_test/test_st_projector.py
tests/models/language/pooling_mteb_test/test_st_projector.py
+8
-1
tests/models/multimodal/generation/test_pixtral.py
tests/models/multimodal/generation/test_pixtral.py
+26
-27
tests/models/multimodal/generation/test_qwen2_vl.py
tests/models/multimodal/generation/test_qwen2_vl.py
+2
-2
tests/models/multimodal/generation/test_whisper.py
tests/models/multimodal/generation/test_whisper.py
+1
-2
No files found.
Too many changes to show.
To preserve performance only
544 of 544+
files are displayed.
Plain diff
Email patch
tests/models/language/generation_ppl_test/test_gpt.py
0 → 100644
View file @
38d80967
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
from
tests.models.utils
import
GenerateModelInfo
from
.ppl_utils
import
wikitext_ppl_test
MODELS
=
[
GenerateModelInfo
(
"openai-community/gpt2-large"
)]
@
pytest
.
mark
.
parametrize
(
"model_info"
,
MODELS
)
def
test_ppl
(
hf_runner
,
vllm_runner
,
model_info
:
GenerateModelInfo
):
wikitext_ppl_test
(
hf_runner
,
vllm_runner
,
model_info
)
tests/models/language/generation_ppl_test/test_qwen.py
0 → 100644
View file @
38d80967
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
from
tests.models.utils
import
GenerateModelInfo
from
.ppl_utils
import
wikitext_ppl_test
MODELS
=
[
GenerateModelInfo
(
"Qwen/Qwen3-0.6B"
),
GenerateModelInfo
(
"Qwen/Qwen3-0.6B-FP8"
),
# transformers:
# Loading a GPTQ quantized model requires optimum, gptqmodel
# GenerateModelInfo("Qwen/Qwen3-0.6B-GPTQ-Int8"),
]
@
pytest
.
mark
.
parametrize
(
"model_info"
,
MODELS
)
def
test_ppl
(
hf_runner
,
vllm_runner
,
model_info
:
GenerateModelInfo
):
wikitext_ppl_test
(
hf_runner
,
vllm_runner
,
model_info
)
tests/models/language/pooling/embed_utils.py
View file @
38d80967
...
...
@@ -35,10 +35,7 @@ def correctness_test_embed_models(hf_runner,
example_prompts
,
vllm_extra_kwargs
=
None
,
hf_model_callback
=
None
):
if
not
model_info
.
enable_test
:
# A model family has many models with the same architecture,
# and we don't need to test each one.
pytest
.
skip
(
"Skipping test."
)
pytest
.
skip
(
"Debug only, ci prefers to use mteb test."
)
# The example_prompts has ending "\n", for example:
# "Write a short story about a robot that dreams for the first time.\n"
...
...
@@ -62,7 +59,7 @@ def correctness_test_embed_models(hf_runner,
with
hf_runner
(
model_info
.
name
,
dtype
=
"float32"
,
dtype
=
model_info
.
hf_dtype
,
is_sentence_transformer
=
True
,
)
as
hf_model
:
...
...
tests/models/language/pooling/test_embedding.py
View file @
38d80967
...
...
@@ -7,7 +7,7 @@ import pytest
from
vllm.config
import
PoolerConfig
from
vllm.platforms
import
current_platform
from
...utils
import
check_embeddings_close
,
check_transformers_version
from
...utils
import
check_embeddings_close
@
pytest
.
mark
.
parametrize
(
...
...
@@ -27,12 +27,17 @@ from ...utils import check_embeddings_close, check_transformers_version
pytest
.
param
(
"ssmits/Qwen2-7B-Instruct-embed-base"
,
marks
=
[
pytest
.
mark
.
cpu_model
]),
# [Encoder-only]
pytest
.
param
(
"BAAI/bge-base-en-v1.5"
,
marks
=
[
pytest
.
mark
.
core_model
]),
pytest
.
param
(
"BAAI/bge-base-en-v1.5"
,
marks
=
[
pytest
.
mark
.
core_model
,
pytest
.
mark
.
cpu_model
],
),
pytest
.
param
(
"sentence-transformers/all-MiniLM-L12-v2"
),
pytest
.
param
(
"intfloat/multilingual-e5-small"
),
pytest
.
param
(
"Alibaba-NLP/gte-Qwen2-1.5B-instruct"
),
# [Cross-Encoder]
pytest
.
param
(
"sentence-transformers/stsb-roberta-base-v2"
),
pytest
.
param
(
"sentence-transformers/stsb-roberta-base-v2"
,
marks
=
[
pytest
.
mark
.
core_model
,
pytest
.
mark
.
cpu_model
],
),
],
)
def
test_models
(
...
...
@@ -42,8 +47,6 @@ def test_models(
model
,
monkeypatch
,
)
->
None
:
if
model
==
"Alibaba-NLP/gte-Qwen2-1.5B-instruct"
:
check_transformers_version
(
model
,
max_transformers_version
=
"4.53.2"
)
if
model
==
"BAAI/bge-multilingual-gemma2"
and
current_platform
.
is_rocm
():
# ROCm Triton FA does not currently support sliding window attention
...
...
tests/models/language/pooling_mteb_test/__init__.py
0 → 100644
View file @
38d80967
tests/models/language/pooling/mteb_utils.py
→
tests/models/language/pooling
_mteb_test
/mteb_utils.py
View file @
38d80967
...
...
@@ -9,8 +9,10 @@ import mteb
import
numpy
as
np
import
pytest
import
requests
import
torch
from
tests.models.utils
import
EmbedModelInfo
,
RerankModelInfo
from
tests.models.utils
import
(
EmbedModelInfo
,
RerankModelInfo
,
check_embeddings_close
)
# Most embedding models on the STS12 task (See #17175):
# - Model implementation and minor changes in tensor dtype
...
...
@@ -18,7 +20,7 @@ from tests.models.utils import EmbedModelInfo, RerankModelInfo
# - Different model results in differences more than 1e-3
# 1e-4 is a good tolerance threshold
MTEB_EMBED_TASKS
=
[
"STS12"
]
MTEB_EMBED_TOL
=
0.02
MTEB_EMBED_TOL
=
1e-4
# See #19344
MTEB_RERANK_TASKS
=
[
"NFCorpus"
]
...
...
@@ -163,15 +165,20 @@ def mteb_test_embed_models(hf_runner,
model_info
:
EmbedModelInfo
,
vllm_extra_kwargs
=
None
,
hf_model_callback
=
None
,
atol
=
MTEB_RERANK_TOL
):
atol
=
MTEB_EMBED_TOL
):
# A model family has many models with the same architecture,
# and we don't need to test each one.
if
not
model_info
.
enable_test
:
# A model family has many models with the same architecture,
# and we don't need to test each one.
pytest
.
skip
(
"Skipping test."
)
# Test embed_dims, isnan and whether to use normalize
example_prompts
=
[
"The chef prepared a delicious meal."
*
1000
]
# Allow vllm to test using the given dtype, such as float32
vllm_extra_kwargs
=
vllm_extra_kwargs
or
{}
vllm_extra_kwargs
[
"dtype"
]
=
model_info
.
dtype
# Allow vllm to test using hf_overrides
if
model_info
.
hf_overrides
is
not
None
:
vllm_extra_kwargs
[
"hf_overrides"
]
=
model_info
.
hf_overrides
...
...
@@ -183,8 +190,12 @@ def mteb_test_embed_models(hf_runner,
model_config
=
vllm_model
.
llm
.
llm_engine
.
model_config
# Confirm whether vllm is using the correct architecture
if
model_info
.
architecture
:
assert
model_info
.
architecture
in
model_config
.
architectures
# Confirm whether vllm uses the correct default_pooling_type, which
# relates to whether chunked prefill and prefix caching are enabled
assert
(
model_config
.
_model_info
.
default_pooling_type
==
model_info
.
default_pooling_type
)
...
...
@@ -192,22 +203,46 @@ def mteb_test_embed_models(hf_runner,
MTEB_EMBED_TASKS
)
vllm_dtype
=
vllm_model
.
llm
.
llm_engine
.
model_config
.
dtype
with
hf_runner
(
model_info
.
name
,
is_sentence_transformer
=
True
,
dtype
=
"float32"
)
as
hf_model
:
if
hf_model_callback
is
not
None
:
hf_model_callback
(
hf_model
)
st_main_score
=
run_mteb_embed_task
(
hf_model
,
MTEB_EMBED_TASKS
)
st_dtype
=
next
(
hf_model
.
model
.
parameters
()).
dtype
# Test embed_dims, isnan and whether to use normalize
vllm_outputs
=
vllm_model
.
embed
(
example_prompts
,
truncate_prompt_tokens
=-
1
)
assert
not
torch
.
any
(
torch
.
isnan
(
torch
.
tensor
(
vllm_outputs
)))
# Accelerate mteb test by setting
# SentenceTransformers mteb score to a constant
if
model_info
.
mteb_score
is
None
:
with
hf_runner
(
model_info
.
name
,
is_sentence_transformer
=
True
,
dtype
=
model_info
.
hf_dtype
)
as
hf_model
:
# e.g. setting default parameters for the encode method of hf_runner
if
hf_model_callback
is
not
None
:
hf_model_callback
(
hf_model
)
st_main_score
=
run_mteb_embed_task
(
hf_model
,
MTEB_EMBED_TASKS
)
st_dtype
=
next
(
hf_model
.
model
.
parameters
()).
dtype
# Test embed_dims and whether to use normalize
hf_outputs
=
hf_model
.
encode
(
example_prompts
)
check_embeddings_close
(
embeddings_0_lst
=
hf_outputs
,
embeddings_1_lst
=
vllm_outputs
,
name_0
=
"hf"
,
name_1
=
"vllm"
,
tol
=
1e-2
,
)
else
:
st_main_score
=
model_info
.
mteb_score
st_dtype
=
"Constant"
print
(
"Model:"
,
model_info
.
name
)
print
(
"VLLM:"
,
vllm_dtype
,
vllm_main_score
)
print
(
"SentenceTransformers:"
,
st_dtype
,
st_main_score
)
print
(
"Difference:"
,
st_main_score
-
vllm_main_score
)
assert
st_main_score
==
pytest
.
approx
(
vllm_main_score
,
abs
=
atol
)
# We are not concerned that the vllm mteb results are better
# than SentenceTransformers, so we only perform one-sided testing.
assert
st_main_score
-
vllm_main_score
<
atol
def
run_mteb_rerank
(
cross_encoder
,
tasks
,
languages
):
...
...
@@ -243,9 +278,12 @@ def run_mteb_rerank(cross_encoder, tasks, languages):
return
main_score
def
mteb_test_rerank_models_hf
(
hf_runner
,
model_name
,
hf_model_callback
=
None
):
def
mteb_test_rerank_models_hf
(
hf_runner
,
model_name
,
hf_dtype
=
"float32"
,
hf_model_callback
=
None
):
with
hf_runner
(
model_name
,
is_cross_encoder
=
True
,
dtype
=
"float32"
)
as
hf_model
:
dtype
=
hf_dtype
)
as
hf_model
:
original_predict
=
hf_model
.
predict
...
...
@@ -279,14 +317,16 @@ def mteb_test_rerank_models(hf_runner,
hf_model_callback
=
None
,
vllm_mteb_encoder
=
VllmMtebEncoder
,
atol
=
MTEB_RERANK_TOL
):
# A model family has many models with the same architecture,
# and we don't need to test each one.
if
not
model_info
.
enable_test
:
# A model family has many models with the same architecture,
# and we don't need to test each one.
pytest
.
skip
(
"Skipping test."
)
# Allow vllm to test using the given dtype, such as float32
vllm_extra_kwargs
=
vllm_extra_kwargs
or
{}
vllm_extra_kwargs
[
"dtype"
]
=
model_info
.
dtype
# Allow vllm to test using hf_overrides
if
model_info
.
hf_overrides
is
not
None
:
vllm_extra_kwargs
[
"hf_overrides"
]
=
model_info
.
hf_overrides
...
...
@@ -299,9 +339,15 @@ def mteb_test_rerank_models(hf_runner,
model_config
=
vllm_model
.
llm
.
llm_engine
.
model_config
# Confirm whether vllm is using the correct architecture
if
model_info
.
architecture
:
assert
(
model_info
.
architecture
in
model_config
.
architectures
)
# Score API is only enabled for num_labels == 1
assert
model_config
.
hf_config
.
num_labels
==
1
# Confirm whether vllm uses the correct default_pooling_type, which
# relates to whether chunked prefill and prefix caching are enabled
assert
(
model_config
.
_model_info
.
default_pooling_type
==
model_info
.
default_pooling_type
)
...
...
@@ -310,12 +356,20 @@ def mteb_test_rerank_models(hf_runner,
languages
=
MTEB_RERANK_LANGS
)
vllm_dtype
=
model_config
.
dtype
st_main_score
,
st_dtype
=
mteb_test_rerank_models_hf
(
hf_runner
,
model_info
.
name
,
hf_model_callback
)
# Accelerate mteb test by setting
# SentenceTransformers mteb score to a constant
if
model_info
.
mteb_score
is
None
:
st_main_score
,
st_dtype
=
mteb_test_rerank_models_hf
(
hf_runner
,
model_info
.
name
,
model_info
.
hf_dtype
,
hf_model_callback
)
else
:
st_main_score
=
model_info
.
mteb_score
st_dtype
=
"Constant"
print
(
"Model:"
,
model_info
.
name
)
print
(
"VLLM:"
,
vllm_dtype
,
vllm_main_score
)
print
(
"SentenceTransformers:"
,
st_dtype
,
st_main_score
)
print
(
"Difference:"
,
st_main_score
-
vllm_main_score
)
assert
st_main_score
==
pytest
.
approx
(
vllm_main_score
,
abs
=
atol
)
# We are not concerned that the vllm mteb results are better
# than SentenceTransformers, so we only perform one-sided testing.
assert
st_main_score
-
vllm_main_score
<
atol
tests/models/language/pooling/test_baai.py
→
tests/models/language/pooling
_mteb_test
/test_baai.py
View file @
38d80967
...
...
@@ -2,16 +2,19 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
from
...utils
import
(
CLSPoolingEmbedModelInfo
,
CLSPoolingRerankModelInfo
,
EmbedModelInfo
,
LASTPoolingEmbedModelInfo
,
RerankModelInfo
)
from
.embed_utils
import
correctness_test_embed_models
from
tests.models.language.pooling.embed_utils
import
(
correctness_test_embed_models
)
from
tests.models.utils
import
(
CLSPoolingEmbedModelInfo
,
CLSPoolingRerankModelInfo
,
EmbedModelInfo
,
LASTPoolingEmbedModelInfo
,
RerankModelInfo
)
from
.mteb_utils
import
mteb_test_embed_models
,
mteb_test_rerank_models
MODELS
=
[
########## BertModel
CLSPoolingEmbedModelInfo
(
"BAAI/bge-base-en"
,
architecture
=
"BertModel"
,
mteb_score
=
0.779336792
,
enable_test
=
True
),
CLSPoolingEmbedModelInfo
(
"BAAI/bge-base-zh"
,
architecture
=
"BertModel"
,
...
...
@@ -52,10 +55,12 @@ MODELS = [
########## XLMRobertaModel
CLSPoolingEmbedModelInfo
(
"BAAI/bge-m3"
,
architecture
=
"XLMRobertaModel"
,
mteb_score
=
0.787343078
,
enable_test
=
True
),
########## Qwen2Model
LASTPoolingEmbedModelInfo
(
"BAAI/bge-code-v1"
,
architecture
=
"Qwen2Model"
,
mteb_score
=
0.75724465
,
dtype
=
"float32"
,
enable_test
=
True
),
]
...
...
@@ -65,6 +70,7 @@ RERANK_MODELS = [
CLSPoolingRerankModelInfo
(
"BAAI/bge-reranker-base"
,
architecture
=
"XLMRobertaForSequenceClassification"
,
mteb_score
=
0.32398
,
enable_test
=
True
),
CLSPoolingRerankModelInfo
(
"BAAI/bge-reranker-large"
,
...
...
tests/models/language/pooling/test_bge_reranker_v2_gemma.py
→
tests/models/language/pooling
_mteb_test
/test_bge_reranker_v2_gemma.py
View file @
38d80967
...
...
@@ -7,13 +7,14 @@ import pytest
import
torch
from
tests.conftest
import
HfRunner
from
...utils
import
LASTPoolingRerankModelInfo
,
R
erank
M
odel
Info
from
.mteb_utils
import
VllmMtebEncoder
,
mteb_test_r
erank
_m
odel
s
from
tests.models.language.pooling_mteb_test.mteb_utils
import
(
VllmMtebEncoder
,
mteb_test_r
erank
_m
odel
s
)
from
tests.models.utils
import
LASTPoolingRerankModelInfo
,
R
erank
M
odel
Info
RERANK_MODELS
=
[
LASTPoolingRerankModelInfo
(
"BAAI/bge-reranker-v2-gemma"
,
architecture
=
"GemmaForSequenceClassification"
,
mteb_score
=
0.33757
,
hf_overrides
=
{
"architectures"
:
[
"GemmaForSequenceClassification"
],
...
...
@@ -104,7 +105,6 @@ class GemmaMtebEncoder(VllmMtebEncoder):
def
__init__
(
self
,
*
args
,
**
kwargs
):
super
().
__init__
(
*
args
,
**
kwargs
)
self
.
prompt
=
PROMPT
self
.
query_template
=
"A: {query}
\n
"
self
.
document_template
=
"B: {doc}
\n
{prompt}"
...
...
@@ -119,7 +119,7 @@ class GemmaMtebEncoder(VllmMtebEncoder):
_sentences
=
[]
for
query
,
corpus
,
prompt
in
sentences
:
query
=
self
.
query_template
.
format
(
query
=
query
)
corpus
=
self
.
document_template
.
format
(
doc
=
corpus
,
prompt
=
prompt
)
corpus
=
self
.
document_template
.
format
(
doc
=
corpus
,
prompt
=
PROMPT
)
_sentences
.
append
((
query
,
corpus
,
prompt
))
return
super
().
predict
(
_sentences
,
*
args
,
**
kwargs
)
...
...
tests/models/language/pooling/test_cross_encoder.py
→
tests/models/language/pooling
_mteb_test
/test_cross_encoder.py
View file @
38d80967
...
...
@@ -2,14 +2,17 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
from
...utils
import
(
CLSPoolingRerankModelInfo
,
LASTPoolingRerankModelInfo
,
RerankModelInfo
)
from
tests.models.utils
import
(
CLSPoolingRerankModelInfo
,
LASTPoolingRerankModelInfo
,
RerankModelInfo
)
from
.mteb_utils
import
mteb_test_rerank_models
RERANK_MODELS
=
[
CLSPoolingRerankModelInfo
(
"cross-encoder/ms-marco-TinyBERT-L-2-v2"
,
mteb_score
=
0.32898
,
architecture
=
"BertForSequenceClassification"
),
LASTPoolingRerankModelInfo
(
"tomaarsen/Qwen3-Reranker-0.6B-seq-cls"
,
mteb_score
=
0.25736
,
architecture
=
"Qwen3ForSequenceClassification"
)
]
...
...
tests/models/language/pooling/test_gte.py
→
tests/models/language/pooling
_mteb_test
/test_gte.py
View file @
38d80967
...
...
@@ -3,15 +3,18 @@
import
pytest
from
...utils
import
(
CLSPoolingEmbedModelInfo
,
CLSPoolingRerankModelInfo
,
EmbedModelInfo
,
LASTPoolingEmbedModelInfo
,
RerankModelInfo
,
check_transformers_version
)
from
.embed_utils
import
correctness_test_embed_models
from
tests.models.language.pooling.embed_utils
import
(
correctness_test_embed_models
)
from
tests.models.utils
import
(
CLSPoolingEmbedModelInfo
,
CLSPoolingRerankModelInfo
,
EmbedModelInfo
,
LASTPoolingEmbedModelInfo
,
RerankModelInfo
)
from
.mteb_utils
import
mteb_test_embed_models
,
mteb_test_rerank_models
MODELS
=
[
########## BertModel
CLSPoolingEmbedModelInfo
(
"thenlper/gte-large"
,
mteb_score
=
0.76807651
,
architecture
=
"BertModel"
,
enable_test
=
True
),
CLSPoolingEmbedModelInfo
(
"thenlper/gte-base"
,
...
...
@@ -30,28 +33,37 @@ MODELS = [
architecture
=
"BertModel"
,
enable_test
=
False
),
########### NewModel
# These three architectures are almost the same, but not exactly the same.
# For example,
# - whether to use token_type_embeddings
# - whether to use context expansion
# So only test one (the most widely used) model
CLSPoolingEmbedModelInfo
(
"Alibaba-NLP/gte-multilingual-base"
,
architecture
=
"GteNewModel"
,
mteb_score
=
0.775074696
,
hf_overrides
=
{
"architectures"
:
[
"GteNewModel"
]},
enable_test
=
True
),
CLSPoolingEmbedModelInfo
(
"Alibaba-NLP/gte-base-en-v1.5"
,
architecture
=
"GteNewModel"
,
hf_overrides
=
{
"architectures"
:
[
"GteNewModel"
]},
enable_test
=
Tru
e
),
enable_test
=
Fals
e
),
CLSPoolingEmbedModelInfo
(
"Alibaba-NLP/gte-large-en-v1.5"
,
architecture
=
"GteNewModel"
,
hf_overrides
=
{
"architectures"
:
[
"GteNewModel"
]},
enable_test
=
Tru
e
),
enable_test
=
Fals
e
),
########### Qwen2ForCausalLM
LASTPoolingEmbedModelInfo
(
"Alibaba-NLP/gte-Qwen2-1.5B-instruct"
,
mteb_score
=
0.758473459018872
,
architecture
=
"Qwen2ForCausalLM"
,
enable_test
=
True
),
########## ModernBertModel
CLSPoolingEmbedModelInfo
(
"Alibaba-NLP/gte-modernbert-base"
,
mteb_score
=
0.748193353
,
architecture
=
"ModernBertModel"
,
enable_test
=
True
),
########## Qwen3ForCausalLM
LASTPoolingEmbedModelInfo
(
"Qwen/Qwen3-Embedding-0.6B"
,
mteb_score
=
0.771163695
,
architecture
=
"Qwen3ForCausalLM"
,
dtype
=
"float32"
,
enable_test
=
True
),
...
...
@@ -65,10 +77,12 @@ RERANK_MODELS = [
CLSPoolingRerankModelInfo
(
# classifier_pooling: mean
"Alibaba-NLP/gte-reranker-modernbert-base"
,
mteb_score
=
0.33386
,
architecture
=
"ModernBertForSequenceClassification"
,
enable_test
=
True
),
CLSPoolingRerankModelInfo
(
"Alibaba-NLP/gte-multilingual-reranker-base"
,
mteb_score
=
0.33062
,
architecture
=
"GteNewForSequenceClassification"
,
hf_overrides
=
{
"architectures"
:
[
"GteNewForSequenceClassification"
]},
enable_test
=
True
),
...
...
@@ -78,10 +92,6 @@ RERANK_MODELS = [
@
pytest
.
mark
.
parametrize
(
"model_info"
,
MODELS
)
def
test_embed_models_mteb
(
hf_runner
,
vllm_runner
,
model_info
:
EmbedModelInfo
)
->
None
:
if
model_info
.
name
==
"Alibaba-NLP/gte-Qwen2-1.5B-instruct"
:
check_transformers_version
(
model_info
.
name
,
max_transformers_version
=
"4.53.2"
)
mteb_test_embed_models
(
hf_runner
,
vllm_runner
,
model_info
)
...
...
@@ -89,10 +99,6 @@ def test_embed_models_mteb(hf_runner, vllm_runner,
def
test_embed_models_correctness
(
hf_runner
,
vllm_runner
,
model_info
:
EmbedModelInfo
,
example_prompts
)
->
None
:
if
model_info
.
name
==
"Alibaba-NLP/gte-Qwen2-1.5B-instruct"
:
check_transformers_version
(
model_info
.
name
,
max_transformers_version
=
"4.53.2"
)
correctness_test_embed_models
(
hf_runner
,
vllm_runner
,
model_info
,
example_prompts
)
...
...
tests/models/language/pooling/test_intfloat.py
→
tests/models/language/pooling
_mteb_test
/test_intfloat.py
View file @
38d80967
...
...
@@ -2,14 +2,17 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
from
...utils
import
CLSPoolingEmbedModelInfo
,
EmbedModelInfo
from
.embed_utils
import
correctness_test_embed_models
from
tests.models.language.pooling.embed_utils
import
(
correctness_test_embed_models
)
from
tests.models.utils
import
CLSPoolingEmbedModelInfo
,
EmbedModelInfo
from
.mteb_utils
import
mteb_test_embed_models
MODELS
=
[
########## BertModel
CLSPoolingEmbedModelInfo
(
"intfloat/e5-small"
,
architecture
=
"BertModel"
,
mteb_score
=
0.742285423
,
enable_test
=
True
),
CLSPoolingEmbedModelInfo
(
"intfloat/e5-base"
,
architecture
=
"BertModel"
,
...
...
@@ -23,6 +26,7 @@ MODELS = [
########## XLMRobertaModel
CLSPoolingEmbedModelInfo
(
"intfloat/multilingual-e5-base"
,
architecture
=
"XLMRobertaModel"
,
mteb_score
=
0.779325955
,
enable_test
=
True
),
CLSPoolingEmbedModelInfo
(
"intfloat/multilingual-e5-large"
,
architecture
=
"XLMRobertaModel"
,
...
...
@@ -36,7 +40,7 @@ MODELS = [
@
pytest
.
mark
.
parametrize
(
"model_info"
,
MODELS
)
def
test_embed_models_mteb
(
hf_runner
,
vllm_runner
,
model_info
:
EmbedModelInfo
)
->
None
:
mteb_test_embed_models
(
hf_runner
,
vllm_runner
,
model_info
,
atol
=
0.02
)
mteb_test_embed_models
(
hf_runner
,
vllm_runner
,
model_info
)
@
pytest
.
mark
.
parametrize
(
"model_info"
,
MODELS
)
...
...
tests/models/language/pooling/test_jina.py
→
tests/models/language/pooling
_mteb_test
/test_jina.py
View file @
38d80967
...
...
@@ -4,16 +4,18 @@ from functools import partial
import
pytest
from
tests.models.language.pooling.embed_utils
import
(
check_embeddings_close
,
correctness_test_embed_models
,
matryoshka_fy
)
from
tests.models.utils
import
(
CLSPoolingEmbedModelInfo
,
CLSPoolingRerankModelInfo
,
EmbedModelInfo
,
RerankModelInfo
)
from
vllm
import
PoolingParams
from
...utils
import
(
CLSPoolingEmbedModelInfo
,
CLSPoolingRerankModelInfo
,
EmbedModelInfo
,
RerankModelInfo
)
from
.embed_utils
import
(
check_embeddings_close
,
correctness_test_embed_models
,
matryoshka_fy
)
from
.mteb_utils
import
mteb_test_embed_models
,
mteb_test_rerank_models
EMBEDDING_MODELS
=
[
CLSPoolingEmbedModelInfo
(
"jinaai/jina-embeddings-v3"
,
mteb_score
=
0.824413164
,
architecture
=
"XLMRobertaModel"
,
is_matryoshka
=
True
)
]
...
...
@@ -21,6 +23,7 @@ EMBEDDING_MODELS = [
RERANK_MODELS
=
[
CLSPoolingRerankModelInfo
(
"jinaai/jina-reranker-v2-base-multilingual"
,
mteb_score
=
0.33643
,
architecture
=
"XLMRobertaForSequenceClassification"
)
]
...
...
tests/models/language/pooling/test_mxbai_rerank.py
→
tests/models/language/pooling
_mteb_test
/test_mxbai_rerank.py
View file @
38d80967
...
...
@@ -6,8 +6,8 @@ import pytest
import
torch
from
tests.conftest
import
HfRunner
from
tests.models.utils
import
LASTPoolingRerankModelInfo
,
RerankModelInfo
from
...utils
import
LASTPoolingRerankModelInfo
,
RerankModelInfo
from
.mteb_utils
import
mteb_test_rerank_models
mxbai_rerank_hf_overrides
=
{
...
...
@@ -20,6 +20,7 @@ RERANK_MODELS = [
LASTPoolingRerankModelInfo
(
"mixedbread-ai/mxbai-rerank-base-v2"
,
architecture
=
"Qwen2ForSequenceClassification"
,
hf_overrides
=
mxbai_rerank_hf_overrides
,
mteb_score
=
0.273
,
enable_test
=
True
),
LASTPoolingRerankModelInfo
(
"mixedbread-ai/mxbai-rerank-large-v2"
,
architecture
=
"Qwen2ForSequenceClassification"
,
...
...
tests/models/language/pooling/test_nomic.py
→
tests/models/language/pooling
_mteb_test
/test_nomic.py
View file @
38d80967
...
...
@@ -3,13 +3,16 @@
import
pytest
from
...utils
import
CLSPoolingEmbedModelInfo
,
EmbedModelInfo
from
.embed_utils
import
correctness_test_embed_models
from
tests.models.language.pooling.embed_utils
import
(
correctness_test_embed_models
)
from
tests.models.utils
import
CLSPoolingEmbedModelInfo
,
EmbedModelInfo
from
.mteb_utils
import
mteb_test_embed_models
MODELS
=
[
CLSPoolingEmbedModelInfo
(
"nomic-ai/nomic-embed-text-v1"
,
architecture
=
"NomicBertModel"
,
mteb_score
=
0.737568559
,
enable_test
=
True
),
CLSPoolingEmbedModelInfo
(
"nomic-ai/nomic-embed-text-v1.5"
,
architecture
=
"NomicBertModel"
,
...
...
@@ -19,6 +22,7 @@ MODELS = [
enable_test
=
False
),
CLSPoolingEmbedModelInfo
(
"nomic-ai/nomic-embed-text-v2-moe"
,
architecture
=
"NomicBertModel"
,
mteb_score
=
0.715488912
,
enable_test
=
True
)
]
...
...
tests/models/language/pooling/test_qwen3_reranker.py
→
tests/models/language/pooling
_mteb_test
/test_qwen3_reranker.py
View file @
38d80967
...
...
@@ -6,9 +6,9 @@ import pytest
import
torch
from
tests.conftest
import
HfRunner
from
tests.models.utils
import
LASTPoolingRerankModelInfo
,
RerankModelInfo
from
tests.utils
import
multi_gpu_test
from
...utils
import
LASTPoolingRerankModelInfo
,
RerankModelInfo
from
.mteb_utils
import
mteb_test_rerank_models
qwen3_reranker_hf_overrides
=
{
...
...
@@ -20,6 +20,7 @@ qwen3_reranker_hf_overrides = {
RERANK_MODELS
=
[
LASTPoolingRerankModelInfo
(
"Qwen/Qwen3-Reranker-0.6B"
,
architecture
=
"Qwen3ForSequenceClassification"
,
mteb_score
=
0.25736
,
hf_overrides
=
qwen3_reranker_hf_overrides
,
enable_test
=
True
),
LASTPoolingRerankModelInfo
(
"Qwen/Qwen3-Reranker-4B"
,
...
...
tests/models/language/pooling/test_snowflake_arctic_embed.py
→
tests/models/language/pooling
_mteb_test
/test_snowflake_arctic_embed.py
View file @
38d80967
...
...
@@ -3,14 +3,17 @@
import
pytest
from
...utils
import
CLSPoolingEmbedModelInfo
,
EmbedModelInfo
from
.embed_utils
import
correctness_test_embed_models
from
tests.models.language.pooling.embed_utils
import
(
correctness_test_embed_models
)
from
tests.models.utils
import
CLSPoolingEmbedModelInfo
,
EmbedModelInfo
from
.mteb_utils
import
mteb_test_embed_models
MODELS
=
[
CLSPoolingEmbedModelInfo
(
"Snowflake/snowflake-arctic-embed-xs"
,
is_matryoshka
=
False
,
architecture
=
"BertModel"
,
mteb_score
=
0.714927797
,
enable_test
=
True
),
CLSPoolingEmbedModelInfo
(
"Snowflake/snowflake-arctic-embed-s"
,
is_matryoshka
=
False
,
...
...
@@ -23,6 +26,7 @@ MODELS = [
CLSPoolingEmbedModelInfo
(
"Snowflake/snowflake-arctic-embed-m-long"
,
is_matryoshka
=
False
,
architecture
=
"NomicBertModel"
,
mteb_score
=
0.681146831
,
enable_test
=
True
),
CLSPoolingEmbedModelInfo
(
"Snowflake/snowflake-arctic-embed-l"
,
is_matryoshka
=
False
,
...
...
@@ -31,14 +35,17 @@ MODELS = [
CLSPoolingEmbedModelInfo
(
"Snowflake/snowflake-arctic-embed-m-v1.5"
,
is_matryoshka
=
True
,
architecture
=
"BertModel"
,
mteb_score
=
0.649088363
,
enable_test
=
True
),
CLSPoolingEmbedModelInfo
(
"Snowflake/snowflake-arctic-embed-l-v2.0"
,
is_matryoshka
=
True
,
architecture
=
"XLMRobertaModel"
,
mteb_score
=
0.712258299
,
enable_test
=
True
),
CLSPoolingEmbedModelInfo
(
"Snowflake/snowflake-arctic-embed-m-v2.0"
,
is_matryoshka
=
True
,
architecture
=
"GteModel"
,
mteb_score
=
0.706622444
,
enable_test
=
True
),
]
...
...
@@ -46,7 +53,7 @@ MODELS = [
@
pytest
.
mark
.
parametrize
(
"model_info"
,
MODELS
)
def
test_embed_models_mteb
(
hf_runner
,
vllm_runner
,
model_info
:
EmbedModelInfo
)
->
None
:
mteb_test_embed_models
(
hf_runner
,
vllm_runner
,
model_info
,
atol
=
0.02
)
mteb_test_embed_models
(
hf_runner
,
vllm_runner
,
model_info
)
@
pytest
.
mark
.
parametrize
(
"model_info"
,
MODELS
)
...
...
tests/models/language/pooling/test_st_projector.py
→
tests/models/language/pooling
_mteb_test
/test_st_projector.py
View file @
38d80967
...
...
@@ -2,7 +2,9 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
from
...utils
import
CLSPoolingEmbedModelInfo
,
EmbedModelInfo
from
tests.models.utils
import
(
CLSPoolingEmbedModelInfo
,
EmbedModelInfo
,
LASTPoolingEmbedModelInfo
)
from
.mteb_utils
import
mteb_test_embed_models
# ST models with projector (Dense) layers
...
...
@@ -10,8 +12,13 @@ ST_PROJECTOR_MODELS = [
CLSPoolingEmbedModelInfo
(
"TencentBAC/Conan-embedding-v1"
,
architecture
=
"BertModel"
,
mteb_score
=
0.688611955
,
enable_test
=
True
,
),
LASTPoolingEmbedModelInfo
(
"google/embeddinggemma-300m"
,
architecture
=
"Gemma3TextModel"
,
mteb_score
=
0.7473819294684156
,
enable_test
=
True
)
]
...
...
tests/models/multimodal/generation/test_pixtral.py
View file @
38d80967
...
...
@@ -29,10 +29,10 @@ MISTRAL_SMALL_3_1_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
MODELS
=
[
PIXTRAL_ID
,
MISTRAL_SMALL_3_1_ID
]
IMG_URLS
=
[
"https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/237-400x300.jpg"
,
"https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/23
1-2
00x300.jpg"
,
"https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/27-
5
00x
5
00.jpg"
,
"https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/
17-15
0x
6
00.jpg"
,
"237-400x300.jpg"
,
#
"https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/237-400x300.jpg",
"231-200x300.jpg"
,
#
"https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/23
7-4
00x300.jpg",
"27-500x500.jpg"
,
#
"https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/2
3
7-
4
00x
3
00.jpg",
"17-150x600.jpg"
,
#
"https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/
237-40
0x
3
00.jpg",
]
PROMPT
=
"Describe each image in one short sentence."
...
...
@@ -105,12 +105,6 @@ def _create_engine_inputs_hf(urls: list[str]) -> TextPrompt:
return
engine_inputs
MSGS
=
[
_create_msg_format
(
IMG_URLS
[:
1
]),
_create_msg_format
(
IMG_URLS
[:
2
]),
_create_msg_format
(
IMG_URLS
),
]
SAMPLING_PARAMS
=
SamplingParams
(
max_tokens
=
512
,
temperature
=
0.0
,
logprobs
=
5
)
LIMIT_MM_PER_PROMPT
=
dict
(
image
=
4
)
...
...
@@ -156,12 +150,8 @@ def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"max_model_len"
,
MAX_MODEL_LEN
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
def
test_chat
(
vllm_runner
,
max_model_len
:
int
,
model
:
str
,
dtype
:
str
,
)
->
None
:
def
test_chat
(
vllm_runner
,
max_model_len
:
int
,
model
:
str
,
dtype
:
str
,
local_asset_server
)
->
None
:
EXPECTED_CHAT_LOGPROBS
=
load_outputs_w_logprobs
(
FIXTURE_LOGPROBS_CHAT
[
model
])
with
vllm_runner
(
...
...
@@ -174,7 +164,14 @@ def test_chat(
limit_mm_per_prompt
=
LIMIT_MM_PER_PROMPT
,
)
as
vllm_model
:
outputs
=
[]
for
msg
in
MSGS
:
urls_all
=
[
local_asset_server
.
url_for
(
u
)
for
u
in
IMG_URLS
]
msgs
=
[
_create_msg_format
(
urls_all
[:
1
]),
_create_msg_format
(
urls_all
[:
2
]),
_create_msg_format
(
urls_all
),
]
for
msg
in
msgs
:
output
=
vllm_model
.
llm
.
chat
(
msg
,
sampling_params
=
SAMPLING_PARAMS
)
outputs
.
extend
(
output
)
...
...
@@ -190,17 +187,19 @@ def test_chat(
name_1
=
"output"
)
@
pytest
.
mark
.
parametrize
(
"prompt,expected_ranges"
,
[(
_create_engine_inputs_hf
(
IMG_URLS
[:
1
])
,
[
PlaceholderRange
(
offset
=
11
,
length
=
494
)]),
(
_create_engine_inputs_hf
(
IMG_URLS
[
1
:
4
]
)
,
[
PlaceholderRange
(
offset
=
11
,
length
=
266
),
PlaceholderRange
(
offset
=
277
,
length
=
1056
),
PlaceholderRange
(
offset
=
1333
,
length
=
418
)
])])
def
test_multi_modal_placeholders
(
vllm_runner
,
prompt
:
TextPrompt
,
@
pytest
.
mark
.
parametrize
(
"image_urls,expected_ranges"
,
[(
IMG_URLS
[:
1
],
[
PlaceholderRange
(
offset
=
11
,
length
=
494
)]),
(
IMG_URLS
[
1
:
4
],
[
PlaceholderRange
(
offset
=
11
,
length
=
266
),
PlaceholderRange
(
offset
=
277
,
length
=
1056
),
PlaceholderRange
(
offset
=
1333
,
length
=
418
)
])])
def
test_multi_modal_placeholders
(
vllm_runner
,
image_urls
:
list
[
str
]
,
expected_ranges
:
list
[
PlaceholderRange
],
monkeypatch
)
->
None
:
local_asset_server
,
monkeypatch
)
->
None
:
local_image_urls
=
[
local_asset_server
.
url_for
(
u
)
for
u
in
image_urls
]
prompt
=
_create_engine_inputs_hf
(
local_image_urls
)
# This placeholder checking test only works with V0 engine
# where `multi_modal_placeholders` is returned with `RequestOutput`
...
...
tests/models/multimodal/generation/test_qwen2_vl.py
View file @
38d80967
...
...
@@ -154,7 +154,7 @@ def batch_make_image_embeddings(
embed_counter
+=
cur_batch_embed_len
image_counter
+=
cur_batch_image_count
# ensure we don't los
t
any images or embeddings
# ensure we don't los
e
any images or embeddings
assert
embed_counter
==
image_embeds
.
size
(
0
)
assert
image_counter
==
image_grid_thw
.
size
(
0
)
assert
len
(
image_batches
)
==
len
(
result
)
...
...
@@ -238,7 +238,7 @@ def batch_make_video_embeddings(
embed_counter
+=
cur_batch_embed_len
video_counter
+=
cur_batch_video_count
# ensure we don't los
t
any videos or embeddings
# ensure we don't los
e
any videos or embeddings
assert
embed_counter
==
video_embeds
.
size
(
0
)
assert
video_counter
==
video_grid_thw
.
size
(
0
)
assert
len
(
video_batches
)
==
len
(
result
)
...
...
tests/models/multimodal/generation/test_whisper.py
View file @
38d80967
...
...
@@ -122,8 +122,7 @@ def run_test(
@
pytest
.
mark
.
core_model
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"openai/whisper-small"
,
"openai/whisper-large-v3-turbo"
])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"openai/whisper-large-v3-turbo"
])
@
create_new_process_for_each_test
()
def
test_models
(
vllm_runner
,
model
)
->
None
:
run_test
(
...
...
Prev
1
…
10
11
12
13
14
15
16
17
18
…
28
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment