Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
7e63ef82
Commit
7e63ef82
authored
Jan 21, 2026
by
zhuwenwen
Browse files
Merge tag 'v0.14.0' into v0.14.0-dev
parents
8cbcac5d
b17039bc
Changes
681
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
553 additions
and
254 deletions
+553
-254
tests/models/language/pooling_mteb_test/test_baai.py
tests/models/language/pooling_mteb_test/test_baai.py
+38
-37
tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py
.../language/pooling_mteb_test/test_bge_reranker_v2_gemma.py
+34
-43
tests/models/language/pooling_mteb_test/test_cross_encoder.py
...s/models/language/pooling_mteb_test/test_cross_encoder.py
+16
-11
tests/models/language/pooling_mteb_test/test_gte.py
tests/models/language/pooling_mteb_test/test_gte.py
+47
-31
tests/models/language/pooling_mteb_test/test_intfloat.py
tests/models/language/pooling_mteb_test/test_intfloat.py
+17
-13
tests/models/language/pooling_mteb_test/test_jina.py
tests/models/language/pooling_mteb_test/test_jina.py
+14
-10
tests/models/language/pooling_mteb_test/test_mxbai_rerank.py
tests/models/language/pooling_mteb_test/test_mxbai_rerank.py
+52
-26
tests/models/language/pooling_mteb_test/test_nemotron.py
tests/models/language/pooling_mteb_test/test_nemotron.py
+50
-0
tests/models/language/pooling_mteb_test/test_nomic.py
tests/models/language/pooling_mteb_test/test_nomic.py
+14
-6
tests/models/language/pooling_mteb_test/test_qwen3_reranker.py
.../models/language/pooling_mteb_test/test_qwen3_reranker.py
+59
-28
tests/models/language/pooling_mteb_test/test_snowflake_arctic_embed.py
...language/pooling_mteb_test/test_snowflake_arctic_embed.py
+30
-10
tests/models/language/pooling_mteb_test/test_st_projector.py
tests/models/language/pooling_mteb_test/test_st_projector.py
+11
-6
tests/models/multimodal/conftest.py
tests/models/multimodal/conftest.py
+4
-7
tests/models/multimodal/generation/test_common.py
tests/models/multimodal/generation/test_common.py
+59
-6
tests/models/multimodal/generation/test_granite_speech.py
tests/models/multimodal/generation/test_granite_speech.py
+10
-4
tests/models/multimodal/generation/test_keye.py
tests/models/multimodal/generation/test_keye.py
+2
-5
tests/models/multimodal/generation/test_nemotron_parse.py
tests/models/multimodal/generation/test_nemotron_parse.py
+89
-0
tests/models/multimodal/generation/test_qwen2_vl.py
tests/models/multimodal/generation/test_qwen2_vl.py
+1
-1
tests/models/multimodal/generation/test_vit_backend_functionality.py
...s/multimodal/generation/test_vit_backend_functionality.py
+5
-10
tests/models/multimodal/generation/test_voxtral.py
tests/models/multimodal/generation/test_voxtral.py
+1
-0
No files found.
Too many changes to show.
To preserve performance only
681 of 681+
files are displayed.
Plain diff
Email patch
tests/models/language/pooling_mteb_test/test_baai.py
View file @
7e63ef82
...
...
@@ -4,90 +4,93 @@ import pytest
from
tests.models.language.pooling.embed_utils
import
correctness_test_embed_models
from
tests.models.utils
import
(
CLSPoolingEmbedModelInfo
,
CLSPoolingRerankModelInfo
,
EmbedModelInfo
,
LASTPoolingEmbedModelInfo
,
RerankModelInfo
,
)
from
.mteb_utils
import
mteb_test_embed_models
,
mteb_test_rerank_models
from
.mteb_embed_utils
import
mteb_test_embed_models
from
.mteb_score_utils
import
mteb_test_rerank_models
MODELS
=
[
########## BertModel
CLSPooling
EmbedModelInfo
(
EmbedModelInfo
(
"BAAI/bge-base-en"
,
architecture
=
"BertModel"
,
mteb_score
=
0.779336792
,
seq_pooling_type
=
"CLS"
,
attn_type
=
"encoder_only"
,
is_prefix_caching_supported
=
False
,
is_chunked_prefill_supported
=
False
,
enable_test
=
True
,
),
CLSPoolingEmbedModelInfo
(
"BAAI/bge-base-zh"
,
architecture
=
"BertModel"
,
enable_test
=
False
),
CLSPoolingEmbedModelInfo
(
"BAAI/bge-small-en"
,
architecture
=
"BertModel"
,
enable_test
=
False
),
CLSPoolingEmbedModelInfo
(
"BAAI/bge-small-zh"
,
architecture
=
"BertModel"
,
enable_test
=
False
),
CLSPoolingEmbedModelInfo
(
"BAAI/bge-large-en"
,
architecture
=
"BertModel"
,
enable_test
=
False
),
CLSPoolingEmbedModelInfo
(
"BAAI/bge-large-zh"
,
architecture
=
"BertModel"
,
enable_test
=
False
),
CLSPoolingEmbedModelInfo
(
EmbedModelInfo
(
"BAAI/bge-base-zh"
,
architecture
=
"BertModel"
,
enable_test
=
False
),
EmbedModelInfo
(
"BAAI/bge-small-en"
,
architecture
=
"BertModel"
,
enable_test
=
False
),
EmbedModelInfo
(
"BAAI/bge-small-zh"
,
architecture
=
"BertModel"
,
enable_test
=
False
),
EmbedModelInfo
(
"BAAI/bge-large-en"
,
architecture
=
"BertModel"
,
enable_test
=
False
),
EmbedModelInfo
(
"BAAI/bge-large-zh"
,
architecture
=
"BertModel"
,
enable_test
=
False
),
EmbedModelInfo
(
"BAAI/bge-large-zh-noinstruct"
,
architecture
=
"BertModel"
,
enable_test
=
False
),
CLSPooling
EmbedModelInfo
(
EmbedModelInfo
(
"BAAI/bge-base-en-v1.5"
,
architecture
=
"BertModel"
,
enable_test
=
False
),
CLSPooling
EmbedModelInfo
(
EmbedModelInfo
(
"BAAI/bge-base-zh-v1.5"
,
architecture
=
"BertModel"
,
enable_test
=
False
),
CLSPooling
EmbedModelInfo
(
EmbedModelInfo
(
"BAAI/bge-small-en-v1.5"
,
architecture
=
"BertModel"
,
enable_test
=
False
),
CLSPooling
EmbedModelInfo
(
EmbedModelInfo
(
"BAAI/bge-small-zh-v1.5"
,
architecture
=
"BertModel"
,
enable_test
=
False
),
CLSPooling
EmbedModelInfo
(
EmbedModelInfo
(
"BAAI/bge-large-en-v1.5"
,
architecture
=
"BertModel"
,
enable_test
=
False
),
CLSPooling
EmbedModelInfo
(
EmbedModelInfo
(
"BAAI/bge-large-zh-v1.5"
,
architecture
=
"BertModel"
,
enable_test
=
False
),
########## XLMRobertaModel
CLSPooling
EmbedModelInfo
(
EmbedModelInfo
(
"BAAI/bge-m3"
,
architecture
=
"XLMRobertaModel"
,
mteb_score
=
0.787343078
,
seq_pooling_type
=
"CLS"
,
attn_type
=
"encoder_only"
,
is_prefix_caching_supported
=
False
,
is_chunked_prefill_supported
=
False
,
enable_test
=
True
,
),
########## Qwen2Model
LASTPooling
EmbedModelInfo
(
EmbedModelInfo
(
"BAAI/bge-code-v1"
,
architecture
=
"Qwen2Model"
,
mteb_score
=
0.75724465
,
dtype
=
"float32"
,
seq_pooling_type
=
"LAST"
,
attn_type
=
"decoder"
,
is_prefix_caching_supported
=
True
,
is_chunked_prefill_supported
=
True
,
enable_test
=
True
,
),
]
RERANK_MODELS
=
[
########## XLMRobertaForSequenceClassification
CLSPooling
RerankModelInfo
(
RerankModelInfo
(
"BAAI/bge-reranker-base"
,
architecture
=
"XLMRobertaForSequenceClassification"
,
mteb_score
=
0.32398
,
seq_pooling_type
=
"CLS"
,
attn_type
=
"encoder_only"
,
is_prefix_caching_supported
=
False
,
is_chunked_prefill_supported
=
False
,
enable_test
=
True
,
),
CLSPooling
RerankModelInfo
(
RerankModelInfo
(
"BAAI/bge-reranker-large"
,
architecture
=
"XLMRobertaForSequenceClassification"
,
enable_test
=
False
,
),
CLSPooling
RerankModelInfo
(
RerankModelInfo
(
"BAAI/bge-reranker-v2-m3"
,
architecture
=
"XLMRobertaForSequenceClassification"
,
enable_test
=
False
,
...
...
@@ -108,7 +111,5 @@ def test_embed_models_correctness(
@
pytest
.
mark
.
parametrize
(
"model_info"
,
RERANK_MODELS
)
def
test_rerank_models_mteb
(
hf_runner
,
vllm_runner
,
model_info
:
RerankModelInfo
)
->
None
:
mteb_test_rerank_models
(
hf_runner
,
vllm_runner
,
model_info
)
def
test_rerank_models_mteb
(
vllm_runner
,
model_info
:
RerankModelInfo
)
->
None
:
mteb_test_rerank_models
(
vllm_runner
,
model_info
)
tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py
View file @
7e63ef82
...
...
@@ -9,40 +9,62 @@ import torch
from
torch.utils.data
import
DataLoader
from
tests.conftest
import
HfRunner
from
tests.models.language.pooling_mteb_test.mteb_utils
import
(
VllmMtebCrossEncoder
,
from
tests.models.utils
import
RerankModelInfo
from
.mteb_score_utils
import
(
MtebCrossEncoderMixin
,
mteb_test_rerank_models
,
)
from
tests.models.utils
import
LASTPoolingRerankModelInfo
,
RerankModelInfo
RERANK_MODELS
=
[
LASTPooling
RerankModelInfo
(
RerankModelInfo
(
"BAAI/bge-reranker-v2-gemma"
,
architecture
=
"GemmaForSequenceClassification"
,
mteb_score
=
0.33757
,
hf_overrides
=
{
"architectures"
:
[
"GemmaForSequenceClassification"
],
"classifier_from_token"
:
[
"Yes"
],
"method"
:
"no_post_processing"
,
},
mteb_score
=
0.33757
,
seq_pooling_type
=
"LAST"
,
attn_type
=
"decoder"
,
is_prefix_caching_supported
=
True
,
is_chunked_prefill_supported
=
True
,
chat_template_name
=
"bge-reranker-v2-gemma.jinja"
,
),
]
PROMPT
=
"Given a query A and a passage B, determine whether the passage contains an answer to the query by providing a prediction of either 'Yes' or 'No'."
# noqa: E501
class
GemmaRerankerHfRunner
(
HfRunner
):
class
GemmaRerankerHfRunner
(
MtebCrossEncoderMixin
,
HfRunner
):
def
__init__
(
self
,
model_name
:
str
,
dtype
:
str
=
"auto"
,
*
args
:
Any
,
**
kwargs
:
Any
)
->
None
:
from
transformers
import
AutoModelForCausalLM
,
AutoTokenizer
super
().
__init__
(
model_name
,
dtype
,
auto_cls
=
AutoModelForCausalLM
)
HfRunner
.
__init__
(
self
,
model_name
=
model_name
,
auto_cls
=
AutoModelForCausalLM
,
dtype
=
dtype
,
**
kwargs
,
)
self
.
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
,
padding_side
=
"left"
)
self
.
yes_loc
=
self
.
tokenizer
.
convert_tokens_to_ids
(
"Yes"
)
@
torch
.
no_grad
()
def
predict
(
self
,
prompts
:
list
[
list
[
str
]],
*
args
,
**
kwargs
)
->
torch
.
Tensor
:
@
torch
.
no_grad
def
predict
(
self
,
inputs1
:
DataLoader
[
mteb
.
types
.
BatchedInput
],
inputs2
:
DataLoader
[
mteb
.
types
.
BatchedInput
],
*
args
,
**
kwargs
,
)
->
np
.
ndarray
:
queries
=
[
text
for
batch
in
inputs1
for
text
in
batch
[
"text"
]]
corpus
=
[
text
for
batch
in
inputs2
for
text
in
batch
[
"text"
]]
def
get_inputs
(
pairs
,
tokenizer
,
prompt
=
None
):
if
prompt
is
None
:
prompt
=
PROMPT
...
...
@@ -87,8 +109,8 @@ class GemmaRerankerHfRunner(HfRunner):
)
scores
=
[]
for
query
,
doc
,
*
_
in
prompts
:
pairs
=
[(
query
,
doc
)]
for
query
,
doc
ument
in
zip
(
queries
,
corpus
)
:
pairs
=
[(
query
,
doc
ument
)]
inputs
=
get_inputs
(
pairs
,
self
.
tokenizer
)
inputs
=
inputs
.
to
(
self
.
model
.
device
)
_n_tokens
=
inputs
[
"input_ids"
].
shape
[
1
]
...
...
@@ -105,41 +127,10 @@ class GemmaRerankerHfRunner(HfRunner):
return
torch
.
Tensor
(
scores
)
class
GemmaMtebEncoder
(
VllmMtebCrossEncoder
):
def
__init__
(
self
,
*
args
,
**
kwargs
):
super
().
__init__
(
*
args
,
**
kwargs
)
self
.
query_template
=
"A: {query}
\n
"
self
.
document_template
=
"B: {doc}
\n
{prompt}"
def
predict
(
self
,
inputs1
:
DataLoader
[
mteb
.
types
.
BatchedInput
],
inputs2
:
DataLoader
[
mteb
.
types
.
BatchedInput
],
*
args
,
**
kwargs
,
)
->
np
.
ndarray
:
queries
=
[
self
.
query_template
.
format
(
query
=
text
)
for
batch
in
inputs1
for
text
in
batch
[
"text"
]
]
corpus
=
[
self
.
document_template
.
format
(
doc
=
text
,
prompt
=
PROMPT
)
for
batch
in
inputs2
for
text
in
batch
[
"text"
]
]
outputs
=
self
.
llm
.
score
(
queries
,
corpus
,
truncate_prompt_tokens
=-
1
,
use_tqdm
=
False
)
scores
=
np
.
array
(
outputs
)
return
scores
@
pytest
.
mark
.
parametrize
(
"model_info"
,
RERANK_MODELS
)
def
test_rerank_models_mteb
(
vllm_runner
,
model_info
:
RerankModelInfo
)
->
None
:
mteb_test_rerank_models
(
GemmaRerankerHfRunner
,
vllm_runner
,
model_info
,
vllm_mteb_encoder
=
GemmaMtebEncod
er
,
hf_runner
=
GemmaRerankerHfRunn
er
,
)
tests/models/language/pooling_mteb_test/test_cross_encoder.py
View file @
7e63ef82
...
...
@@ -3,29 +3,34 @@
import
pytest
from
tests.models.utils
import
(
CLSPoolingRerankModelInfo
,
LASTPoolingRerankModelInfo
,
RerankModelInfo
,
)
from
.mteb_utils
import
mteb_test_rerank_models
from
.mteb_
score_
utils
import
mteb_test_rerank_models
RERANK_MODELS
=
[
CLSPooling
RerankModelInfo
(
RerankModelInfo
(
"cross-encoder/ms-marco-TinyBERT-L-2-v2"
,
mteb_score
=
0.32898
,
architecture
=
"BertForSequenceClassification"
,
seq_pooling_type
=
"CLS"
,
attn_type
=
"encoder_only"
,
is_prefix_caching_supported
=
False
,
is_chunked_prefill_supported
=
False
,
mteb_score
=
0.32898
,
),
LASTPooling
RerankModelInfo
(
RerankModelInfo
(
"tomaarsen/Qwen3-Reranker-0.6B-seq-cls"
,
mteb_score
=
0.25736
,
architecture
=
"Qwen3ForSequenceClassification"
,
seq_pooling_type
=
"LAST"
,
attn_type
=
"decoder"
,
is_prefix_caching_supported
=
True
,
is_chunked_prefill_supported
=
True
,
chat_template_name
=
"qwen3_reranker.jinja"
,
mteb_score
=
0.33459
,
),
]
@
pytest
.
mark
.
parametrize
(
"model_info"
,
RERANK_MODELS
)
def
test_rerank_models_mteb
(
hf_runner
,
vllm_runner
,
model_info
:
RerankModelInfo
)
->
None
:
mteb_test_rerank_models
(
hf_runner
,
vllm_runner
,
model_info
)
def
test_rerank_models_mteb
(
vllm_runner
,
model_info
:
RerankModelInfo
)
->
None
:
mteb_test_rerank_models
(
vllm_runner
,
model_info
)
tests/models/language/pooling_mteb_test/test_gte.py
View file @
7e63ef82
...
...
@@ -5,36 +5,32 @@ import pytest
from
tests.models.language.pooling.embed_utils
import
correctness_test_embed_models
from
tests.models.utils
import
(
CLSPoolingEmbedModelInfo
,
CLSPoolingRerankModelInfo
,
EmbedModelInfo
,
LASTPoolingEmbedModelInfo
,
RerankModelInfo
,
)
from
.mteb_utils
import
mteb_test_embed_models
,
mteb_test_rerank_models
from
.mteb_embed_utils
import
mteb_test_embed_models
from
.mteb_score_utils
import
mteb_test_rerank_models
MODELS
=
[
########## BertModel
CLSPooling
EmbedModelInfo
(
EmbedModelInfo
(
"thenlper/gte-large"
,
mteb_score
=
0.76807651
,
architecture
=
"BertModel"
,
seq_pooling_type
=
"MEAN"
,
attn_type
=
"encoder_only"
,
is_prefix_caching_supported
=
False
,
is_chunked_prefill_supported
=
False
,
enable_test
=
True
,
),
CLSPoolingEmbedModelInfo
(
"thenlper/gte-base"
,
architecture
=
"BertModel"
,
enable_test
=
False
),
CLSPoolingEmbedModelInfo
(
"thenlper/gte-small"
,
architecture
=
"BertModel"
,
enable_test
=
False
),
CLSPoolingEmbedModelInfo
(
EmbedModelInfo
(
"thenlper/gte-base"
,
architecture
=
"BertModel"
,
enable_test
=
False
),
EmbedModelInfo
(
"thenlper/gte-small"
,
architecture
=
"BertModel"
,
enable_test
=
False
),
EmbedModelInfo
(
"thenlper/gte-large-zh"
,
architecture
=
"BertModel"
,
enable_test
=
False
),
CLSPoolingEmbedModelInfo
(
"thenlper/gte-base-zh"
,
architecture
=
"BertModel"
,
enable_test
=
False
),
CLSPoolingEmbedModelInfo
(
EmbedModelInfo
(
"thenlper/gte-base-zh"
,
architecture
=
"BertModel"
,
enable_test
=
False
),
EmbedModelInfo
(
"thenlper/gte-small-zh"
,
architecture
=
"BertModel"
,
enable_test
=
False
),
########### NewModel
...
...
@@ -43,68 +39,90 @@ MODELS = [
# - whether to use token_type_embeddings
# - whether to use context expansion
# So only test one (the most widely used) model
CLSPooling
EmbedModelInfo
(
EmbedModelInfo
(
"Alibaba-NLP/gte-multilingual-base"
,
architecture
=
"GteNewModel"
,
mteb_score
=
0.775074696
,
hf_overrides
=
{
"architectures"
:
[
"GteNewModel"
]},
seq_pooling_type
=
"CLS"
,
attn_type
=
"encoder_only"
,
is_prefix_caching_supported
=
False
,
is_chunked_prefill_supported
=
False
,
enable_test
=
True
,
),
CLSPooling
EmbedModelInfo
(
EmbedModelInfo
(
"Alibaba-NLP/gte-base-en-v1.5"
,
architecture
=
"GteNewModel"
,
hf_overrides
=
{
"architectures"
:
[
"GteNewModel"
]},
enable_test
=
False
,
),
CLSPooling
EmbedModelInfo
(
EmbedModelInfo
(
"Alibaba-NLP/gte-large-en-v1.5"
,
architecture
=
"GteNewModel"
,
hf_overrides
=
{
"architectures"
:
[
"GteNewModel"
]},
enable_test
=
False
,
),
########### Qwen2ForCausalLM
LASTPooling
EmbedModelInfo
(
EmbedModelInfo
(
"Alibaba-NLP/gte-Qwen2-1.5B-instruct"
,
mteb_score
=
0.758473459018872
,
architecture
=
"Qwen2ForCausalLM"
,
seq_pooling_type
=
"LAST"
,
attn_type
=
"encoder_only"
,
is_prefix_caching_supported
=
False
,
is_chunked_prefill_supported
=
False
,
enable_test
=
True
,
),
########## ModernBertModel
CLSPooling
EmbedModelInfo
(
EmbedModelInfo
(
"Alibaba-NLP/gte-modernbert-base"
,
mteb_score
=
0.748193353
,
architecture
=
"ModernBertModel"
,
seq_pooling_type
=
"CLS"
,
attn_type
=
"encoder_only"
,
is_prefix_caching_supported
=
False
,
is_chunked_prefill_supported
=
False
,
enable_test
=
True
,
),
########## Qwen3ForCausalLM
LASTPooling
EmbedModelInfo
(
EmbedModelInfo
(
"Qwen/Qwen3-Embedding-0.6B"
,
mteb_score
=
0.771163695
,
architecture
=
"Qwen3ForCausalLM"
,
dtype
=
"float32"
,
seq_pooling_type
=
"LAST"
,
attn_type
=
"decoder"
,
is_prefix_caching_supported
=
True
,
is_chunked_prefill_supported
=
True
,
enable_test
=
True
,
),
LASTPooling
EmbedModelInfo
(
EmbedModelInfo
(
"Qwen/Qwen3-Embedding-4B"
,
architecture
=
"Qwen3ForCausalLM"
,
dtype
=
"float32"
,
enable_test
=
False
,
),
]
RERANK_MODELS
=
[
CLSPooling
RerankModelInfo
(
RerankModelInfo
(
# classifier_pooling: mean
"Alibaba-NLP/gte-reranker-modernbert-base"
,
mteb_score
=
0.33386
,
architecture
=
"ModernBertForSequenceClassification"
,
seq_pooling_type
=
"CLS"
,
attn_type
=
"encoder_only"
,
is_prefix_caching_supported
=
False
,
is_chunked_prefill_supported
=
False
,
enable_test
=
True
,
),
CLSPooling
RerankModelInfo
(
RerankModelInfo
(
"Alibaba-NLP/gte-multilingual-reranker-base"
,
mteb_score
=
0.33062
,
architecture
=
"GteNewForSequenceClassification"
,
hf_overrides
=
{
"architectures"
:
[
"GteNewForSequenceClassification"
]},
seq_pooling_type
=
"CLS"
,
attn_type
=
"encoder_only"
,
is_prefix_caching_supported
=
False
,
is_chunked_prefill_supported
=
False
,
enable_test
=
True
,
),
]
...
...
@@ -123,7 +141,5 @@ def test_embed_models_correctness(
@
pytest
.
mark
.
parametrize
(
"model_info"
,
RERANK_MODELS
)
def
test_rerank_models_mteb
(
hf_runner
,
vllm_runner
,
model_info
:
RerankModelInfo
)
->
None
:
mteb_test_rerank_models
(
hf_runner
,
vllm_runner
,
model_info
)
def
test_rerank_models_mteb
(
vllm_runner
,
model_info
:
RerankModelInfo
)
->
None
:
mteb_test_rerank_models
(
vllm_runner
,
model_info
)
tests/models/language/pooling_mteb_test/test_intfloat.py
View file @
7e63ef82
...
...
@@ -3,40 +3,44 @@
import
pytest
from
tests.models.language.pooling.embed_utils
import
correctness_test_embed_models
from
tests.models.utils
import
CLSPoolingEmbedModelInfo
,
EmbedModelInfo
from
tests.models.utils
import
EmbedModelInfo
from
.mteb_utils
import
mteb_test_embed_models
from
.mteb_
embed_
utils
import
mteb_test_embed_models
MODELS
=
[
########## BertModel
CLSPooling
EmbedModelInfo
(
EmbedModelInfo
(
"intfloat/e5-small"
,
architecture
=
"BertModel"
,
mteb_score
=
0.742285423
,
seq_pooling_type
=
"MEAN"
,
attn_type
=
"encoder_only"
,
is_prefix_caching_supported
=
False
,
is_chunked_prefill_supported
=
False
,
enable_test
=
True
,
),
CLSPoolingEmbedModelInfo
(
"intfloat/e5-base"
,
architecture
=
"BertModel"
,
enable_test
=
False
),
CLSPoolingEmbedModelInfo
(
"intfloat/e5-large"
,
architecture
=
"BertModel"
,
enable_test
=
False
),
CLSPoolingEmbedModelInfo
(
EmbedModelInfo
(
"intfloat/e5-base"
,
architecture
=
"BertModel"
,
enable_test
=
False
),
EmbedModelInfo
(
"intfloat/e5-large"
,
architecture
=
"BertModel"
,
enable_test
=
False
),
EmbedModelInfo
(
"intfloat/multilingual-e5-small"
,
architecture
=
"BertModel"
,
enable_test
=
False
),
########## XLMRobertaModel
CLSPooling
EmbedModelInfo
(
EmbedModelInfo
(
"intfloat/multilingual-e5-base"
,
architecture
=
"XLMRobertaModel"
,
mteb_score
=
0.779325955
,
seq_pooling_type
=
"MEAN"
,
attn_type
=
"encoder_only"
,
is_prefix_caching_supported
=
False
,
is_chunked_prefill_supported
=
False
,
enable_test
=
True
,
),
CLSPooling
EmbedModelInfo
(
EmbedModelInfo
(
"intfloat/multilingual-e5-large"
,
architecture
=
"XLMRobertaModel"
,
enable_test
=
False
,
),
CLSPooling
EmbedModelInfo
(
EmbedModelInfo
(
"intfloat/multilingual-e5-large-instruct"
,
architecture
=
"XLMRobertaModel"
,
enable_test
=
False
,
...
...
tests/models/language/pooling_mteb_test/test_jina.py
View file @
7e63ef82
...
...
@@ -10,30 +10,36 @@ from tests.models.language.pooling.embed_utils import (
matryoshka_fy
,
)
from
tests.models.utils
import
(
CLSPoolingEmbedModelInfo
,
CLSPoolingRerankModelInfo
,
EmbedModelInfo
,
RerankModelInfo
,
)
from
vllm
import
PoolingParams
from
.mteb_utils
import
mteb_test_embed_models
,
mteb_test_rerank_models
from
.mteb_embed_utils
import
mteb_test_embed_models
from
.mteb_score_utils
import
mteb_test_rerank_models
EMBEDDING_MODELS
=
[
CLSPooling
EmbedModelInfo
(
EmbedModelInfo
(
"jinaai/jina-embeddings-v3"
,
mteb_score
=
0.824413164
,
architecture
=
"XLMRobertaModel"
,
is_matryoshka
=
True
,
dtype
=
"float32"
,
seq_pooling_type
=
"MEAN"
,
attn_type
=
"encoder_only"
,
is_prefix_caching_supported
=
False
,
is_chunked_prefill_supported
=
False
,
)
]
RERANK_MODELS
=
[
CLSPooling
RerankModelInfo
(
RerankModelInfo
(
"jinaai/jina-reranker-v2-base-multilingual"
,
mteb_score
=
0.33643
,
architecture
=
"XLMRobertaForSequenceClassification"
,
seq_pooling_type
=
"CLS"
,
attn_type
=
"encoder_only"
,
is_prefix_caching_supported
=
False
,
is_chunked_prefill_supported
=
False
,
)
]
...
...
@@ -65,10 +71,8 @@ def test_embed_models_correctness(
@
pytest
.
mark
.
parametrize
(
"model_info"
,
RERANK_MODELS
)
def
test_rerank_models_mteb
(
hf_runner
,
vllm_runner
,
model_info
:
RerankModelInfo
)
->
None
:
mteb_test_rerank_models
(
hf_runner
,
vllm_runner
,
model_info
)
def
test_rerank_models_mteb
(
vllm_runner
,
model_info
:
RerankModelInfo
)
->
None
:
mteb_test_rerank_models
(
vllm_runner
,
model_info
)
@
pytest
.
mark
.
parametrize
(
"model_info"
,
EMBEDDING_MODELS
)
...
...
tests/models/language/pooling_mteb_test/test_mxbai_rerank.py
View file @
7e63ef82
...
...
@@ -2,13 +2,16 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
typing
import
Any
import
mteb
import
numpy
as
np
import
pytest
import
torch
from
torch.utils.data
import
DataLoader
from
tests.conftest
import
HfRunner
from
tests.models.utils
import
LASTPoolingRerankModelInfo
,
RerankModelInfo
from
tests.models.utils
import
RerankModelInfo
from
.mteb_utils
import
mteb_test_rerank_models
from
.mteb_
score_
utils
import
MtebCrossEncoderMixin
,
mteb_test_rerank_models
mxbai_rerank_hf_overrides
=
{
"architectures"
:
[
"Qwen2ForSequenceClassification"
],
...
...
@@ -17,50 +20,73 @@ mxbai_rerank_hf_overrides = {
}
RERANK_MODELS
=
[
LASTPooling
RerankModelInfo
(
RerankModelInfo
(
"mixedbread-ai/mxbai-rerank-base-v2"
,
architecture
=
"Qwen2ForSequenceClassification"
,
hf_overrides
=
mxbai_rerank_hf_overrides
,
mteb_score
=
0.273
,
seq_pooling_type
=
"LAST"
,
attn_type
=
"decoder"
,
is_prefix_caching_supported
=
True
,
is_chunked_prefill_supported
=
True
,
chat_template_name
=
"mxbai_rerank_v2.jinja"
,
mteb_score
=
0.33651
,
enable_test
=
True
,
),
LASTPooling
RerankModelInfo
(
RerankModelInfo
(
"mixedbread-ai/mxbai-rerank-large-v2"
,
architecture
=
"Qwen2ForSequenceClassification"
,
hf_overrides
=
mxbai_rerank_hf_overrides
,
chat_template_name
=
"mxbai_rerank_v2.jinja"
,
enable_test
=
False
,
),
]
class
MxbaiRerankerHfRunner
(
HfRunner
):
class
MxbaiRerankerHfRunner
(
MtebCrossEncoderMixin
,
HfRunner
):
def
__init__
(
self
,
model_name
:
str
,
dtype
:
str
=
"auto"
,
*
args
:
Any
,
**
kwargs
:
Any
)
->
None
:
from
transformers
import
AutoModelForCausalLM
,
AutoTokenizer
super
().
__init__
(
model_name
,
dtype
,
auto_cls
=
AutoModelForCausalLM
)
HfRunner
.
__init__
(
self
,
model_name
=
model_name
,
auto_cls
=
AutoModelForCausalLM
,
dtype
=
dtype
,
**
kwargs
,
)
self
.
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
,
padding_side
=
"left"
)
self
.
yes_loc
=
self
.
tokenizer
.
convert_tokens_to_ids
(
"1"
)
self
.
no_loc
=
self
.
tokenizer
.
convert_tokens_to_ids
(
"0"
)
def
predict
(
self
,
prompts
:
list
[
list
[
str
]],
*
args
,
**
kwargs
)
->
torch
.
Tensor
:
def
process_inputs
(
pairs
):
inputs
=
self
.
tokenizer
(
pairs
,
padding
=
False
,
truncation
=
"longest_first"
,
return_attention_mask
=
False
,
@
torch
.
no_grad
def
predict
(
self
,
inputs1
:
DataLoader
[
mteb
.
types
.
BatchedInput
],
inputs2
:
DataLoader
[
mteb
.
types
.
BatchedInput
],
*
args
,
**
kwargs
,
)
->
np
.
ndarray
:
queries
=
[
text
for
batch
in
inputs1
for
text
in
batch
[
"text"
]]
corpus
=
[
text
for
batch
in
inputs2
for
text
in
batch
[
"text"
]]
tokenizer
=
self
.
tokenizer
prompts
=
[]
for
query
,
document
in
zip
(
queries
,
corpus
):
conversation
=
[
{
"role"
:
"query"
,
"content"
:
query
},
{
"role"
:
"document"
,
"content"
:
document
},
]
prompt
=
tokenizer
.
apply_chat_template
(
conversation
=
conversation
,
tools
=
None
,
chat_template
=
self
.
chat_template
,
tokenize
=
False
,
)
for
i
,
ele
in
enumerate
(
inputs
[
"input_ids"
]):
inputs
[
"input_ids"
][
i
]
=
ele
inputs
=
self
.
tokenizer
.
pad
(
inputs
,
padding
=
True
,
return_tensors
=
"pt"
)
for
key
in
inputs
:
inputs
[
key
]
=
inputs
[
key
].
to
(
self
.
model
.
device
)
return
inputs
@
torch
.
no_grad
()
prompts
.
append
(
prompt
)
def
compute_logits
(
inputs
):
logits
=
self
.
model
(
**
inputs
).
logits
[:,
-
1
,
:]
yes_logits
=
logits
[:,
self
.
yes_loc
]
...
...
@@ -70,9 +96,9 @@ class MxbaiRerankerHfRunner(HfRunner):
return
scores
scores
=
[]
for
query
,
doc
,
*
_
in
prompts
:
pairs
=
[(
query
,
doc
)]
inputs
=
process_inputs
(
pair
s
)
for
prompt
in
prompts
:
inputs
=
tokenizer
([
prompt
],
return_tensors
=
"pt"
)
inputs
=
self
.
wrap_device
(
input
s
)
score
=
compute_logits
(
inputs
)
scores
.
append
(
score
[
0
].
item
())
return
torch
.
Tensor
(
scores
)
...
...
@@ -80,4 +106,4 @@ class MxbaiRerankerHfRunner(HfRunner):
@
pytest
.
mark
.
parametrize
(
"model_info"
,
RERANK_MODELS
)
def
test_rerank_models_mteb
(
vllm_runner
,
model_info
:
RerankModelInfo
)
->
None
:
mteb_test_rerank_models
(
MxbaiRerankerHfRunner
,
vllm_runner
,
model_info
)
mteb_test_rerank_models
(
vllm_runner
,
model_info
,
hf_runner
=
MxbaiRerankerHfRunner
)
tests/models/language/pooling_mteb_test/test_nemotron.py
0 → 100644
View file @
7e63ef82
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
from
tests.models.language.pooling_mteb_test.mteb_embed_utils
import
(
mteb_test_embed_models
,
)
from
tests.models.language.pooling_mteb_test.mteb_score_utils
import
(
mteb_test_rerank_models
,
)
from
tests.models.utils
import
(
EmbedModelInfo
,
RerankModelInfo
,
)
EMBEDDING_MODELS
=
[
EmbedModelInfo
(
"nvidia/llama-nemotron-embed-1b-v2"
,
architecture
=
"LlamaBidirectionalModel"
,
mteb_score
=
0.689164662128673
,
seq_pooling_type
=
"MEAN"
,
attn_type
=
"encoder_only"
,
is_prefix_caching_supported
=
False
,
is_chunked_prefill_supported
=
False
,
)
]
RERANK_MODELS
=
[
RerankModelInfo
(
"nvidia/llama-nemotron-rerank-1b-v2"
,
architecture
=
"LlamaBidirectionalForSequenceClassification"
,
chat_template_name
=
"nemotron-rerank.jinja"
,
mteb_score
=
0.33994
,
seq_pooling_type
=
"MEAN"
,
attn_type
=
"encoder_only"
,
is_prefix_caching_supported
=
False
,
is_chunked_prefill_supported
=
False
,
),
]
@
pytest
.
mark
.
parametrize
(
"model_info"
,
EMBEDDING_MODELS
)
def
test_embed_models_mteb
(
hf_runner
,
vllm_runner
,
model_info
:
EmbedModelInfo
)
->
None
:
mteb_test_embed_models
(
hf_runner
,
vllm_runner
,
model_info
)
@
pytest
.
mark
.
parametrize
(
"model_info"
,
RERANK_MODELS
)
def
test_rerank_models_mteb
(
vllm_runner
,
model_info
:
RerankModelInfo
)
->
None
:
mteb_test_rerank_models
(
vllm_runner
,
model_info
)
tests/models/language/pooling_mteb_test/test_nomic.py
View file @
7e63ef82
...
...
@@ -4,30 +4,38 @@
import
pytest
from
tests.models.language.pooling.embed_utils
import
correctness_test_embed_models
from
tests.models.utils
import
CLSPoolingEmbedModelInfo
,
EmbedModelInfo
from
tests.models.utils
import
EmbedModelInfo
from
.mteb_utils
import
mteb_test_embed_models
from
.mteb_
embed_
utils
import
mteb_test_embed_models
MODELS
=
[
CLSPooling
EmbedModelInfo
(
EmbedModelInfo
(
"nomic-ai/nomic-embed-text-v1"
,
architecture
=
"NomicBertModel"
,
mteb_score
=
0.737568559
,
enable_test
=
True
,
seq_pooling_type
=
"MEAN"
,
attn_type
=
"encoder_only"
,
is_prefix_caching_supported
=
False
,
is_chunked_prefill_supported
=
False
,
),
CLSPooling
EmbedModelInfo
(
EmbedModelInfo
(
"nomic-ai/nomic-embed-text-v1.5"
,
architecture
=
"NomicBertModel"
,
enable_test
=
False
,
),
CLSPooling
EmbedModelInfo
(
EmbedModelInfo
(
"nomic-ai/CodeRankEmbed"
,
architecture
=
"NomicBertModel"
,
enable_test
=
False
),
CLSPooling
EmbedModelInfo
(
EmbedModelInfo
(
"nomic-ai/nomic-embed-text-v2-moe"
,
architecture
=
"NomicBertModel"
,
mteb_score
=
0.715488912
,
enable_test
=
True
,
seq_pooling_type
=
"MEAN"
,
attn_type
=
"encoder_only"
,
is_prefix_caching_supported
=
False
,
is_chunked_prefill_supported
=
False
,
),
]
...
...
tests/models/language/pooling_mteb_test/test_qwen3_reranker.py
View file @
7e63ef82
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
from
typing
import
Any
import
mteb
import
numpy
as
np
import
pytest
import
torch
from
torch.utils.data
import
DataLoader
from
tests.conftest
import
HfRunner
from
tests.models.utils
import
LASTPoolingRerankModelInfo
,
RerankModelInfo
from
tests.models.utils
import
RerankModelInfo
from
tests.utils
import
multi_gpu_test
from
.mteb_utils
import
mteb_test_rerank_models
from
.mteb_
score_
utils
import
MtebCrossEncoderMixin
,
mteb_test_rerank_models
qwen3_reranker_hf_overrides
=
{
"architectures"
:
[
"Qwen3ForSequenceClassification"
],
...
...
@@ -18,50 +22,74 @@ qwen3_reranker_hf_overrides = {
}
RERANK_MODELS
=
[
LASTPooling
RerankModelInfo
(
RerankModelInfo
(
"Qwen/Qwen3-Reranker-0.6B"
,
architecture
=
"Qwen3ForSequenceClassification"
,
mteb_score
=
0.25736
,
hf_overrides
=
qwen3_reranker_hf_overrides
,
chat_template_name
=
"qwen3_reranker.jinja"
,
seq_pooling_type
=
"LAST"
,
attn_type
=
"decoder"
,
is_prefix_caching_supported
=
True
,
is_chunked_prefill_supported
=
True
,
mteb_score
=
0.33459
,
enable_test
=
True
,
),
LASTPooling
RerankModelInfo
(
RerankModelInfo
(
"Qwen/Qwen3-Reranker-4B"
,
architecture
=
"Qwen3ForSequenceClassification"
,
chat_template_name
=
"qwen3_reranker.jinja"
,
hf_overrides
=
qwen3_reranker_hf_overrides
,
enable_test
=
False
,
),
]
class
Qwen3RerankerHfRunner
(
HfRunner
):
class
Qwen3RerankerHfRunner
(
MtebCrossEncoderMixin
,
HfRunner
):
def
__init__
(
self
,
model_name
:
str
,
dtype
:
str
=
"auto"
,
*
args
:
Any
,
**
kwargs
:
Any
)
->
None
:
from
transformers
import
AutoModelForCausalLM
,
AutoTokenizer
super
().
__init__
(
model_name
,
dtype
,
auto_cls
=
AutoModelForCausalLM
)
HfRunner
.
__init__
(
self
,
model_name
=
model_name
,
auto_cls
=
AutoModelForCausalLM
,
dtype
=
dtype
,
**
kwargs
,
)
self
.
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
,
padding_side
=
"left"
)
self
.
token_false_id
=
self
.
tokenizer
.
convert_tokens_to_ids
(
"no"
)
self
.
token_true_id
=
self
.
tokenizer
.
convert_tokens_to_ids
(
"yes"
)
def
predict
(
self
,
prompts
:
list
[
list
[
str
]],
*
args
,
**
kwargs
)
->
torch
.
Tensor
:
def
process_inputs
(
pairs
):
inputs
=
self
.
tokenizer
(
pairs
,
padding
=
False
,
truncation
=
"longest_first"
,
return_attention_mask
=
False
,
self
.
max_length
=
40960
@
torch
.
no_grad
def
predict
(
self
,
inputs1
:
DataLoader
[
mteb
.
types
.
BatchedInput
],
inputs2
:
DataLoader
[
mteb
.
types
.
BatchedInput
],
*
args
,
**
kwargs
,
)
->
np
.
ndarray
:
queries
=
[
text
for
batch
in
inputs1
for
text
in
batch
[
"text"
]]
corpus
=
[
text
for
batch
in
inputs2
for
text
in
batch
[
"text"
]]
tokenizer
=
self
.
tokenizer
prompts
=
[]
for
query
,
document
in
zip
(
queries
,
corpus
):
conversation
=
[
{
"role"
:
"query"
,
"content"
:
query
},
{
"role"
:
"document"
,
"content"
:
document
},
]
prompt
=
tokenizer
.
apply_chat_template
(
conversation
=
conversation
,
tools
=
None
,
chat_template
=
self
.
chat_template
,
tokenize
=
False
,
)
for
i
,
ele
in
enumerate
(
inputs
[
"input_ids"
]):
inputs
[
"input_ids"
][
i
]
=
ele
inputs
=
self
.
tokenizer
.
pad
(
inputs
,
padding
=
True
,
return_tensors
=
"pt"
)
for
key
in
inputs
:
inputs
[
key
]
=
inputs
[
key
].
to
(
self
.
model
.
device
)
return
inputs
@
torch
.
no_grad
()
prompts
.
append
(
prompt
)
def
compute_logits
(
inputs
):
batch_scores
=
self
.
model
(
**
inputs
).
logits
[:,
-
1
,
:]
true_vector
=
batch_scores
[:,
self
.
token_true_id
]
...
...
@@ -72,9 +100,9 @@ class Qwen3RerankerHfRunner(HfRunner):
return
scores
scores
=
[]
for
query
,
doc
,
*
_
in
prompts
:
pairs
=
[(
query
,
doc
)]
inputs
=
process_inputs
(
pair
s
)
for
prompt
in
prompts
:
inputs
=
tokenizer
([
prompt
],
return_tensors
=
"pt"
)
inputs
=
self
.
wrap_device
(
input
s
)
score
=
compute_logits
(
inputs
)
scores
.
append
(
score
[
0
].
item
())
return
torch
.
Tensor
(
scores
)
...
...
@@ -82,7 +110,7 @@ class Qwen3RerankerHfRunner(HfRunner):
@
pytest
.
mark
.
parametrize
(
"model_info"
,
RERANK_MODELS
)
def
test_rerank_models_mteb
(
vllm_runner
,
model_info
:
RerankModelInfo
)
->
None
:
mteb_test_rerank_models
(
Qwen3RerankerHfRunner
,
vllm_runner
,
model_info
)
mteb_test_rerank_models
(
vllm_runner
,
model_info
,
hf_runner
=
Qwen3RerankerHfRunner
)
@
pytest
.
mark
.
parametrize
(
"model_info"
,
RERANK_MODELS
)
...
...
@@ -95,5 +123,8 @@ def test_rerank_models_mteb_tp(vllm_runner, model_info: RerankModelInfo) -> None
}
mteb_test_rerank_models
(
Qwen3RerankerHfRunner
,
vllm_runner
,
model_info
,
vllm_extra_kwargs
vllm_runner
,
model_info
,
vllm_extra_kwargs
=
vllm_extra_kwargs
,
hf_runner
=
Qwen3RerankerHfRunner
,
)
tests/models/language/pooling_mteb_test/test_snowflake_arctic_embed.py
View file @
7e63ef82
...
...
@@ -4,62 +4,82 @@
import
pytest
from
tests.models.language.pooling.embed_utils
import
correctness_test_embed_models
from
tests.models.utils
import
CLSPoolingEmbedModelInfo
,
EmbedModelInfo
from
tests.models.utils
import
EmbedModelInfo
from
.mteb_utils
import
mteb_test_embed_models
from
.mteb_
embed_
utils
import
mteb_test_embed_models
MODELS
=
[
CLSPooling
EmbedModelInfo
(
EmbedModelInfo
(
"Snowflake/snowflake-arctic-embed-xs"
,
is_matryoshka
=
False
,
architecture
=
"BertModel"
,
mteb_score
=
0.714927797
,
seq_pooling_type
=
"CLS"
,
attn_type
=
"encoder_only"
,
is_prefix_caching_supported
=
False
,
is_chunked_prefill_supported
=
False
,
enable_test
=
True
,
),
CLSPooling
EmbedModelInfo
(
EmbedModelInfo
(
"Snowflake/snowflake-arctic-embed-s"
,
is_matryoshka
=
False
,
architecture
=
"BertModel"
,
enable_test
=
False
,
),
CLSPooling
EmbedModelInfo
(
EmbedModelInfo
(
"Snowflake/snowflake-arctic-embed-m"
,
is_matryoshka
=
False
,
architecture
=
"BertModel"
,
enable_test
=
False
,
),
CLSPooling
EmbedModelInfo
(
EmbedModelInfo
(
"Snowflake/snowflake-arctic-embed-m-long"
,
is_matryoshka
=
False
,
architecture
=
"NomicBertModel"
,
mteb_score
=
0.681146831
,
seq_pooling_type
=
"CLS"
,
attn_type
=
"encoder_only"
,
is_prefix_caching_supported
=
False
,
is_chunked_prefill_supported
=
False
,
enable_test
=
True
,
),
CLSPooling
EmbedModelInfo
(
EmbedModelInfo
(
"Snowflake/snowflake-arctic-embed-l"
,
is_matryoshka
=
False
,
architecture
=
"BertModel"
,
enable_test
=
False
,
),
CLSPooling
EmbedModelInfo
(
EmbedModelInfo
(
"Snowflake/snowflake-arctic-embed-m-v1.5"
,
is_matryoshka
=
True
,
architecture
=
"BertModel"
,
mteb_score
=
0.649088363
,
seq_pooling_type
=
"CLS"
,
attn_type
=
"encoder_only"
,
is_prefix_caching_supported
=
False
,
is_chunked_prefill_supported
=
False
,
enable_test
=
True
,
),
CLSPooling
EmbedModelInfo
(
EmbedModelInfo
(
"Snowflake/snowflake-arctic-embed-l-v2.0"
,
is_matryoshka
=
True
,
architecture
=
"XLMRobertaModel"
,
mteb_score
=
0.712258299
,
seq_pooling_type
=
"CLS"
,
attn_type
=
"encoder_only"
,
is_prefix_caching_supported
=
False
,
is_chunked_prefill_supported
=
False
,
enable_test
=
True
,
),
CLSPooling
EmbedModelInfo
(
EmbedModelInfo
(
"Snowflake/snowflake-arctic-embed-m-v2.0"
,
is_matryoshka
=
True
,
architecture
=
"GteModel"
,
mteb_score
=
0.706622444
,
seq_pooling_type
=
"CLS"
,
attn_type
=
"encoder_only"
,
is_prefix_caching_supported
=
False
,
is_chunked_prefill_supported
=
False
,
enable_test
=
True
,
),
]
...
...
tests/models/language/pooling_mteb_test/test_st_projector.py
View file @
7e63ef82
...
...
@@ -3,27 +3,32 @@
import
pytest
from
tests.models.utils
import
(
CLSPoolingEmbedModelInfo
,
EmbedModelInfo
,
LASTPoolingEmbedModelInfo
,
)
from
.mteb_utils
import
mteb_test_embed_models
from
.mteb_
embed_
utils
import
mteb_test_embed_models
# ST models with projector (Dense) layers
ST_PROJECTOR_MODELS
=
[
CLSPooling
EmbedModelInfo
(
EmbedModelInfo
(
"TencentBAC/Conan-embedding-v1"
,
architecture
=
"BertModel"
,
mteb_score
=
0.688611955
,
seq_pooling_type
=
"MEAN"
,
attn_type
=
"encoder_only"
,
is_prefix_caching_supported
=
False
,
is_chunked_prefill_supported
=
False
,
enable_test
=
True
,
),
LASTPooling
EmbedModelInfo
(
EmbedModelInfo
(
"google/embeddinggemma-300m"
,
architecture
=
"Gemma3TextModel"
,
mteb_score
=
0.7473819294684156
,
seq_pooling_type
=
"MEAN"
,
attn_type
=
"encoder_only"
,
is_prefix_caching_supported
=
False
,
is_chunked_prefill_supported
=
False
,
enable_test
=
True
,
dtype
=
"float32"
,
),
]
...
...
tests/models/multimodal/
generation/
conftest.py
→
tests/models/multimodal/conftest.py
View file @
7e63ef82
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Pytest configuration for vLLM tests."""
"""Pytest configuration for vLLM
multimodal
tests."""
import
warnings
...
...
@@ -9,20 +9,17 @@ import torch
from
vllm.platforms
import
current_platform
def
pytest_configure
(
config
):
"""Disable Flash/MemEfficient SDP on ROCm to avoid HF
Transformers accuracy issues.
"""
def
pytest_collection_modifyitems
(
config
,
items
):
"""Configure ROCm-specific settings based on collected tests."""
if
not
current_platform
.
is_rocm
():
return
skip_patterns
=
[
"test_granite_speech.py"
]
if
any
(
pattern
in
str
(
arg
)
for
arg
in
config
.
args
for
pattern
in
skip_patterns
):
# Skip disabling SDP for Granite Speech tests on ROCm
return
# Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
# accuracy issues
# accuracy issues
: https://github.com/vllm-project/vllm/issues/30167
# TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
torch
.
backends
.
cuda
.
enable_flash_sdp
(
False
)
torch
.
backends
.
cuda
.
enable_mem_efficient_sdp
(
False
)
...
...
tests/models/multimodal/generation/test_common.py
View file @
7e63ef82
...
...
@@ -123,10 +123,6 @@ VLM_TEST_SETTINGS = {
),
auto_cls
=
AutoModelForImageTextToText
,
vllm_output_post_proc
=
model_utils
.
paligemma_vllm_to_hf_output
,
dtype
=
"bfloat16"
,
marks
=
[
pytest
.
mark
.
skip
(
reason
=
"vLLM does not support PrefixLM attention mask"
)
],
),
"qwen2_5_vl"
:
VLMTestInfo
(
...
...
@@ -176,6 +172,13 @@ VLM_TEST_SETTINGS = {
auto_cls
=
AutoModelForImageTextToText
,
vllm_output_post_proc
=
model_utils
.
qwen2_vllm_to_hf_output
,
patch_hf_runner
=
model_utils
.
qwen3_vl_patch_hf_runner
,
vllm_runner_kwargs
=
{
"attention_config"
:
{
"backend"
:
"ROCM_AITER_FA"
,
},
}
if
current_platform
.
is_rocm
()
else
None
,
image_size_factors
=
[(
0.25
,),
(
0.25
,
0.25
,
0.25
),
(
0.25
,
0.2
,
0.15
)],
marks
=
[
pytest
.
mark
.
core_model
,
...
...
@@ -256,8 +259,19 @@ VLM_TEST_SETTINGS = {
image_size_factors
=
[(
0.25
,
0.2
,
0.15
)],
vllm_runner_kwargs
=
{
"model_impl"
:
"transformers"
,
# TODO: [ROCm] Revert this once issue #30167 is resolved
**
(
{
"mm_processor_kwargs"
:
{
"min_pixels"
:
256
*
28
*
28
,
"max_pixels"
:
1280
*
28
*
28
,
},
}
if
current_platform
.
is_rocm
()
else
{}
),
},
marks
=
[
large_gpu_mark
(
min_gb
=
32
)],
marks
=
[
large_gpu_mark
(
min_gb
=
80
if
current_platform
.
is_rocm
()
else
32
)],
),
#### Extended model tests
"aria"
:
VLMTestInfo
(
...
...
@@ -498,6 +512,7 @@ VLM_TEST_SETTINGS = {
max_model_len
=
8192
,
use_tokenizer_eos
=
True
,
patch_hf_runner
=
model_utils
.
internvl_patch_hf_runner
,
num_logprobs
=
10
if
current_platform
.
is_rocm
()
else
5
,
),
"intern_vl-hf"
:
VLMTestInfo
(
models
=
[
"OpenGVLab/InternVL3-1B-hf"
],
...
...
@@ -513,6 +528,34 @@ VLM_TEST_SETTINGS = {
use_tokenizer_eos
=
True
,
auto_cls
=
AutoModelForImageTextToText
,
),
"isaac"
:
VLMTestInfo
(
models
=
[
"PerceptronAI/Isaac-0.1"
,
"PerceptronAI/Isaac-0.2-2B-Preview"
,
],
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
MULTI_IMAGE
),
prompt_formatter
=
lambda
img_prompt
:
(
f
"<|im_start|>User
\n
{
img_prompt
}
<|im_end|>
\n
<|im_start|>assistant
\n
"
),
img_idx_to_prompt
=
lambda
idx
:
"<image>"
,
single_image_prompts
=
IMAGE_ASSETS
.
prompts
(
{
"stop_sign"
:
"<vlm_image>Please describe the image shortly."
,
"cherry_blossom"
:
"<vlm_image>Please infer the season with reason."
,
}
),
multi_image_prompt
=
(
"Picture 1: <vlm_image>
\n
"
"Picture 2: <vlm_image>
\n
"
"Describe these two images with one paragraph respectively."
),
enforce_eager
=
False
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
hf_model_kwargs
=
{
"device_map"
:
"auto"
},
patch_hf_runner
=
model_utils
.
isaac_patch_hf_runner
,
image_size_factors
=
[(
0.25
,),
(
0.25
,
0.25
,
0.25
),
(
0.25
,
0.2
,
0.15
)],
),
"kimi_vl"
:
VLMTestInfo
(
models
=
[
"moonshotai/Kimi-VL-A3B-Instruct"
],
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
MULTI_IMAGE
),
...
...
@@ -648,7 +691,17 @@ VLM_TEST_SETTINGS = {
hf_output_post_proc
=
model_utils
.
minimax_vl_01_hf_output
,
patch_hf_runner
=
model_utils
.
minimax_vl_01_patch_hf_runner
,
auto_cls
=
AutoModelForImageTextToText
,
marks
=
[
large_gpu_mark
(
min_gb
=
80
)],
marks
=
[
large_gpu_mark
(
min_gb
=
80
),
# TODO: [ROCm] Fix pickle issue with ROCm spawn and tp>1
pytest
.
mark
.
skipif
(
current_platform
.
is_rocm
(),
reason
=
(
"ROCm: Model too large for single GPU; "
"multi-GPU blocked by HF _LazyConfigMapping pickle issue with spawn"
),
),
],
),
"molmo"
:
VLMTestInfo
(
models
=
[
"allenai/Molmo-7B-D-0924"
],
...
...
tests/models/multimodal/generation/test_granite_speech.py
View file @
7e63ef82
...
...
@@ -37,10 +37,12 @@ audio_lora_path = MODEL_NAME
models
=
[
MODEL_NAME
]
@
pytest
.
fixture
(
autouse
=
True
)
def
set_attention_backend_for_rocm
(
monkeypatch
):
@
pytest
.
fixture
def
granite_speech_attention_config
():
"""Return attention config for Granite Speech tests on ROCm."""
if
current_platform
.
is_rocm
():
monkeypatch
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"TRITON_ATTN"
)
return
{
"backend"
:
"ROCM_AITER_FA"
}
return
None
def
run_test
(
...
...
@@ -55,6 +57,7 @@ def run_test(
num_logprobs
:
int
,
tensor_parallel_size
:
int
,
distributed_executor_backend
:
str
|
None
=
None
,
attention_config
:
dict
|
None
=
None
,
):
"""Inference result should be the same between hf and vllm.
...
...
@@ -82,6 +85,7 @@ def run_test(
enable_lora
=
True
,
max_lora_rank
=
64
,
enforce_eager
=
True
,
attention_config
=
attention_config
,
)
as
vllm_model
:
lora_request
=
LoRARequest
(
"audio"
,
1
,
audio_lora_path
)
vllm_outputs_per_case
=
[
...
...
@@ -133,6 +137,7 @@ def test_models(
vllm_runner
,
model
:
str
,
audio_assets
:
AudioTestAssets
,
granite_speech_attention_config
,
dtype
:
str
,
max_model_len
:
int
,
max_tokens
:
int
,
...
...
@@ -159,4 +164,5 @@ def test_models(
max_tokens
=
max_tokens
,
num_logprobs
=
num_logprobs
,
tensor_parallel_size
=
1
,
)
\ No newline at end of file
attention_config
=
granite_speech_attention_config
,
)
tests/models/multimodal/generation/test_keye.py
View file @
7e63ef82
...
...
@@ -8,7 +8,7 @@ from PIL.Image import Image
from
transformers
import
AutoProcessor
from
vllm
import
LLM
,
EngineArgs
,
SamplingParams
from
vllm.multimodal.utils
import
encode_image_
base64
from
vllm.multimodal.utils
import
encode_image_
url
MODEL_NAME
=
"Kwai-Keye/Keye-VL-8B-Preview"
...
...
@@ -31,10 +31,7 @@ def test_keye_vl(
question
:
str
,
):
images
=
[
asset
.
pil_image
for
asset
in
image_assets
]
image_urls
=
[
f
"data:image/jpeg;base64,
{
encode_image_base64
(
image
)
}
"
for
image
in
images
]
image_urls
=
[
encode_image_url
(
image
)
for
image
in
images
]
engine_args
=
EngineArgs
(
model
=
MODEL_NAME
,
...
...
tests/models/multimodal/generation/test_nemotron_parse.py
0 → 100644
View file @
7e63ef82
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
collections.abc
import
Sequence
import
pytest
from
transformers
import
AutoModel
from
tests.models.utils
import
check_logprobs_close
from
vllm.assets.image
import
ImageAsset
from
....conftest
import
HfRunner
,
PromptImageInput
,
VllmRunner
from
....utils
import
create_new_process_for_each_test
IMAGE
=
ImageAsset
(
"paper-11"
).
pil_image_ext
(
ext
=
"png"
).
convert
(
"RGB"
)
PROMPT
=
"</s><s><predict_bbox><predict_classes><output_markdown>"
def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    inputs: Sequence[tuple[list[str], PromptImageInput]],
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Compare greedy logprob outputs of vLLM against the HF reference.

    Each ``(prompts, images)`` case in ``inputs`` is run through the vLLM
    runner first and then through the HF runner; the per-case results are
    then compared pairwise with ``check_logprobs_close``.

    Args:
        hf_runner: Factory for the HF runner context manager.
        vllm_runner: Factory for the vLLM runner context manager.
        inputs: Test cases, each a pair of prompts and matching images.
        model: Model name passed to both runners.
        dtype: Dtype string passed to both runners.
        max_tokens: Maximum number of tokens to generate per prompt.
        num_logprobs: Number of logprobs requested per generated token.
    """
    # The vLLM runner is opened and closed before the HF runner is created,
    # so the two models are never held open at the same time.
    vllm_outputs_per_case = []
    with vllm_runner(
        model,
        dtype=dtype,
        max_num_seqs=64,
        limit_mm_per_prompt={"image": 1},
        trust_remote_code=True,
    ) as vllm_model:
        for case_prompts, case_images in inputs:
            vllm_case_result = vllm_model.generate_greedy_logprobs(
                case_prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                images=case_images,
            )
            vllm_outputs_per_case.append(vllm_case_result)

    hf_outputs_per_case = []
    with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_model:
        for case_prompts, case_images in inputs:
            hf_case_result = hf_model.generate_greedy_logprobs_limit(
                case_prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                images=case_images,
                # HF Nemotron Parse crashes here without this
                use_cache=False,
            )
            hf_outputs_per_case.append(hf_case_result)

    # Pair up the results case by case: HF is side 0, vLLM is side 1.
    paired_results = zip(hf_outputs_per_case, vllm_outputs_per_case)
    for hf_case_result, vllm_case_result in paired_results:
        check_logprobs_close(
            outputs_0_lst=hf_case_result,
            outputs_1_lst=vllm_case_result,
            name_0="hf",
            name_1="vllm",
        )
@pytest.mark.core_model
@pytest.mark.parametrize("model", ["nvidia/NVIDIA-Nemotron-Parse-v1.1"])
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("num_logprobs", [5])
@create_new_process_for_each_test("spawn")
def test_models(
    hf_runner, vllm_runner, model: str, dtype: str, num_logprobs: int
) -> None:
    """Run the HF-vs-vLLM comparison on one case of ten identical requests.

    The single test case repeats ``PROMPT`` and ``IMAGE`` ten times each and
    generates up to 100 tokens per prompt.
    """
    repeat_count = 10
    repeated_prompts = [PROMPT] * repeat_count
    repeated_images = [IMAGE] * repeat_count
    single_case = (repeated_prompts, repeated_images)

    run_test(
        hf_runner,
        vllm_runner,
        inputs=[single_case],
        model=model,
        dtype=dtype,
        max_tokens=100,
        num_logprobs=num_logprobs,
    )
tests/models/multimodal/generation/test_qwen2_vl.py
View file @
7e63ef82
...
...
@@ -269,7 +269,7 @@ def run_embedding_input_test(
"""Inference result should be the same between
original image/video input and image/video embeddings input.
"""
from
transformers
import
AutoProcessor
# noqa: F401
from
transformers
import
AutoProcessor
processor
=
AutoProcessor
.
from_pretrained
(
model
)
...
...
tests/models/multimodal/generation/test_vit_backend_functionality.py
View file @
7e63ef82
...
...
@@ -14,10 +14,10 @@ import pytest
from
transformers
import
AutoProcessor
from
vllm
import
LLM
,
EngineArgs
,
SamplingParams
from
vllm.attention.backends.registry
import
AttentionBackendEnum
from
vllm.multimodal.utils
import
encode_image_base64
from
vllm.multimodal.utils
import
encode_image_url
from
vllm.multimodal.video
import
sample_frames_from_video
from
vllm.platforms
import
current_platform
from
vllm.v1.attention.backends.registry
import
AttentionBackendEnum
from
....utils
import
create_new_process_for_each_test
from
...utils
import
dummy_hf_overrides
...
...
@@ -178,8 +178,7 @@ def build_dots_ocr_prompt(images, config):
"""Build Dots.OCR specific prompt with OCR instructions."""
# Use only stop_sign image for Dots.OCR
image
=
images
[
0
]
# Already filtered to stop_sign
image_url
=
f
"data:image/jpeg;base64,
{
encode_image_base64
(
image
)
}
"
image_url
=
encode_image_url
(
image
)
placeholders
=
[{
"type"
:
"image_url"
,
"image_url"
:
{
"url"
:
image_url
}}]
messages
=
[
...
...
@@ -204,9 +203,7 @@ def build_processor_prompt(images, config):
config
[
"model_name"
],
trust_remote_code
=
True
)
image_urls
=
[
f
"data:image/jpeg;base64,
{
encode_image_base64
(
img
)
}
"
for
img
in
images
]
image_urls
=
[
encode_image_url
(
img
)
for
img
in
images
]
placeholders
=
[{
"type"
:
"image"
,
"image"
:
url
}
for
url
in
image_urls
]
messages
=
[
{
...
...
@@ -225,9 +222,7 @@ def build_processor_prompt(images, config):
def
build_ovis_prompt
(
images
,
config
):
"""Build Ovis2.5 specific prompt with custom format."""
image_urls
=
[
f
"data:image/jpeg;base64,
{
encode_image_base64
(
img
)
}
"
for
img
in
images
]
image_urls
=
[
encode_image_url
(
img
)
for
img
in
images
]
placeholders
=
"
\n
"
.
join
(
f
"Image-
{
i
}
: <image>
\n
"
for
i
,
_
in
enumerate
(
image_urls
,
start
=
1
)
...
...
tests/models/multimodal/generation/test_voxtral.py
View file @
7e63ef82
...
...
@@ -111,4 +111,5 @@ async def test_online_serving(client, audio_assets: AudioTestAssets):
assert
len
(
chat_completion
.
choices
)
==
1
choice
=
chat_completion
.
choices
[
0
]
assert
choice
.
message
.
content
==
"In the first audio clip, you hear a brief"
assert
choice
.
finish_reason
==
"length"
Prev
1
…
23
24
25
26
27
28
29
30
31
…
35
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment