Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
51383bd4
Unverified
Commit
51383bd4
authored
Sep 03, 2025
by
wang.yuqi
Committed by
GitHub
Sep 03, 2025
Browse files
[CI] Accelerate mteb test by setting SentenceTransformers mteb score to a constant (#24088)
Signed-off-by:
wang.yuqi
<
noooop@126.com
>
parent
9c99e487
Changes
17
Show whitespace changes
Inline
Side-by-side
Showing
17 changed files
with
83 additions
and
52 deletions
+83
-52
tests/entrypoints/openai/correctness/test_mteb_embed.py
tests/entrypoints/openai/correctness/test_mteb_embed.py
+3
-1
tests/entrypoints/openai/correctness/test_mteb_score.py
tests/entrypoints/openai/correctness/test_mteb_score.py
+15
-16
tests/models/language/pooling/embed_utils.py
tests/models/language/pooling/embed_utils.py
+1
-4
tests/models/language/pooling/mteb_utils.py
tests/models/language/pooling/mteb_utils.py
+24
-12
tests/models/language/pooling/test_baai.py
tests/models/language/pooling/test_baai.py
+4
-0
tests/models/language/pooling/test_bge_reranker_v2_gemma.py
tests/models/language/pooling/test_bge_reranker_v2_gemma.py
+1
-2
tests/models/language/pooling/test_cross_encoder.py
tests/models/language/pooling/test_cross_encoder.py
+2
-0
tests/models/language/pooling/test_embedding.py
tests/models/language/pooling/test_embedding.py
+1
-4
tests/models/language/pooling/test_gte.py
tests/models/language/pooling/test_gte.py
+15
-11
tests/models/language/pooling/test_intfloat.py
tests/models/language/pooling/test_intfloat.py
+3
-1
tests/models/language/pooling/test_jina.py
tests/models/language/pooling/test_jina.py
+2
-0
tests/models/language/pooling/test_mxbai_rerank.py
tests/models/language/pooling/test_mxbai_rerank.py
+1
-0
tests/models/language/pooling/test_nomic.py
tests/models/language/pooling/test_nomic.py
+2
-0
tests/models/language/pooling/test_qwen3_reranker.py
tests/models/language/pooling/test_qwen3_reranker.py
+1
-0
tests/models/language/pooling/test_snowflake_arctic_embed.py
tests/models/language/pooling/test_snowflake_arctic_embed.py
+6
-1
tests/models/language/pooling/test_st_projector.py
tests/models/language/pooling/test_st_projector.py
+1
-0
tests/models/utils.py
tests/models/utils.py
+1
-0
No files found.
tests/entrypoints/openai/correctness/test_mteb_embed.py
View file @
51383bd4
...
@@ -37,4 +37,6 @@ def test_mteb_embed(server):
...
@@ -37,4 +37,6 @@ def test_mteb_embed(server):
print
(
"SentenceTransformer main score: "
,
st_main_score
)
print
(
"SentenceTransformer main score: "
,
st_main_score
)
print
(
"Difference: "
,
st_main_score
-
vllm_main_score
)
print
(
"Difference: "
,
st_main_score
-
vllm_main_score
)
assert
st_main_score
==
pytest
.
approx
(
vllm_main_score
,
abs
=
MTEB_EMBED_TOL
)
# We are not concerned that the vllm mteb results are better
# than SentenceTransformers, so we only perform one-sided testing.
assert
st_main_score
-
vllm_main_score
<
MTEB_EMBED_TOL
tests/entrypoints/openai/correctness/test_mteb_score.py
View file @
51383bd4
...
@@ -6,16 +6,19 @@ import pytest
...
@@ -6,16 +6,19 @@ import pytest
# yapf conflicts with isort for this block
# yapf conflicts with isort for this block
# yapf: disable
# yapf: disable
from
tests.models.language.pooling.mteb_utils
import
(
from
tests.models.language.pooling.mteb_utils
import
(
MTEB_RERANK_LANGS
,
MTEB_RERANK_LANGS
,
MTEB_RERANK_TASKS
,
MTEB_RERANK_TOL
,
MTEB_RERANK_TASKS
,
RerankClientMtebEncoder
,
ScoreClientMtebEncoder
,
MTEB_RERANK_TOL
,
mteb_test_rerank_models_hf
,
run_mteb_rerank
)
RerankClientMtebEncoder
,
ScoreClientMtebEncoder
,
run_mteb_rerank
)
# yapf: enable
# yapf: enable
from
tests.utils
import
RemoteOpenAIServer
from
tests.utils
import
RemoteOpenAIServer
os
.
environ
[
"VLLM_LOGGING_LEVEL"
]
=
"WARNING"
os
.
environ
[
"VLLM_LOGGING_LEVEL"
]
=
"WARNING"
MODEL_NAME
=
"cross-encoder/ms-marco-MiniLM-L-6-v2"
MODEL_NAME
=
"cross-encoder/ms-marco-MiniLM-L-6-v2"
st_main_score
=
0.33457
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
...
@@ -29,15 +32,7 @@ def server():
...
@@ -29,15 +32,7 @@ def server():
yield
remote_server
yield
remote_server
@
pytest
.
fixture
(
scope
=
"module"
)
def
test_mteb_score
(
server
):
def
st_main_score
(
hf_runner
):
# The main score related to the version of the dependency.
# So we need to recalculate every time.
main_score
,
st_dtype
=
mteb_test_rerank_models_hf
(
hf_runner
,
MODEL_NAME
)
return
main_score
def
test_mteb_score
(
server
,
st_main_score
):
url
=
server
.
url_for
(
"score"
)
url
=
server
.
url_for
(
"score"
)
encoder
=
ScoreClientMtebEncoder
(
MODEL_NAME
,
url
)
encoder
=
ScoreClientMtebEncoder
(
MODEL_NAME
,
url
)
vllm_main_score
=
run_mteb_rerank
(
encoder
,
MTEB_RERANK_TASKS
,
vllm_main_score
=
run_mteb_rerank
(
encoder
,
MTEB_RERANK_TASKS
,
...
@@ -47,10 +42,12 @@ def test_mteb_score(server, st_main_score):
...
@@ -47,10 +42,12 @@ def test_mteb_score(server, st_main_score):
print
(
"SentenceTransformer main score: "
,
st_main_score
)
print
(
"SentenceTransformer main score: "
,
st_main_score
)
print
(
"Difference: "
,
st_main_score
-
vllm_main_score
)
print
(
"Difference: "
,
st_main_score
-
vllm_main_score
)
assert
st_main_score
==
pytest
.
approx
(
vllm_main_score
,
abs
=
MTEB_RERANK_TOL
)
# We are not concerned that the vllm mteb results are better
# than SentenceTransformers, so we only perform one-sided testing.
assert
st_main_score
-
vllm_main_score
<
MTEB_RERANK_TOL
def
test_mteb_rerank
(
server
,
st_main_score
):
def
test_mteb_rerank
(
server
):
url
=
server
.
url_for
(
"rerank"
)
url
=
server
.
url_for
(
"rerank"
)
encoder
=
RerankClientMtebEncoder
(
MODEL_NAME
,
url
)
encoder
=
RerankClientMtebEncoder
(
MODEL_NAME
,
url
)
vllm_main_score
=
run_mteb_rerank
(
encoder
,
MTEB_RERANK_TASKS
,
vllm_main_score
=
run_mteb_rerank
(
encoder
,
MTEB_RERANK_TASKS
,
...
@@ -60,4 +57,6 @@ def test_mteb_rerank(server, st_main_score):
...
@@ -60,4 +57,6 @@ def test_mteb_rerank(server, st_main_score):
print
(
"SentenceTransformer main score: "
,
st_main_score
)
print
(
"SentenceTransformer main score: "
,
st_main_score
)
print
(
"Difference: "
,
st_main_score
-
vllm_main_score
)
print
(
"Difference: "
,
st_main_score
-
vllm_main_score
)
assert
st_main_score
==
pytest
.
approx
(
vllm_main_score
,
abs
=
MTEB_RERANK_TOL
)
# We are not concerned that the vllm mteb results are better
# than SentenceTransformers, so we only perform one-sided testing.
assert
st_main_score
-
vllm_main_score
<
MTEB_RERANK_TOL
tests/models/language/pooling/embed_utils.py
View file @
51383bd4
...
@@ -35,10 +35,7 @@ def correctness_test_embed_models(hf_runner,
...
@@ -35,10 +35,7 @@ def correctness_test_embed_models(hf_runner,
example_prompts
,
example_prompts
,
vllm_extra_kwargs
=
None
,
vllm_extra_kwargs
=
None
,
hf_model_callback
=
None
):
hf_model_callback
=
None
):
if
not
model_info
.
enable_test
:
pytest
.
skip
(
"Debug only, ci prefers to use mteb test."
)
# A model family has many models with the same architecture,
# and we don't need to test each one.
pytest
.
skip
(
"Skipping test."
)
# The example_prompts has ending "\n", for example:
# The example_prompts has ending "\n", for example:
# "Write a short story about a robot that dreams for the first time.\n"
# "Write a short story about a robot that dreams for the first time.\n"
...
...
tests/models/language/pooling/mteb_utils.py
View file @
51383bd4
...
@@ -18,7 +18,7 @@ from tests.models.utils import EmbedModelInfo, RerankModelInfo
...
@@ -18,7 +18,7 @@ from tests.models.utils import EmbedModelInfo, RerankModelInfo
# - Different model results in differences more than 1e-3
# - Different model results in differences more than 1e-3
# 1e-4 is a good tolerance threshold
# 1e-4 is a good tolerance threshold
MTEB_EMBED_TASKS
=
[
"STS12"
]
MTEB_EMBED_TASKS
=
[
"STS12"
]
MTEB_EMBED_TOL
=
0.02
MTEB_EMBED_TOL
=
1e-4
# See #19344
# See #19344
MTEB_RERANK_TASKS
=
[
"NFCorpus"
]
MTEB_RERANK_TASKS
=
[
"NFCorpus"
]
...
@@ -192,6 +192,7 @@ def mteb_test_embed_models(hf_runner,
...
@@ -192,6 +192,7 @@ def mteb_test_embed_models(hf_runner,
MTEB_EMBED_TASKS
)
MTEB_EMBED_TASKS
)
vllm_dtype
=
vllm_model
.
llm
.
llm_engine
.
model_config
.
dtype
vllm_dtype
=
vllm_model
.
llm
.
llm_engine
.
model_config
.
dtype
if
model_info
.
mteb_score
is
None
:
with
hf_runner
(
model_info
.
name
,
with
hf_runner
(
model_info
.
name
,
is_sentence_transformer
=
True
,
is_sentence_transformer
=
True
,
dtype
=
"float32"
)
as
hf_model
:
dtype
=
"float32"
)
as
hf_model
:
...
@@ -201,13 +202,18 @@ def mteb_test_embed_models(hf_runner,
...
@@ -201,13 +202,18 @@ def mteb_test_embed_models(hf_runner,
st_main_score
=
run_mteb_embed_task
(
hf_model
,
MTEB_EMBED_TASKS
)
st_main_score
=
run_mteb_embed_task
(
hf_model
,
MTEB_EMBED_TASKS
)
st_dtype
=
next
(
hf_model
.
model
.
parameters
()).
dtype
st_dtype
=
next
(
hf_model
.
model
.
parameters
()).
dtype
else
:
st_main_score
=
model_info
.
mteb_score
st_dtype
=
"Constant"
print
(
"Model:"
,
model_info
.
name
)
print
(
"Model:"
,
model_info
.
name
)
print
(
"VLLM:"
,
vllm_dtype
,
vllm_main_score
)
print
(
"VLLM:"
,
vllm_dtype
,
vllm_main_score
)
print
(
"SentenceTransformers:"
,
st_dtype
,
st_main_score
)
print
(
"SentenceTransformers:"
,
st_dtype
,
st_main_score
)
print
(
"Difference:"
,
st_main_score
-
vllm_main_score
)
print
(
"Difference:"
,
st_main_score
-
vllm_main_score
)
assert
st_main_score
==
pytest
.
approx
(
vllm_main_score
,
abs
=
atol
)
# We are not concerned that the vllm mteb results are better
# than SentenceTransformers, so we only perform one-sided testing.
assert
st_main_score
-
vllm_main_score
<
atol
def
run_mteb_rerank
(
cross_encoder
,
tasks
,
languages
):
def
run_mteb_rerank
(
cross_encoder
,
tasks
,
languages
):
...
@@ -310,12 +316,18 @@ def mteb_test_rerank_models(hf_runner,
...
@@ -310,12 +316,18 @@ def mteb_test_rerank_models(hf_runner,
languages
=
MTEB_RERANK_LANGS
)
languages
=
MTEB_RERANK_LANGS
)
vllm_dtype
=
model_config
.
dtype
vllm_dtype
=
model_config
.
dtype
if
model_info
.
mteb_score
is
None
:
st_main_score
,
st_dtype
=
mteb_test_rerank_models_hf
(
st_main_score
,
st_dtype
=
mteb_test_rerank_models_hf
(
hf_runner
,
model_info
.
name
,
hf_model_callback
)
hf_runner
,
model_info
.
name
,
hf_model_callback
)
else
:
st_main_score
=
model_info
.
mteb_score
st_dtype
=
"Constant"
print
(
"Model:"
,
model_info
.
name
)
print
(
"Model:"
,
model_info
.
name
)
print
(
"VLLM:"
,
vllm_dtype
,
vllm_main_score
)
print
(
"VLLM:"
,
vllm_dtype
,
vllm_main_score
)
print
(
"SentenceTransformers:"
,
st_dtype
,
st_main_score
)
print
(
"SentenceTransformers:"
,
st_dtype
,
st_main_score
)
print
(
"Difference:"
,
st_main_score
-
vllm_main_score
)
print
(
"Difference:"
,
st_main_score
-
vllm_main_score
)
assert
st_main_score
==
pytest
.
approx
(
vllm_main_score
,
abs
=
atol
)
# We are not concerned that the vllm mteb results are better
# than SentenceTransformers, so we only perform one-sided testing.
assert
st_main_score
-
vllm_main_score
<
atol
tests/models/language/pooling/test_baai.py
View file @
51383bd4
...
@@ -12,6 +12,7 @@ MODELS = [
...
@@ -12,6 +12,7 @@ MODELS = [
########## BertModel
########## BertModel
CLSPoolingEmbedModelInfo
(
"BAAI/bge-base-en"
,
CLSPoolingEmbedModelInfo
(
"BAAI/bge-base-en"
,
architecture
=
"BertModel"
,
architecture
=
"BertModel"
,
mteb_score
=
0.779336792
,
enable_test
=
True
),
enable_test
=
True
),
CLSPoolingEmbedModelInfo
(
"BAAI/bge-base-zh"
,
CLSPoolingEmbedModelInfo
(
"BAAI/bge-base-zh"
,
architecture
=
"BertModel"
,
architecture
=
"BertModel"
,
...
@@ -52,10 +53,12 @@ MODELS = [
...
@@ -52,10 +53,12 @@ MODELS = [
########## XLMRobertaModel
########## XLMRobertaModel
CLSPoolingEmbedModelInfo
(
"BAAI/bge-m3"
,
CLSPoolingEmbedModelInfo
(
"BAAI/bge-m3"
,
architecture
=
"XLMRobertaModel"
,
architecture
=
"XLMRobertaModel"
,
mteb_score
=
0.787343078
,
enable_test
=
True
),
enable_test
=
True
),
########## Qwen2Model
########## Qwen2Model
LASTPoolingEmbedModelInfo
(
"BAAI/bge-code-v1"
,
LASTPoolingEmbedModelInfo
(
"BAAI/bge-code-v1"
,
architecture
=
"Qwen2Model"
,
architecture
=
"Qwen2Model"
,
mteb_score
=
0.75724465
,
dtype
=
"float32"
,
dtype
=
"float32"
,
enable_test
=
True
),
enable_test
=
True
),
]
]
...
@@ -65,6 +68,7 @@ RERANK_MODELS = [
...
@@ -65,6 +68,7 @@ RERANK_MODELS = [
CLSPoolingRerankModelInfo
(
CLSPoolingRerankModelInfo
(
"BAAI/bge-reranker-base"
,
"BAAI/bge-reranker-base"
,
architecture
=
"XLMRobertaForSequenceClassification"
,
architecture
=
"XLMRobertaForSequenceClassification"
,
mteb_score
=
0.32398
,
enable_test
=
True
),
enable_test
=
True
),
CLSPoolingRerankModelInfo
(
CLSPoolingRerankModelInfo
(
"BAAI/bge-reranker-large"
,
"BAAI/bge-reranker-large"
,
...
...
tests/models/language/pooling/test_bge_reranker_v2_gemma.py
View file @
51383bd4
...
@@ -104,7 +104,6 @@ class GemmaMtebEncoder(VllmMtebEncoder):
...
@@ -104,7 +104,6 @@ class GemmaMtebEncoder(VllmMtebEncoder):
def
__init__
(
self
,
*
args
,
**
kwargs
):
def
__init__
(
self
,
*
args
,
**
kwargs
):
super
().
__init__
(
*
args
,
**
kwargs
)
super
().
__init__
(
*
args
,
**
kwargs
)
self
.
prompt
=
PROMPT
self
.
query_template
=
"A: {query}
\n
"
self
.
query_template
=
"A: {query}
\n
"
self
.
document_template
=
"B: {doc}
\n
{prompt}"
self
.
document_template
=
"B: {doc}
\n
{prompt}"
...
@@ -119,7 +118,7 @@ class GemmaMtebEncoder(VllmMtebEncoder):
...
@@ -119,7 +118,7 @@ class GemmaMtebEncoder(VllmMtebEncoder):
_sentences
=
[]
_sentences
=
[]
for
query
,
corpus
,
prompt
in
sentences
:
for
query
,
corpus
,
prompt
in
sentences
:
query
=
self
.
query_template
.
format
(
query
=
query
)
query
=
self
.
query_template
.
format
(
query
=
query
)
corpus
=
self
.
document_template
.
format
(
doc
=
corpus
,
prompt
=
prompt
)
corpus
=
self
.
document_template
.
format
(
doc
=
corpus
,
prompt
=
PROMPT
)
_sentences
.
append
((
query
,
corpus
,
prompt
))
_sentences
.
append
((
query
,
corpus
,
prompt
))
return
super
().
predict
(
_sentences
,
*
args
,
**
kwargs
)
return
super
().
predict
(
_sentences
,
*
args
,
**
kwargs
)
...
...
tests/models/language/pooling/test_cross_encoder.py
View file @
51383bd4
...
@@ -8,8 +8,10 @@ from .mteb_utils import mteb_test_rerank_models
...
@@ -8,8 +8,10 @@ from .mteb_utils import mteb_test_rerank_models
RERANK_MODELS
=
[
RERANK_MODELS
=
[
CLSPoolingRerankModelInfo
(
"cross-encoder/ms-marco-TinyBERT-L-2-v2"
,
CLSPoolingRerankModelInfo
(
"cross-encoder/ms-marco-TinyBERT-L-2-v2"
,
mteb_score
=
0.32898
,
architecture
=
"BertForSequenceClassification"
),
architecture
=
"BertForSequenceClassification"
),
LASTPoolingRerankModelInfo
(
"tomaarsen/Qwen3-Reranker-0.6B-seq-cls"
,
LASTPoolingRerankModelInfo
(
"tomaarsen/Qwen3-Reranker-0.6B-seq-cls"
,
mteb_score
=
0.25736
,
architecture
=
"Qwen3ForSequenceClassification"
)
architecture
=
"Qwen3ForSequenceClassification"
)
]
]
...
...
tests/models/language/pooling/test_embedding.py
View file @
51383bd4
...
@@ -7,7 +7,7 @@ import pytest
...
@@ -7,7 +7,7 @@ import pytest
from
vllm.config
import
PoolerConfig
from
vllm.config
import
PoolerConfig
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
...utils
import
check_embeddings_close
,
check_transformers_version
from
...utils
import
check_embeddings_close
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
...
@@ -30,7 +30,6 @@ from ...utils import check_embeddings_close, check_transformers_version
...
@@ -30,7 +30,6 @@ from ...utils import check_embeddings_close, check_transformers_version
pytest
.
param
(
"BAAI/bge-base-en-v1.5"
,
marks
=
[
pytest
.
mark
.
core_model
]),
pytest
.
param
(
"BAAI/bge-base-en-v1.5"
,
marks
=
[
pytest
.
mark
.
core_model
]),
pytest
.
param
(
"sentence-transformers/all-MiniLM-L12-v2"
),
pytest
.
param
(
"sentence-transformers/all-MiniLM-L12-v2"
),
pytest
.
param
(
"intfloat/multilingual-e5-small"
),
pytest
.
param
(
"intfloat/multilingual-e5-small"
),
pytest
.
param
(
"Alibaba-NLP/gte-Qwen2-1.5B-instruct"
),
# [Cross-Encoder]
# [Cross-Encoder]
pytest
.
param
(
"sentence-transformers/stsb-roberta-base-v2"
),
pytest
.
param
(
"sentence-transformers/stsb-roberta-base-v2"
),
],
],
...
@@ -42,8 +41,6 @@ def test_models(
...
@@ -42,8 +41,6 @@ def test_models(
model
,
model
,
monkeypatch
,
monkeypatch
,
)
->
None
:
)
->
None
:
if
model
==
"Alibaba-NLP/gte-Qwen2-1.5B-instruct"
:
check_transformers_version
(
model
,
max_transformers_version
=
"4.53.2"
)
if
model
==
"BAAI/bge-multilingual-gemma2"
and
current_platform
.
is_rocm
():
if
model
==
"BAAI/bge-multilingual-gemma2"
and
current_platform
.
is_rocm
():
# ROCm Triton FA does not currently support sliding window attention
# ROCm Triton FA does not currently support sliding window attention
...
...
tests/models/language/pooling/test_gte.py
View file @
51383bd4
...
@@ -5,13 +5,14 @@ import pytest
...
@@ -5,13 +5,14 @@ import pytest
from
...utils
import
(
CLSPoolingEmbedModelInfo
,
CLSPoolingRerankModelInfo
,
from
...utils
import
(
CLSPoolingEmbedModelInfo
,
CLSPoolingRerankModelInfo
,
EmbedModelInfo
,
LASTPoolingEmbedModelInfo
,
EmbedModelInfo
,
LASTPoolingEmbedModelInfo
,
RerankModelInfo
,
check_transformers_version
)
RerankModelInfo
)
from
.embed_utils
import
correctness_test_embed_models
from
.embed_utils
import
correctness_test_embed_models
from
.mteb_utils
import
mteb_test_embed_models
,
mteb_test_rerank_models
from
.mteb_utils
import
mteb_test_embed_models
,
mteb_test_rerank_models
MODELS
=
[
MODELS
=
[
########## BertModel
########## BertModel
CLSPoolingEmbedModelInfo
(
"thenlper/gte-large"
,
CLSPoolingEmbedModelInfo
(
"thenlper/gte-large"
,
mteb_score
=
0.76807651
,
architecture
=
"BertModel"
,
architecture
=
"BertModel"
,
enable_test
=
True
),
enable_test
=
True
),
CLSPoolingEmbedModelInfo
(
"thenlper/gte-base"
,
CLSPoolingEmbedModelInfo
(
"thenlper/gte-base"
,
...
@@ -30,28 +31,37 @@ MODELS = [
...
@@ -30,28 +31,37 @@ MODELS = [
architecture
=
"BertModel"
,
architecture
=
"BertModel"
,
enable_test
=
False
),
enable_test
=
False
),
########### NewModel
########### NewModel
# These three architectures are almost the same, but not exactly the same.
# For example,
# - whether to use token_type_embeddings
# - whether to use context expansion
# So only test one (the most widely used) model
CLSPoolingEmbedModelInfo
(
"Alibaba-NLP/gte-multilingual-base"
,
CLSPoolingEmbedModelInfo
(
"Alibaba-NLP/gte-multilingual-base"
,
architecture
=
"GteNewModel"
,
architecture
=
"GteNewModel"
,
mteb_score
=
0.775074696
,
hf_overrides
=
{
"architectures"
:
[
"GteNewModel"
]},
hf_overrides
=
{
"architectures"
:
[
"GteNewModel"
]},
enable_test
=
True
),
enable_test
=
True
),
CLSPoolingEmbedModelInfo
(
"Alibaba-NLP/gte-base-en-v1.5"
,
CLSPoolingEmbedModelInfo
(
"Alibaba-NLP/gte-base-en-v1.5"
,
architecture
=
"GteNewModel"
,
architecture
=
"GteNewModel"
,
hf_overrides
=
{
"architectures"
:
[
"GteNewModel"
]},
hf_overrides
=
{
"architectures"
:
[
"GteNewModel"
]},
enable_test
=
Tru
e
),
enable_test
=
Fals
e
),
CLSPoolingEmbedModelInfo
(
"Alibaba-NLP/gte-large-en-v1.5"
,
CLSPoolingEmbedModelInfo
(
"Alibaba-NLP/gte-large-en-v1.5"
,
architecture
=
"GteNewModel"
,
architecture
=
"GteNewModel"
,
hf_overrides
=
{
"architectures"
:
[
"GteNewModel"
]},
hf_overrides
=
{
"architectures"
:
[
"GteNewModel"
]},
enable_test
=
Tru
e
),
enable_test
=
Fals
e
),
########### Qwen2ForCausalLM
########### Qwen2ForCausalLM
LASTPoolingEmbedModelInfo
(
"Alibaba-NLP/gte-Qwen2-1.5B-instruct"
,
LASTPoolingEmbedModelInfo
(
"Alibaba-NLP/gte-Qwen2-1.5B-instruct"
,
mteb_score
=
0.758473459018872
,
architecture
=
"Qwen2ForCausalLM"
,
architecture
=
"Qwen2ForCausalLM"
,
enable_test
=
True
),
enable_test
=
True
),
########## ModernBertModel
########## ModernBertModel
CLSPoolingEmbedModelInfo
(
"Alibaba-NLP/gte-modernbert-base"
,
CLSPoolingEmbedModelInfo
(
"Alibaba-NLP/gte-modernbert-base"
,
mteb_score
=
0.748193353
,
architecture
=
"ModernBertModel"
,
architecture
=
"ModernBertModel"
,
enable_test
=
True
),
enable_test
=
True
),
########## Qwen3ForCausalLM
########## Qwen3ForCausalLM
LASTPoolingEmbedModelInfo
(
"Qwen/Qwen3-Embedding-0.6B"
,
LASTPoolingEmbedModelInfo
(
"Qwen/Qwen3-Embedding-0.6B"
,
mteb_score
=
0.771163695
,
architecture
=
"Qwen3ForCausalLM"
,
architecture
=
"Qwen3ForCausalLM"
,
dtype
=
"float32"
,
dtype
=
"float32"
,
enable_test
=
True
),
enable_test
=
True
),
...
@@ -65,10 +75,12 @@ RERANK_MODELS = [
...
@@ -65,10 +75,12 @@ RERANK_MODELS = [
CLSPoolingRerankModelInfo
(
CLSPoolingRerankModelInfo
(
# classifier_pooling: mean
# classifier_pooling: mean
"Alibaba-NLP/gte-reranker-modernbert-base"
,
"Alibaba-NLP/gte-reranker-modernbert-base"
,
mteb_score
=
0.33386
,
architecture
=
"ModernBertForSequenceClassification"
,
architecture
=
"ModernBertForSequenceClassification"
,
enable_test
=
True
),
enable_test
=
True
),
CLSPoolingRerankModelInfo
(
CLSPoolingRerankModelInfo
(
"Alibaba-NLP/gte-multilingual-reranker-base"
,
"Alibaba-NLP/gte-multilingual-reranker-base"
,
mteb_score
=
0.33062
,
architecture
=
"GteNewForSequenceClassification"
,
architecture
=
"GteNewForSequenceClassification"
,
hf_overrides
=
{
"architectures"
:
[
"GteNewForSequenceClassification"
]},
hf_overrides
=
{
"architectures"
:
[
"GteNewForSequenceClassification"
]},
enable_test
=
True
),
enable_test
=
True
),
...
@@ -78,10 +90,6 @@ RERANK_MODELS = [
...
@@ -78,10 +90,6 @@ RERANK_MODELS = [
@
pytest
.
mark
.
parametrize
(
"model_info"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model_info"
,
MODELS
)
def
test_embed_models_mteb
(
hf_runner
,
vllm_runner
,
def
test_embed_models_mteb
(
hf_runner
,
vllm_runner
,
model_info
:
EmbedModelInfo
)
->
None
:
model_info
:
EmbedModelInfo
)
->
None
:
if
model_info
.
name
==
"Alibaba-NLP/gte-Qwen2-1.5B-instruct"
:
check_transformers_version
(
model_info
.
name
,
max_transformers_version
=
"4.53.2"
)
mteb_test_embed_models
(
hf_runner
,
vllm_runner
,
model_info
)
mteb_test_embed_models
(
hf_runner
,
vllm_runner
,
model_info
)
...
@@ -89,10 +97,6 @@ def test_embed_models_mteb(hf_runner, vllm_runner,
...
@@ -89,10 +97,6 @@ def test_embed_models_mteb(hf_runner, vllm_runner,
def
test_embed_models_correctness
(
hf_runner
,
vllm_runner
,
def
test_embed_models_correctness
(
hf_runner
,
vllm_runner
,
model_info
:
EmbedModelInfo
,
model_info
:
EmbedModelInfo
,
example_prompts
)
->
None
:
example_prompts
)
->
None
:
if
model_info
.
name
==
"Alibaba-NLP/gte-Qwen2-1.5B-instruct"
:
check_transformers_version
(
model_info
.
name
,
max_transformers_version
=
"4.53.2"
)
correctness_test_embed_models
(
hf_runner
,
vllm_runner
,
model_info
,
correctness_test_embed_models
(
hf_runner
,
vllm_runner
,
model_info
,
example_prompts
)
example_prompts
)
...
...
tests/models/language/pooling/test_intfloat.py
View file @
51383bd4
...
@@ -10,6 +10,7 @@ MODELS = [
...
@@ -10,6 +10,7 @@ MODELS = [
########## BertModel
########## BertModel
CLSPoolingEmbedModelInfo
(
"intfloat/e5-small"
,
CLSPoolingEmbedModelInfo
(
"intfloat/e5-small"
,
architecture
=
"BertModel"
,
architecture
=
"BertModel"
,
mteb_score
=
0.742285423
,
enable_test
=
True
),
enable_test
=
True
),
CLSPoolingEmbedModelInfo
(
"intfloat/e5-base"
,
CLSPoolingEmbedModelInfo
(
"intfloat/e5-base"
,
architecture
=
"BertModel"
,
architecture
=
"BertModel"
,
...
@@ -23,6 +24,7 @@ MODELS = [
...
@@ -23,6 +24,7 @@ MODELS = [
########## XLMRobertaModel
########## XLMRobertaModel
CLSPoolingEmbedModelInfo
(
"intfloat/multilingual-e5-base"
,
CLSPoolingEmbedModelInfo
(
"intfloat/multilingual-e5-base"
,
architecture
=
"XLMRobertaModel"
,
architecture
=
"XLMRobertaModel"
,
mteb_score
=
0.779325955
,
enable_test
=
True
),
enable_test
=
True
),
CLSPoolingEmbedModelInfo
(
"intfloat/multilingual-e5-large"
,
CLSPoolingEmbedModelInfo
(
"intfloat/multilingual-e5-large"
,
architecture
=
"XLMRobertaModel"
,
architecture
=
"XLMRobertaModel"
,
...
@@ -36,7 +38,7 @@ MODELS = [
...
@@ -36,7 +38,7 @@ MODELS = [
@
pytest
.
mark
.
parametrize
(
"model_info"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model_info"
,
MODELS
)
def
test_embed_models_mteb
(
hf_runner
,
vllm_runner
,
def
test_embed_models_mteb
(
hf_runner
,
vllm_runner
,
model_info
:
EmbedModelInfo
)
->
None
:
model_info
:
EmbedModelInfo
)
->
None
:
mteb_test_embed_models
(
hf_runner
,
vllm_runner
,
model_info
,
atol
=
0.02
)
mteb_test_embed_models
(
hf_runner
,
vllm_runner
,
model_info
)
@
pytest
.
mark
.
parametrize
(
"model_info"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model_info"
,
MODELS
)
...
...
tests/models/language/pooling/test_jina.py
View file @
51383bd4
...
@@ -14,6 +14,7 @@ from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
...
@@ -14,6 +14,7 @@ from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
EMBEDDING_MODELS
=
[
EMBEDDING_MODELS
=
[
CLSPoolingEmbedModelInfo
(
"jinaai/jina-embeddings-v3"
,
CLSPoolingEmbedModelInfo
(
"jinaai/jina-embeddings-v3"
,
mteb_score
=
0.824413164
,
architecture
=
"XLMRobertaModel"
,
architecture
=
"XLMRobertaModel"
,
is_matryoshka
=
True
)
is_matryoshka
=
True
)
]
]
...
@@ -21,6 +22,7 @@ EMBEDDING_MODELS = [
...
@@ -21,6 +22,7 @@ EMBEDDING_MODELS = [
RERANK_MODELS
=
[
RERANK_MODELS
=
[
CLSPoolingRerankModelInfo
(
CLSPoolingRerankModelInfo
(
"jinaai/jina-reranker-v2-base-multilingual"
,
"jinaai/jina-reranker-v2-base-multilingual"
,
mteb_score
=
0.33643
,
architecture
=
"XLMRobertaForSequenceClassification"
)
architecture
=
"XLMRobertaForSequenceClassification"
)
]
]
...
...
tests/models/language/pooling/test_mxbai_rerank.py
View file @
51383bd4
...
@@ -20,6 +20,7 @@ RERANK_MODELS = [
...
@@ -20,6 +20,7 @@ RERANK_MODELS = [
LASTPoolingRerankModelInfo
(
"mixedbread-ai/mxbai-rerank-base-v2"
,
LASTPoolingRerankModelInfo
(
"mixedbread-ai/mxbai-rerank-base-v2"
,
architecture
=
"Qwen2ForSequenceClassification"
,
architecture
=
"Qwen2ForSequenceClassification"
,
hf_overrides
=
mxbai_rerank_hf_overrides
,
hf_overrides
=
mxbai_rerank_hf_overrides
,
mteb_score
=
0.273
,
enable_test
=
True
),
enable_test
=
True
),
LASTPoolingRerankModelInfo
(
"mixedbread-ai/mxbai-rerank-large-v2"
,
LASTPoolingRerankModelInfo
(
"mixedbread-ai/mxbai-rerank-large-v2"
,
architecture
=
"Qwen2ForSequenceClassification"
,
architecture
=
"Qwen2ForSequenceClassification"
,
...
...
tests/models/language/pooling/test_nomic.py
View file @
51383bd4
...
@@ -10,6 +10,7 @@ from .mteb_utils import mteb_test_embed_models
...
@@ -10,6 +10,7 @@ from .mteb_utils import mteb_test_embed_models
MODELS
=
[
MODELS
=
[
CLSPoolingEmbedModelInfo
(
"nomic-ai/nomic-embed-text-v1"
,
CLSPoolingEmbedModelInfo
(
"nomic-ai/nomic-embed-text-v1"
,
architecture
=
"NomicBertModel"
,
architecture
=
"NomicBertModel"
,
mteb_score
=
0.737568559
,
enable_test
=
True
),
enable_test
=
True
),
CLSPoolingEmbedModelInfo
(
"nomic-ai/nomic-embed-text-v1.5"
,
CLSPoolingEmbedModelInfo
(
"nomic-ai/nomic-embed-text-v1.5"
,
architecture
=
"NomicBertModel"
,
architecture
=
"NomicBertModel"
,
...
@@ -19,6 +20,7 @@ MODELS = [
...
@@ -19,6 +20,7 @@ MODELS = [
enable_test
=
False
),
enable_test
=
False
),
CLSPoolingEmbedModelInfo
(
"nomic-ai/nomic-embed-text-v2-moe"
,
CLSPoolingEmbedModelInfo
(
"nomic-ai/nomic-embed-text-v2-moe"
,
architecture
=
"NomicBertModel"
,
architecture
=
"NomicBertModel"
,
mteb_score
=
0.715488912
,
enable_test
=
True
)
enable_test
=
True
)
]
]
...
...
tests/models/language/pooling/test_qwen3_reranker.py
View file @
51383bd4
...
@@ -20,6 +20,7 @@ qwen3_reranker_hf_overrides = {
...
@@ -20,6 +20,7 @@ qwen3_reranker_hf_overrides = {
RERANK_MODELS
=
[
RERANK_MODELS
=
[
LASTPoolingRerankModelInfo
(
"Qwen/Qwen3-Reranker-0.6B"
,
LASTPoolingRerankModelInfo
(
"Qwen/Qwen3-Reranker-0.6B"
,
architecture
=
"Qwen3ForSequenceClassification"
,
architecture
=
"Qwen3ForSequenceClassification"
,
mteb_score
=
0.25736
,
hf_overrides
=
qwen3_reranker_hf_overrides
,
hf_overrides
=
qwen3_reranker_hf_overrides
,
enable_test
=
True
),
enable_test
=
True
),
LASTPoolingRerankModelInfo
(
"Qwen/Qwen3-Reranker-4B"
,
LASTPoolingRerankModelInfo
(
"Qwen/Qwen3-Reranker-4B"
,
...
...
tests/models/language/pooling/test_snowflake_arctic_embed.py
View file @
51383bd4
...
@@ -11,6 +11,7 @@ MODELS = [
...
@@ -11,6 +11,7 @@ MODELS = [
CLSPoolingEmbedModelInfo
(
"Snowflake/snowflake-arctic-embed-xs"
,
CLSPoolingEmbedModelInfo
(
"Snowflake/snowflake-arctic-embed-xs"
,
is_matryoshka
=
False
,
is_matryoshka
=
False
,
architecture
=
"BertModel"
,
architecture
=
"BertModel"
,
mteb_score
=
0.714927797
,
enable_test
=
True
),
enable_test
=
True
),
CLSPoolingEmbedModelInfo
(
"Snowflake/snowflake-arctic-embed-s"
,
CLSPoolingEmbedModelInfo
(
"Snowflake/snowflake-arctic-embed-s"
,
is_matryoshka
=
False
,
is_matryoshka
=
False
,
...
@@ -23,6 +24,7 @@ MODELS = [
...
@@ -23,6 +24,7 @@ MODELS = [
CLSPoolingEmbedModelInfo
(
"Snowflake/snowflake-arctic-embed-m-long"
,
CLSPoolingEmbedModelInfo
(
"Snowflake/snowflake-arctic-embed-m-long"
,
is_matryoshka
=
False
,
is_matryoshka
=
False
,
architecture
=
"NomicBertModel"
,
architecture
=
"NomicBertModel"
,
mteb_score
=
0.681146831
,
enable_test
=
True
),
enable_test
=
True
),
CLSPoolingEmbedModelInfo
(
"Snowflake/snowflake-arctic-embed-l"
,
CLSPoolingEmbedModelInfo
(
"Snowflake/snowflake-arctic-embed-l"
,
is_matryoshka
=
False
,
is_matryoshka
=
False
,
...
@@ -31,14 +33,17 @@ MODELS = [
...
@@ -31,14 +33,17 @@ MODELS = [
CLSPoolingEmbedModelInfo
(
"Snowflake/snowflake-arctic-embed-m-v1.5"
,
CLSPoolingEmbedModelInfo
(
"Snowflake/snowflake-arctic-embed-m-v1.5"
,
is_matryoshka
=
True
,
is_matryoshka
=
True
,
architecture
=
"BertModel"
,
architecture
=
"BertModel"
,
mteb_score
=
0.649088363
,
enable_test
=
True
),
enable_test
=
True
),
CLSPoolingEmbedModelInfo
(
"Snowflake/snowflake-arctic-embed-l-v2.0"
,
CLSPoolingEmbedModelInfo
(
"Snowflake/snowflake-arctic-embed-l-v2.0"
,
is_matryoshka
=
True
,
is_matryoshka
=
True
,
architecture
=
"XLMRobertaModel"
,
architecture
=
"XLMRobertaModel"
,
mteb_score
=
0.712258299
,
enable_test
=
True
),
enable_test
=
True
),
CLSPoolingEmbedModelInfo
(
"Snowflake/snowflake-arctic-embed-m-v2.0"
,
CLSPoolingEmbedModelInfo
(
"Snowflake/snowflake-arctic-embed-m-v2.0"
,
is_matryoshka
=
True
,
is_matryoshka
=
True
,
architecture
=
"GteModel"
,
architecture
=
"GteModel"
,
mteb_score
=
0.706622444
,
enable_test
=
True
),
enable_test
=
True
),
]
]
...
@@ -46,7 +51,7 @@ MODELS = [
...
@@ -46,7 +51,7 @@ MODELS = [
@
pytest
.
mark
.
parametrize
(
"model_info"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model_info"
,
MODELS
)
def
test_embed_models_mteb
(
hf_runner
,
vllm_runner
,
def
test_embed_models_mteb
(
hf_runner
,
vllm_runner
,
model_info
:
EmbedModelInfo
)
->
None
:
model_info
:
EmbedModelInfo
)
->
None
:
mteb_test_embed_models
(
hf_runner
,
vllm_runner
,
model_info
,
atol
=
0.02
)
mteb_test_embed_models
(
hf_runner
,
vllm_runner
,
model_info
)
@
pytest
.
mark
.
parametrize
(
"model_info"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model_info"
,
MODELS
)
...
...
tests/models/language/pooling/test_st_projector.py
View file @
51383bd4
...
@@ -10,6 +10,7 @@ ST_PROJECTOR_MODELS = [
...
@@ -10,6 +10,7 @@ ST_PROJECTOR_MODELS = [
CLSPoolingEmbedModelInfo
(
CLSPoolingEmbedModelInfo
(
"TencentBAC/Conan-embedding-v1"
,
"TencentBAC/Conan-embedding-v1"
,
architecture
=
"BertModel"
,
architecture
=
"BertModel"
,
mteb_score
=
0.688611955
,
enable_test
=
True
,
enable_test
=
True
,
),
),
]
]
...
...
tests/models/utils.py
View file @
51383bd4
...
@@ -347,6 +347,7 @@ class ModelInfo:
...
@@ -347,6 +347,7 @@ class ModelInfo:
dtype
:
str
=
"auto"
dtype
:
str
=
"auto"
hf_overrides
:
Optional
[
dict
[
str
,
Any
]]
=
None
hf_overrides
:
Optional
[
dict
[
str
,
Any
]]
=
None
default_pooling_type
:
str
=
""
default_pooling_type
:
str
=
""
mteb_score
:
Optional
[
float
]
=
None
enable_test
:
bool
=
True
enable_test
:
bool
=
True
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment