Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
2836dd73
Unverified
Commit
2836dd73
authored
Jul 31, 2025
by
wang.yuqi
Committed by
GitHub
Jul 31, 2025
Browse files
[Model][CI] Let more pooling models support v1 (#21747)
Signed-off-by:
wang.yuqi
<
noooop@126.com
>
parent
d2aab336
Changes
8
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
14 additions
and
48 deletions
+14
-48
tests/models/language/pooling/test_classification.py
tests/models/language/pooling/test_classification.py
+0
-8
tests/models/language/pooling/test_gte.py
tests/models/language/pooling/test_gte.py
+4
-14
tests/models/language/pooling/test_jina.py
tests/models/language/pooling/test_jina.py
+0
-13
tests/models/language/pooling/test_qwen3_reranker.py
tests/models/language/pooling/test_qwen3_reranker.py
+0
-6
vllm/config.py
vllm/config.py
+8
-0
vllm/model_executor/models/bert_with_rope.py
vllm/model_executor/models/bert_with_rope.py
+1
-4
vllm/model_executor/models/config.py
vllm/model_executor/models/config.py
+1
-1
vllm/model_executor/models/modernbert.py
vllm/model_executor/models/modernbert.py
+0
-2
No files found.
tests/models/language/pooling/test_classification.py
View file @
2836dd73
...
...
@@ -6,14 +6,6 @@ from transformers import AutoModelForSequenceClassification
from
vllm.platforms
import
current_platform
# TODO: enable when float32 is supported by V1
# @pytest.fixture(autouse=True)
# def v1(run_with_both_engines):
# # Simple autouse wrapper to run both engines for each test
# # This can be promoted up to conftest.py to run for every
# # test in a package
# pass
@
pytest
.
mark
.
parametrize
(
"model"
,
...
...
tests/models/language/pooling/test_gte.py
View file @
2836dd73
...
...
@@ -56,17 +56,10 @@ MODELS = [
enable_test
=
False
),
]
V1FlashAttentionImpNotSupported
=
[
"Alibaba-NLP/gte-Qwen2-1.5B-instruct"
,
"Alibaba-NLP/gte-modernbert-base"
]
@
pytest
.
mark
.
parametrize
(
"model_info"
,
MODELS
)
def
test_embed_models_mteb
(
hf_runner
,
vllm_runner
,
model_info
:
EmbedModelInfo
,
monkeypatch
)
->
None
:
if
model_info
.
name
in
V1FlashAttentionImpNotSupported
:
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"0"
)
def
test_embed_models_mteb
(
hf_runner
,
vllm_runner
,
model_info
:
EmbedModelInfo
)
->
None
:
vllm_extra_kwargs
:
dict
[
str
,
Any
]
=
{}
if
model_info
.
architecture
==
"GteNewModel"
:
vllm_extra_kwargs
[
"hf_overrides"
]
=
{
"architectures"
:
[
"GteNewModel"
]}
...
...
@@ -77,11 +70,8 @@ def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo,
@
pytest
.
mark
.
parametrize
(
"model_info"
,
MODELS
)
def
test_embed_models_correctness
(
hf_runner
,
vllm_runner
,
model_info
:
EmbedModelInfo
,
example_prompts
,
monkeypatch
)
->
None
:
if
model_info
.
name
in
V1FlashAttentionImpNotSupported
:
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"0"
)
model_info
:
EmbedModelInfo
,
example_prompts
)
->
None
:
vllm_extra_kwargs
:
dict
[
str
,
Any
]
=
{}
if
model_info
.
architecture
==
"GteNewModel"
:
vllm_extra_kwargs
[
"hf_overrides"
]
=
{
"architectures"
:
[
"GteNewModel"
]}
...
...
tests/models/language/pooling/test_jina.py
View file @
2836dd73
...
...
@@ -4,7 +4,6 @@ from functools import partial
import
pytest
import
vllm.envs
as
envs
from
vllm
import
PoolingParams
from
...utils
import
EmbedModelInfo
,
RerankModelInfo
...
...
@@ -24,14 +23,6 @@ RERANK_MODELS = [
]
@
pytest
.
fixture
(
autouse
=
True
)
def
v1
(
run_with_both_engines
):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
@
pytest
.
mark
.
parametrize
(
"model_info"
,
EMBEDDING_MODELS
)
def
test_embed_models_mteb
(
hf_runner
,
vllm_runner
,
model_info
:
EmbedModelInfo
)
->
None
:
...
...
@@ -63,10 +54,6 @@ def test_embed_models_correctness(hf_runner, vllm_runner,
@
pytest
.
mark
.
parametrize
(
"model_info"
,
RERANK_MODELS
)
def
test_rerank_models_mteb
(
hf_runner
,
vllm_runner
,
model_info
:
RerankModelInfo
)
->
None
:
if
(
model_info
.
architecture
==
"XLMRobertaForSequenceClassification"
and
envs
.
VLLM_USE_V1
):
pytest
.
skip
(
"Not supported yet"
)
mteb_test_rerank_models
(
hf_runner
,
vllm_runner
,
model_info
)
...
...
tests/models/language/pooling/test_qwen3_reranker.py
View file @
2836dd73
...
...
@@ -83,9 +83,6 @@ def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
}
}
if
model_info
.
name
==
"Qwen/Qwen3-Reranker-4B"
:
vllm_extra_kwargs
[
"max_num_seqs"
]
=
1
mteb_test_rerank_models
(
Qwen3RerankerHfRunner
,
vllm_runner
,
model_info
,
vllm_extra_kwargs
)
...
...
@@ -106,9 +103,6 @@ def test_rerank_models_mteb_tp(vllm_runner,
"tensor_parallel_size"
:
2
,
}
if
model_info
.
name
==
"Qwen/Qwen3-Reranker-4B"
:
vllm_extra_kwargs
[
"max_num_seqs"
]
=
1
mteb_test_rerank_models
(
Qwen3RerankerHfRunner
,
vllm_runner
,
model_info
,
...
...
vllm/config.py
View file @
2836dd73
...
...
@@ -776,6 +776,9 @@ class ModelConfig:
raise
ValueError
(
"`override_neuron_config` is only supported on Neuron."
)
# Avoid running try_verify_and_update_config multiple times
self
.
config_updated
=
False
self
.
_verify_quantization
()
self
.
_verify_cuda_graph
()
self
.
_verify_bnb_config
()
...
...
@@ -4914,6 +4917,11 @@ class VllmConfig:
if
self
.
model_config
is
None
:
return
# Avoid running try_verify_and_update_config multiple times
if
getattr
(
self
.
model_config
,
"config_updated"
,
False
):
return
self
.
model_config
.
config_updated
=
True
architecture
=
self
.
model_config
.
architecture
if
architecture
is
None
:
return
...
...
vllm/model_executor/models/bert_with_rope.py
View file @
2836dd73
...
...
@@ -8,7 +8,6 @@ from torch import nn
from
transformers
import
PretrainedConfig
from
vllm.attention
import
Attention
,
AttentionType
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
,
VllmConfig
from
vllm.distributed
import
(
divide
,
get_tensor_model_parallel_rank
,
get_tensor_model_parallel_world_size
,
...
...
@@ -26,7 +25,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.models
import
SupportsV0Only
from
vllm.model_executor.models.interfaces
import
SupportsQuant
from
vllm.model_executor.models.utils
import
WeightsMapper
from
vllm.model_executor.utils
import
set_weight_attrs
...
...
@@ -360,7 +358,6 @@ class BertWithRopeBlock(nn.Module):
return
hidden_states
@
support_torch_compile
class
BertWithRopeEncoder
(
nn
.
Module
):
def
__init__
(
self
,
...
...
@@ -394,7 +391,7 @@ class BertWithRopeEncoder(nn.Module):
return
hidden_states
class
BertWithRope
(
nn
.
Module
,
SupportsV0Only
,
SupportsQuant
):
class
BertWithRope
(
nn
.
Module
,
SupportsQuant
):
hf_to_vllm_mapper
=
WeightsMapper
(
orig_to_new_prefix
=
{
"model."
:
""
})
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
...
...
vllm/model_executor/models/config.py
View file @
2836dd73
...
...
@@ -93,7 +93,7 @@ class NomicBertModelConfig(VerifyAndUpdateConfig):
config
.
num_hidden_layers
=
config
.
n_layer
head_dim
=
config
.
hidden_size
//
config
.
num_attention_heads
rotary_emb_dim
=
head_dim
*
config
.
rotary_emb_fraction
rotary_emb_dim
=
int
(
head_dim
*
config
.
rotary_emb_fraction
)
max_trained_positions
=
getattr
(
config
,
"max_trained_positions"
,
2048
)
config
.
rotary_kwargs
=
{
"head_size"
:
head_dim
,
...
...
vllm/model_executor/models/modernbert.py
View file @
2836dd73
...
...
@@ -8,7 +8,6 @@ from torch import nn
from
transformers
import
ModernBertConfig
from
vllm.attention
import
Attention
,
AttentionType
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
VllmConfig
from
vllm.distributed
import
get_tensor_model_parallel_world_size
from
vllm.model_executor.layers.linear
import
(
QKVParallelLinear
,
...
...
@@ -200,7 +199,6 @@ class ModernBertEncoderLayer(nn.Module):
return
hidden_states
@
support_torch_compile
class
ModernBertModel
(
nn
.
Module
):
hf_to_vllm_mapper
=
WeightsMapper
(
orig_to_new_prefix
=
{
"layers."
:
"encoder_layer.layers."
})
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment