Unverified commit 2836dd73, authored by wang.yuqi and committed by GitHub
Browse files

[Model][CI] Let more pooling models support v1 (#21747)


Signed-off-by: wang.yuqi <noooop@126.com>
parent d2aab336
...@@ -6,14 +6,6 @@ from transformers import AutoModelForSequenceClassification ...@@ -6,14 +6,6 @@ from transformers import AutoModelForSequenceClassification
from vllm.platforms import current_platform from vllm.platforms import current_platform
# TODO: enable when float32 is supported by V1
# @pytest.fixture(autouse=True)
# def v1(run_with_both_engines):
# # Simple autouse wrapper to run both engines for each test
# # This can be promoted up to conftest.py to run for every
# # test in a package
# pass
@pytest.mark.parametrize( @pytest.mark.parametrize(
"model", "model",
......
...@@ -56,17 +56,10 @@ MODELS = [ ...@@ -56,17 +56,10 @@ MODELS = [
enable_test=False), enable_test=False),
] ]
V1FlashAttentionImpNotSupported = [
"Alibaba-NLP/gte-Qwen2-1.5B-instruct", "Alibaba-NLP/gte-modernbert-base"
]
@pytest.mark.parametrize("model_info", MODELS) @pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo, def test_embed_models_mteb(hf_runner, vllm_runner,
monkeypatch) -> None: model_info: EmbedModelInfo) -> None:
if model_info.name in V1FlashAttentionImpNotSupported:
monkeypatch.setenv("VLLM_USE_V1", "0")
vllm_extra_kwargs: dict[str, Any] = {} vllm_extra_kwargs: dict[str, Any] = {}
if model_info.architecture == "GteNewModel": if model_info.architecture == "GteNewModel":
vllm_extra_kwargs["hf_overrides"] = {"architectures": ["GteNewModel"]} vllm_extra_kwargs["hf_overrides"] = {"architectures": ["GteNewModel"]}
...@@ -77,11 +70,8 @@ def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo, ...@@ -77,11 +70,8 @@ def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo,
@pytest.mark.parametrize("model_info", MODELS) @pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_correctness(hf_runner, vllm_runner, def test_embed_models_correctness(hf_runner, vllm_runner,
model_info: EmbedModelInfo, example_prompts, model_info: EmbedModelInfo,
monkeypatch) -> None: example_prompts) -> None:
if model_info.name in V1FlashAttentionImpNotSupported:
monkeypatch.setenv("VLLM_USE_V1", "0")
vllm_extra_kwargs: dict[str, Any] = {} vllm_extra_kwargs: dict[str, Any] = {}
if model_info.architecture == "GteNewModel": if model_info.architecture == "GteNewModel":
vllm_extra_kwargs["hf_overrides"] = {"architectures": ["GteNewModel"]} vllm_extra_kwargs["hf_overrides"] = {"architectures": ["GteNewModel"]}
......
...@@ -4,7 +4,6 @@ from functools import partial ...@@ -4,7 +4,6 @@ from functools import partial
import pytest import pytest
import vllm.envs as envs
from vllm import PoolingParams from vllm import PoolingParams
from ...utils import EmbedModelInfo, RerankModelInfo from ...utils import EmbedModelInfo, RerankModelInfo
...@@ -24,14 +23,6 @@ RERANK_MODELS = [ ...@@ -24,14 +23,6 @@ RERANK_MODELS = [
] ]
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
@pytest.mark.parametrize("model_info", EMBEDDING_MODELS) @pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner, def test_embed_models_mteb(hf_runner, vllm_runner,
model_info: EmbedModelInfo) -> None: model_info: EmbedModelInfo) -> None:
...@@ -63,10 +54,6 @@ def test_embed_models_correctness(hf_runner, vllm_runner, ...@@ -63,10 +54,6 @@ def test_embed_models_correctness(hf_runner, vllm_runner,
@pytest.mark.parametrize("model_info", RERANK_MODELS) @pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(hf_runner, vllm_runner, def test_rerank_models_mteb(hf_runner, vllm_runner,
model_info: RerankModelInfo) -> None: model_info: RerankModelInfo) -> None:
if (model_info.architecture == "XLMRobertaForSequenceClassification"
and envs.VLLM_USE_V1):
pytest.skip("Not supported yet")
mteb_test_rerank_models(hf_runner, vllm_runner, model_info) mteb_test_rerank_models(hf_runner, vllm_runner, model_info)
......
...@@ -83,9 +83,6 @@ def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None: ...@@ -83,9 +83,6 @@ def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
} }
} }
if model_info.name == "Qwen/Qwen3-Reranker-4B":
vllm_extra_kwargs["max_num_seqs"] = 1
mteb_test_rerank_models(Qwen3RerankerHfRunner, vllm_runner, model_info, mteb_test_rerank_models(Qwen3RerankerHfRunner, vllm_runner, model_info,
vllm_extra_kwargs) vllm_extra_kwargs)
...@@ -106,9 +103,6 @@ def test_rerank_models_mteb_tp(vllm_runner, ...@@ -106,9 +103,6 @@ def test_rerank_models_mteb_tp(vllm_runner,
"tensor_parallel_size": 2, "tensor_parallel_size": 2,
} }
if model_info.name == "Qwen/Qwen3-Reranker-4B":
vllm_extra_kwargs["max_num_seqs"] = 1
mteb_test_rerank_models(Qwen3RerankerHfRunner, mteb_test_rerank_models(Qwen3RerankerHfRunner,
vllm_runner, vllm_runner,
model_info, model_info,
......
...@@ -776,6 +776,9 @@ class ModelConfig: ...@@ -776,6 +776,9 @@ class ModelConfig:
raise ValueError( raise ValueError(
"`override_neuron_config` is only supported on Neuron.") "`override_neuron_config` is only supported on Neuron.")
# Avoid running try_verify_and_update_config multiple times
self.config_updated = False
self._verify_quantization() self._verify_quantization()
self._verify_cuda_graph() self._verify_cuda_graph()
self._verify_bnb_config() self._verify_bnb_config()
...@@ -4914,6 +4917,11 @@ class VllmConfig: ...@@ -4914,6 +4917,11 @@ class VllmConfig:
if self.model_config is None: if self.model_config is None:
return return
# Avoid running try_verify_and_update_config multiple times
if getattr(self.model_config, "config_updated", False):
return
self.model_config.config_updated = True
architecture = self.model_config.architecture architecture = self.model_config.architecture
if architecture is None: if architecture is None:
return return
......
...@@ -8,7 +8,6 @@ from torch import nn ...@@ -8,7 +8,6 @@ from torch import nn
from transformers import PretrainedConfig from transformers import PretrainedConfig
from vllm.attention import Attention, AttentionType from vllm.attention import Attention, AttentionType
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import (divide, get_tensor_model_parallel_rank, from vllm.distributed import (divide, get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size, get_tensor_model_parallel_world_size,
...@@ -26,7 +25,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope ...@@ -26,7 +25,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.layers.vocab_parallel_embedding import (
VocabParallelEmbedding) VocabParallelEmbedding)
from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models import SupportsV0Only
from vllm.model_executor.models.interfaces import SupportsQuant from vllm.model_executor.models.interfaces import SupportsQuant
from vllm.model_executor.models.utils import WeightsMapper from vllm.model_executor.models.utils import WeightsMapper
from vllm.model_executor.utils import set_weight_attrs from vllm.model_executor.utils import set_weight_attrs
...@@ -360,7 +358,6 @@ class BertWithRopeBlock(nn.Module): ...@@ -360,7 +358,6 @@ class BertWithRopeBlock(nn.Module):
return hidden_states return hidden_states
@support_torch_compile
class BertWithRopeEncoder(nn.Module): class BertWithRopeEncoder(nn.Module):
def __init__(self, def __init__(self,
...@@ -394,7 +391,7 @@ class BertWithRopeEncoder(nn.Module): ...@@ -394,7 +391,7 @@ class BertWithRopeEncoder(nn.Module):
return hidden_states return hidden_states
class BertWithRope(nn.Module, SupportsV0Only, SupportsQuant): class BertWithRope(nn.Module, SupportsQuant):
hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""}) hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""})
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
......
...@@ -93,7 +93,7 @@ class NomicBertModelConfig(VerifyAndUpdateConfig): ...@@ -93,7 +93,7 @@ class NomicBertModelConfig(VerifyAndUpdateConfig):
config.num_hidden_layers = config.n_layer config.num_hidden_layers = config.n_layer
head_dim = config.hidden_size // config.num_attention_heads head_dim = config.hidden_size // config.num_attention_heads
rotary_emb_dim = head_dim * config.rotary_emb_fraction rotary_emb_dim = int(head_dim * config.rotary_emb_fraction)
max_trained_positions = getattr(config, "max_trained_positions", 2048) max_trained_positions = getattr(config, "max_trained_positions", 2048)
config.rotary_kwargs = { config.rotary_kwargs = {
"head_size": head_dim, "head_size": head_dim,
......
...@@ -8,7 +8,6 @@ from torch import nn ...@@ -8,7 +8,6 @@ from torch import nn
from transformers import ModernBertConfig from transformers import ModernBertConfig
from vllm.attention import Attention, AttentionType from vllm.attention import Attention, AttentionType
from vllm.compilation.decorators import support_torch_compile
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.model_executor.layers.linear import (QKVParallelLinear, from vllm.model_executor.layers.linear import (QKVParallelLinear,
...@@ -200,7 +199,6 @@ class ModernBertEncoderLayer(nn.Module): ...@@ -200,7 +199,6 @@ class ModernBertEncoderLayer(nn.Module):
return hidden_states return hidden_states
@support_torch_compile
class ModernBertModel(nn.Module): class ModernBertModel(nn.Module):
hf_to_vllm_mapper = WeightsMapper( hf_to_vllm_mapper = WeightsMapper(
orig_to_new_prefix={"layers.": "encoder_layer.layers."}) orig_to_new_prefix={"layers.": "encoder_layer.layers."})
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment