Unverified commit 0ff05e37, authored by Isotr0py and committed by GitHub
Browse files

[Bugfix] Fix encoder-only model support for transformers backend (#28021)


Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent 428bc7bf
...@@ -899,27 +899,27 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = { ...@@ -899,27 +899,27 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
_TRANSFORMERS_BACKEND_MODELS = { _TRANSFORMERS_BACKEND_MODELS = {
"TransformersEmbeddingModel": _HfExamplesInfo( "TransformersEmbeddingModel": _HfExamplesInfo(
"BAAI/bge-base-en-v1.5", min_transformers_version="5.0.0" "BAAI/bge-base-en-v1.5", min_transformers_version="5.0.0.dev"
), ),
"TransformersForSequenceClassification": _HfExamplesInfo( "TransformersForSequenceClassification": _HfExamplesInfo(
"papluca/xlm-roberta-base-language-detection", "papluca/xlm-roberta-base-language-detection",
min_transformers_version="5.0.0", min_transformers_version="5.0.0.dev",
), ),
"TransformersForCausalLM": _HfExamplesInfo( "TransformersForCausalLM": _HfExamplesInfo(
"hmellor/Ilama-3.2-1B", trust_remote_code=True "hmellor/Ilama-3.2-1B", trust_remote_code=True
), ),
"TransformersMultiModalForCausalLM": _HfExamplesInfo("BAAI/Emu3-Chat-hf"), "TransformersMultiModalForCausalLM": _HfExamplesInfo("BAAI/Emu3-Chat-hf"),
"TransformersMoEForCausalLM": _HfExamplesInfo( "TransformersMoEForCausalLM": _HfExamplesInfo(
"allenai/OLMoE-1B-7B-0924", min_transformers_version="5.0.0" "allenai/OLMoE-1B-7B-0924", min_transformers_version="5.0.0.dev"
), ),
"TransformersMultiModalMoEForCausalLM": _HfExamplesInfo( "TransformersMultiModalMoEForCausalLM": _HfExamplesInfo(
"Qwen/Qwen3-VL-30B-A3B-Instruct", min_transformers_version="5.0.0" "Qwen/Qwen3-VL-30B-A3B-Instruct", min_transformers_version="5.0.0.dev"
), ),
"TransformersMoEEmbeddingModel": _HfExamplesInfo( "TransformersMoEEmbeddingModel": _HfExamplesInfo(
"Qwen/Qwen3-30B-A3B", min_transformers_version="5.0.0" "Qwen/Qwen3-30B-A3B", min_transformers_version="5.0.0.dev"
), ),
"TransformersMoEForSequenceClassification": _HfExamplesInfo( "TransformersMoEForSequenceClassification": _HfExamplesInfo(
"Qwen/Qwen3-30B-A3B", min_transformers_version="5.0.0" "Qwen/Qwen3-30B-A3B", min_transformers_version="5.0.0.dev"
), ),
"TransformersMultiModalEmbeddingModel": _HfExamplesInfo("google/gemma-3-4b-it"), "TransformersMultiModalEmbeddingModel": _HfExamplesInfo("google/gemma-3-4b-it"),
"TransformersMultiModalForSequenceClassification": _HfExamplesInfo( "TransformersMultiModalForSequenceClassification": _HfExamplesInfo(
......
...@@ -82,7 +82,7 @@ def test_models( ...@@ -82,7 +82,7 @@ def test_models(
from packaging.version import Version from packaging.version import Version
installed = Version(transformers.__version__) installed = Version(transformers.__version__)
required = Version("5.0.0") required = Version("5.0.0.dev")
if model == "allenai/OLMoE-1B-7B-0924" and installed < required: if model == "allenai/OLMoE-1B-7B-0924" and installed < required:
pytest.skip( pytest.skip(
"MoE models with the Transformers backend require " "MoE models with the Transformers backend require "
......
...@@ -28,6 +28,7 @@ from transformers import AutoModel ...@@ -28,6 +28,7 @@ from transformers import AutoModel
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
from vllm.attention import Attention, AttentionType from vllm.attention import Attention, AttentionType
from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention
from vllm.config.utils import getattr_iter from vllm.config.utils import getattr_iter
from vllm.distributed import get_pp_group, get_tp_group from vllm.distributed import get_pp_group, get_tp_group
from vllm.distributed.utils import get_pp_indices from vllm.distributed.utils import get_pp_indices
...@@ -317,7 +318,7 @@ class Base(nn.Module, VllmModel, SupportsQuant, SupportsLoRA, SupportsPP): ...@@ -317,7 +318,7 @@ class Base(nn.Module, VllmModel, SupportsQuant, SupportsLoRA, SupportsPP):
# vLLM does not support encoder-decoder models, so if any encoder layer is # vLLM does not support encoder-decoder models, so if any encoder layer is
# found in a text only model, we assume the whole model is an encoder model # found in a text only model, we assume the whole model is an encoder model
if has_encoder(self.model) and not is_multimodal(self.config): if has_encoder(self.model) and not is_multimodal(self.config):
self.check_version("4.57.0.dev0", "encoder models support") self.check_version("5.0.0.dev0", "encoder models support")
attn_type = AttentionType.ENCODER_ONLY attn_type = AttentionType.ENCODER_ONLY
else: else:
attn_type = AttentionType.DECODER attn_type = AttentionType.DECODER
...@@ -336,7 +337,12 @@ class Base(nn.Module, VllmModel, SupportsQuant, SupportsLoRA, SupportsPP): ...@@ -336,7 +337,12 @@ class Base(nn.Module, VllmModel, SupportsQuant, SupportsLoRA, SupportsPP):
): ):
per_layer_sliding_window = self.config.sliding_window per_layer_sliding_window = self.config.sliding_window
attention_instances[i] = Attention( attn_cls = (
EncoderOnlyAttention
if attn_type == AttentionType.ENCODER_ONLY
else Attention
)
attention_instances[i] = attn_cls(
num_heads=num_heads, num_heads=num_heads,
head_size=head_size, head_size=head_size,
# NOTE: We use Llama scale as default, if it's set by # NOTE: We use Llama scale as default, if it's set by
......
...@@ -115,7 +115,7 @@ direct_register_custom_op( ...@@ -115,7 +115,7 @@ direct_register_custom_op(
class MoEMixin(MixtureOfExperts): class MoEMixin(MixtureOfExperts):
def __init__(self, *, vllm_config: "VllmConfig", prefix: str = ""): def __init__(self, *, vllm_config: "VllmConfig", prefix: str = ""):
self.check_version("4.57.0.dev0", "MoE models support") self.check_version("5.0.0.dev0", "MoE models support")
# Skip MixtureOfExperts.__init__ and call the next class in MRO # Skip MixtureOfExperts.__init__ and call the next class in MRO
super(MixtureOfExperts, self).__init__(vllm_config=vllm_config, prefix=prefix) super(MixtureOfExperts, self).__init__(vllm_config=vllm_config, prefix=prefix)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment