[New Model] BAGEL support (AR only) (#28439)

Signed-off-by: princepride <wangzhipeng628@gmail.com> Signed-off-by: 汪志鹏 <wangzhipeng628@gmail.com> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>

[New Model] BAGEL support (AR only) (#28439)
Signed-off-by: princepride <wangzhipeng628@gmail.com> Signed-off-by: 汪志鹏 <wangzhipeng628@gmail.com> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
1adeb3b8 · 汪志鹏 · GitHub · e3a1cd1c · 1adeb3b8 · 1adeb3b8
Unverified Commit 1adeb3b8 authored Dec 15, 2025 by 汪志鹏 Committed by GitHub Dec 15, 2025
11 changed files
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -661,6 +661,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
 | `AriaForConditionalGeneration` | Aria | T + I<sup>+</sup> | `rhymes-ai/Aria` | | |
 | `AudioFlamingo3ForConditionalGeneration` | AudioFlamingo3 | T + A<sup>+</sup> | `nvidia/audio-flamingo-3-hf`, `nvidia/music-flamingo-hf` | ✅︎ | ✅︎ |
 | `AyaVisionForConditionalGeneration` | Aya Vision | T + I<sup>+</sup> | `CohereLabs/aya-vision-8b`, `CohereLabs/aya-vision-32b`, etc. | | ✅︎ |
+| `BagelForConditionalGeneration` | BAGEL | T + I<sup>+</sup> | `ByteDance-Seed/BAGEL-7B-MoT` | ✅︎ | ✅︎ |
 | `BeeForConditionalGeneration` | Bee-8B | T + I<sup>E+</sup> | `Open-Bee/Bee-8B-RL`, `Open-Bee/Bee-8B-SFT` | | ✅︎ |
 | `Blip2ForConditionalGeneration` | BLIP-2 | T + I<sup>E</sup> | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | | ✅︎ |
 | `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b`, etc. | | ✅︎ |

--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -118,6 +118,32 @@ def run_bee(questions: list[str], modality: str) -> ModelRequestData:
    )


+def run_bagel(questions: list[str], modality: str) -> ModelRequestData:
+    assert modality == "image"
+    model_name = "ByteDance-Seed/BAGEL-7B-MoT"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=8192,
+        max_num_seqs=2,
+        limit_mm_per_prompt={modality: 1},
+    )
+
+    prompts = [
+        (
+            f"<|im_start|>user\n<|image_pad|>\n{question}<|im_end|>\n"
+            f"<|im_start|>assistant\n"
+        )
+        for question in questions
+    ]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
 # BLIP-2
 def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
@@ -1832,6 +1858,7 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
 model_example_map = {
    "aria": run_aria,
    "aya_vision": run_aya_vision,
+    "bagel": run_bagel,
    "bee": run_bee,
    "blip-2": run_blip2,
    "chameleon": run_chameleon,

--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -582,6 +582,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
        "nvidia/audio-flamingo-3-hf", min_transformers_version="5.0.0.dev"
    ),
    "AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereLabs/aya-vision-8b"),
+    "BagelForConditionalGeneration": _HfExamplesInfo("ByteDance-Seed/BAGEL-7B-MoT"),
    "BeeForConditionalGeneration": _HfExamplesInfo(
        "Open-Bee/Bee-8B-RL",
        trust_remote_code=True,

--- a/vllm/model_executor/models/bagel.py
+++ b/vllm/model_executor/models/bagel.py
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -122,6 +122,8 @@ class Qwen2Attention(nn.Module):
        prefix: str = "",
        attn_type: str = AttentionType.DECODER,
        dual_chunk_attention_config: dict[str, Any] | None = None,
+        qk_norm: bool = False,
+        rms_norm_eps: float = 1e-6,
    ) -> None:
        super().__init__()
        self.hidden_size = hidden_size
@@ -144,6 +146,7 @@ class Qwen2Attention(nn.Module):
        self.kv_size = self.num_kv_heads * self.head_dim
        self.scaling = self.head_dim**-0.5
        self.dual_chunk_attention_config = dual_chunk_attention_config
+        self.qk_norm = qk_norm

        self.qkv_proj = QKVParallelLinear(
            hidden_size,
@@ -162,6 +165,11 @@ class Qwen2Attention(nn.Module):
            prefix=f"{prefix}.o_proj",
        )

+        # QK Normalization support (used in BAGEL and some other models)
+        if self.qk_norm:
+            self.q_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)
+            self.k_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)
+
        self.rotary_emb = get_rope(
            self.head_dim,
            max_position=max_position,
@@ -197,6 +205,23 @@ class Qwen2Attention(nn.Module):
    ) -> torch.Tensor:
        qkv, _ = self.qkv_proj(hidden_states)
        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+
+        # Apply QK normalization if enabled (before RoPE)
+        if self.qk_norm:
+            # Reshape to apply per-head normalization
+            # q shape: (total_tokens, q_size) -> (total_tokens, num_heads, head_dim)
+            total_tokens = q.shape[0]
+            q = q.view(total_tokens, self.num_heads, self.head_dim)
+            k = k.view(total_tokens, self.num_kv_heads, self.head_dim)
+
+            # Apply normalization
+            q = self.q_norm(q)
+            k = self.k_norm(k)
+
+            # Reshape back
+            q = q.view(total_tokens, self.q_size)
+            k = k.view(total_tokens, self.kv_size)
+
        q, k = self.rotary_emb(positions, q, k)
        attn_output = self.attn(q, k, v)
        output, _ = self.o_proj(attn_output)
@@ -227,6 +252,9 @@ class Qwen2DecoderLayer(nn.Module):
        else:
            attn_type = AttentionType.ENCODER_ONLY

+        # Check if QK normalization is enabled (used in BAGEL and some other models)
+        qk_norm = getattr(config, "qk_norm", False)
+
        self.self_attn = Qwen2Attention(
            hidden_size=self.hidden_size,
            num_heads=config.num_attention_heads,
@@ -238,6 +266,8 @@ class Qwen2DecoderLayer(nn.Module):
            prefix=f"{prefix}.self_attn",
            attn_type=attn_type,
            dual_chunk_attention_config=dual_chunk_attention_config,
+            qk_norm=qk_norm,
+            rms_norm_eps=config.rms_norm_eps,
        )
        self.mlp = Qwen2MLP(
            hidden_size=self.hidden_size,
@@ -480,6 +510,8 @@ class Qwen2Model(nn.Module):
                    continue
                if is_pp_missing_parameter(name, self):
                    continue
+                if name not in params_dict:
+                    continue
                param = params_dict[name]
                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                weight_loader(param, loaded_weight)

--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -272,6 +272,7 @@ _MULTIMODAL_MODELS = {
        "aya_vision",
        "AyaVisionForConditionalGeneration",
    ),
+    "BagelForConditionalGeneration": ("bagel", "BagelForConditionalGeneration"),
    "BeeForConditionalGeneration": ("bee", "BeeForConditionalGeneration"),
    "Blip2ForConditionalGeneration": ("blip2", "Blip2ForConditionalGeneration"),
    "ChameleonForConditionalGeneration": (

--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -66,6 +66,7 @@ class LazyConfigDict(dict):

 _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict(
    afmoe="AfmoeConfig",
+    bagel="BagelConfig",
    chatglm="ChatGLMConfig",
    deepseek_vl_v2="DeepseekVLV2Config",
    deepseek_v32="DeepseekV3Config",

--- a/vllm/transformers_utils/configs/__init__.py
+++ b/vllm/transformers_utils/configs/__init__.py
@@ -16,6 +16,7 @@ import importlib

 _CLASS_TO_MODULE: dict[str, str] = {
    "AfmoeConfig": "vllm.transformers_utils.configs.afmoe",
+    "BagelConfig": "vllm.transformers_utils.configs.bagel",
    "ChatGLMConfig": "vllm.transformers_utils.configs.chatglm",
    "DeepseekVLV2Config": "vllm.transformers_utils.configs.deepseek_vl2",
    "DotsOCRConfig": "vllm.transformers_utils.configs.dotsocr",
@@ -54,6 +55,7 @@ _CLASS_TO_MODULE: dict[str, str] = {

 __all__ = [
    "AfmoeConfig",
+    "BagelConfig",
    "ChatGLMConfig",
    "DeepseekVLV2Config",
    "DeepseekV3Config",

--- a/vllm/transformers_utils/configs/bagel.py
+++ b/vllm/transformers_utils/configs/bagel.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from transformers import PretrainedConfig, SiglipVisionConfig
+from transformers.models.qwen2 import Qwen2Config
+
+
+class BagelConfig(PretrainedConfig):
+    """Configuration class for BAGEL model."""
+
+    model_type = "bagel"
+
+    def __init__(
+        self,
+        visual_gen: bool = True,
+        visual_und: bool = True,
+        llm_config: dict | Qwen2Config | None = None,
+        vit_config: dict | SiglipVisionConfig | None = None,
+        vae_config: dict | None = None,
+        latent_patch_size: int = 2,
+        max_latent_size: int = 32,
+        vit_max_num_patch_per_side: int = 70,
+        connector_act: str = "gelu_pytorch_tanh",
+        interpolate_pos: bool = False,
+        timestep_shift: float = 1.0,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.visual_gen = visual_gen
+        self.visual_und = visual_und
+
+        # Convert dict configs to proper config objects
+        if isinstance(llm_config, dict):
+            self.llm_config = Qwen2Config(**llm_config)
+        else:
+            self.llm_config = llm_config or Qwen2Config()
+
+        if isinstance(vit_config, dict):
+            self.vit_config = SiglipVisionConfig(**vit_config)
+        else:
+            self.vit_config = vit_config or SiglipVisionConfig()
+
+        self.vae_config = vae_config or {"z_channels": 16, "downsample": 8}
+        self.latent_patch_size = latent_patch_size
+        self.max_latent_size = max_latent_size
+        self.vit_max_num_patch_per_side = vit_max_num_patch_per_side
+        self.connector_act = connector_act
+        self.interpolate_pos = interpolate_pos
+        self.timestep_shift = timestep_shift
+
+    @property
+    def hidden_size(self) -> int:
+        """Return the hidden size of the language model."""
+        return self.llm_config.hidden_size
--- a/vllm/transformers_utils/processors/__init__.py
+++ b/vllm/transformers_utils/processors/__init__.py
@@ -8,6 +8,7 @@ reasons:
 - There is a need to override the existing processor to support vLLM.
 """

+from vllm.transformers_utils.processors.bagel import BagelProcessor
 from vllm.transformers_utils.processors.deepseek_vl2 import DeepseekVLV2Processor
 from vllm.transformers_utils.processors.hunyuan_vl import HunYuanVLProcessor
 from vllm.transformers_utils.processors.hunyuan_vl_image import HunYuanVLImageProcessor
@@ -15,6 +16,7 @@ from vllm.transformers_utils.processors.ovis import OvisProcessor
 from vllm.transformers_utils.processors.ovis2_5 import Ovis2_5Processor

 __all__ = [
+    "BagelProcessor",
    "DeepseekVLV2Processor",
    "HunYuanVLProcessor",
    "HunYuanVLImageProcessor",

--- a/vllm/transformers_utils/processors/bagel.py
+++ b/vllm/transformers_utils/processors/bagel.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Copyright 2025 Bytedance Ltd. and/or its affiliates.
+"""BAGEL processor for image and text inputs."""
+
+from transformers import AutoProcessor
+from transformers.image_utils import ImageInput
+from transformers.processing_utils import ProcessorMixin
+from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
+
+
+class BagelProcessor(ProcessorMixin):
+    """
+    Constructs a BAGEL processor which wraps a
+    SigLIP image processor and a Qwen2 tokenizer.
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "SiglipImageProcessor"
+    tokenizer_class = "AutoTokenizer"
+
+    def __call__(
+        self,
+        text: TextInput
+        | PreTokenizedInput
+        | list[TextInput]
+        | list[PreTokenizedInput] = None,
+        images: ImageInput = None,
+        **kwargs,
+    ):
+        """
+        Main method to prepare for the model one or several sequences(s) and image(s).
+        """
+        if images is not None:
+            # Process images with the image processor
+            # Ensure return_tensors is set to "pt" for PyTorch tensors
+            image_kwargs = {**kwargs}
+            if "return_tensors" not in image_kwargs:
+                image_kwargs["return_tensors"] = "pt"
+            pixel_values = self.image_processor(images, **image_kwargs)
+        else:
+            pixel_values = None
+
+        text_inputs = self.tokenizer(text, **kwargs) if text is not None else None
+
+        if pixel_values is not None and text_inputs is not None:
+            text_inputs["pixel_values"] = pixel_values["pixel_values"]
+            return text_inputs
+        elif pixel_values is not None:
+            return pixel_values
+        else:
+            return text_inputs
+
+    def batch_decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to Qwen2TokenizerFast's batch_decode.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+
+    def decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to Qwen2TokenizerFast's decode.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
+
+    @property
+    def model_input_names(self):
+        tokenizer_input_names = self.tokenizer.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
+        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+
+
+AutoProcessor.register("BagelProcessor", BagelProcessor)