Unverified commit 1adeb3b8, authored by 汪志鹏, committed by GitHub
Browse files

[New Model] BAGEL support (AR only) (#28439)


Signed-off-by: princepride <wangzhipeng628@gmail.com>
Signed-off-by: 汪志鹏 <wangzhipeng628@gmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
parent e3a1cd1c
......@@ -661,6 +661,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `AriaForConditionalGeneration` | Aria | T + I<sup>+</sup> | `rhymes-ai/Aria` | | |
| `AudioFlamingo3ForConditionalGeneration` | AudioFlamingo3 | T + A<sup>+</sup> | `nvidia/audio-flamingo-3-hf`, `nvidia/music-flamingo-hf` | ✅︎ | ✅︎ |
| `AyaVisionForConditionalGeneration` | Aya Vision | T + I<sup>+</sup> | `CohereLabs/aya-vision-8b`, `CohereLabs/aya-vision-32b`, etc. | | ✅︎ |
| `BagelForConditionalGeneration` | BAGEL | T + I<sup>+</sup> | `ByteDance-Seed/BAGEL-7B-MoT` | ✅︎ | ✅︎ |
| `BeeForConditionalGeneration` | Bee-8B | T + I<sup>E+</sup> | `Open-Bee/Bee-8B-RL`, `Open-Bee/Bee-8B-SFT` | | ✅︎ |
| `Blip2ForConditionalGeneration` | BLIP-2 | T + I<sup>E</sup> | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | | ✅︎ |
| `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b`, etc. | | ✅︎ |
......
......@@ -118,6 +118,32 @@ def run_bee(questions: list[str], modality: str) -> ModelRequestData:
)
def run_bagel(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine arguments and prompts for the BAGEL example.

    Args:
        questions: Questions to ask; one prompt is produced per question.
        modality: Input modality; must be ``"image"``.

    Returns:
        A ``ModelRequestData`` carrying the engine arguments and one
        single-turn chat prompt per question, each containing one
        ``<|image_pad|>`` placeholder.
    """
    assert modality == "image"

    engine_args = EngineArgs(
        model="ByteDance-Seed/BAGEL-7B-MoT",
        trust_remote_code=True,
        max_model_len=8192,
        max_num_seqs=2,
        # One multimodal item of the requested modality per prompt.
        limit_mm_per_prompt={modality: 1},
    )

    # Every prompt ends with the opening of the assistant turn.
    assistant_header = "<|im_start|>assistant\n"
    prompts = [
        f"<|im_start|>user\n<|image_pad|>\n{q}<|im_end|>\n" + assistant_header
        for q in questions
    ]

    return ModelRequestData(engine_args=engine_args, prompts=prompts)
# BLIP-2
def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
......@@ -1832,6 +1858,7 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
model_example_map = {
"aria": run_aria,
"aya_vision": run_aya_vision,
"bagel": run_bagel,
"bee": run_bee,
"blip-2": run_blip2,
"chameleon": run_chameleon,
......
......@@ -582,6 +582,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"nvidia/audio-flamingo-3-hf", min_transformers_version="5.0.0.dev"
),
"AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereLabs/aya-vision-8b"),
"BagelForConditionalGeneration": _HfExamplesInfo("ByteDance-Seed/BAGEL-7B-MoT"),
"BeeForConditionalGeneration": _HfExamplesInfo(
"Open-Bee/Bee-8B-RL",
trust_remote_code=True,
......
This diff is collapsed.
......@@ -122,6 +122,8 @@ class Qwen2Attention(nn.Module):
prefix: str = "",
attn_type: str = AttentionType.DECODER,
dual_chunk_attention_config: dict[str, Any] | None = None,
qk_norm: bool = False,
rms_norm_eps: float = 1e-6,
) -> None:
super().__init__()
self.hidden_size = hidden_size
......@@ -144,6 +146,7 @@ class Qwen2Attention(nn.Module):
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.dual_chunk_attention_config = dual_chunk_attention_config
self.qk_norm = qk_norm
self.qkv_proj = QKVParallelLinear(
hidden_size,
......@@ -162,6 +165,11 @@ class Qwen2Attention(nn.Module):
prefix=f"{prefix}.o_proj",
)
# QK Normalization support (used in BAGEL and some other models)
if self.qk_norm:
self.q_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)
self.k_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)
self.rotary_emb = get_rope(
self.head_dim,
max_position=max_position,
......@@ -197,6 +205,23 @@ class Qwen2Attention(nn.Module):
) -> torch.Tensor:
qkv, _ = self.qkv_proj(hidden_states)
q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
# Apply QK normalization if enabled (before RoPE)
if self.qk_norm:
# Reshape to apply per-head normalization
# q shape: (total_tokens, q_size) -> (total_tokens, num_heads, head_dim)
total_tokens = q.shape[0]
q = q.view(total_tokens, self.num_heads, self.head_dim)
k = k.view(total_tokens, self.num_kv_heads, self.head_dim)
# Apply normalization
q = self.q_norm(q)
k = self.k_norm(k)
# Reshape back
q = q.view(total_tokens, self.q_size)
k = k.view(total_tokens, self.kv_size)
q, k = self.rotary_emb(positions, q, k)
attn_output = self.attn(q, k, v)
output, _ = self.o_proj(attn_output)
......@@ -227,6 +252,9 @@ class Qwen2DecoderLayer(nn.Module):
else:
attn_type = AttentionType.ENCODER_ONLY
# Check if QK normalization is enabled (used in BAGEL and some other models)
qk_norm = getattr(config, "qk_norm", False)
self.self_attn = Qwen2Attention(
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
......@@ -238,6 +266,8 @@ class Qwen2DecoderLayer(nn.Module):
prefix=f"{prefix}.self_attn",
attn_type=attn_type,
dual_chunk_attention_config=dual_chunk_attention_config,
qk_norm=qk_norm,
rms_norm_eps=config.rms_norm_eps,
)
self.mlp = Qwen2MLP(
hidden_size=self.hidden_size,
......@@ -480,6 +510,8 @@ class Qwen2Model(nn.Module):
continue
if is_pp_missing_parameter(name, self):
continue
if name not in params_dict:
continue
param = params_dict[name]
weight_loader = getattr(param, "weight_loader", default_weight_loader)
weight_loader(param, loaded_weight)
......
......@@ -272,6 +272,7 @@ _MULTIMODAL_MODELS = {
"aya_vision",
"AyaVisionForConditionalGeneration",
),
"BagelForConditionalGeneration": ("bagel", "BagelForConditionalGeneration"),
"BeeForConditionalGeneration": ("bee", "BeeForConditionalGeneration"),
"Blip2ForConditionalGeneration": ("blip2", "Blip2ForConditionalGeneration"),
"ChameleonForConditionalGeneration": (
......
......@@ -66,6 +66,7 @@ class LazyConfigDict(dict):
_CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict(
afmoe="AfmoeConfig",
bagel="BagelConfig",
chatglm="ChatGLMConfig",
deepseek_vl_v2="DeepseekVLV2Config",
deepseek_v32="DeepseekV3Config",
......
......@@ -16,6 +16,7 @@ import importlib
_CLASS_TO_MODULE: dict[str, str] = {
"AfmoeConfig": "vllm.transformers_utils.configs.afmoe",
"BagelConfig": "vllm.transformers_utils.configs.bagel",
"ChatGLMConfig": "vllm.transformers_utils.configs.chatglm",
"DeepseekVLV2Config": "vllm.transformers_utils.configs.deepseek_vl2",
"DotsOCRConfig": "vllm.transformers_utils.configs.dotsocr",
......@@ -54,6 +55,7 @@ _CLASS_TO_MODULE: dict[str, str] = {
__all__ = [
"AfmoeConfig",
"BagelConfig",
"ChatGLMConfig",
"DeepseekVLV2Config",
"DeepseekV3Config",
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from transformers import PretrainedConfig, SiglipVisionConfig
from transformers.models.qwen2 import Qwen2Config
class BagelConfig(PretrainedConfig):
    """Top-level configuration for the BAGEL model.

    Holds a language-model sub-config (``Qwen2Config``), a vision sub-config
    (``SiglipVisionConfig``), a plain-dict VAE config, and a handful of scalar
    options. Sub-configs supplied as dicts are converted to the corresponding
    config classes; missing ones fall back to default-constructed instances.
    """

    model_type = "bagel"

    def __init__(
        self,
        visual_gen: bool = True,
        visual_und: bool = True,
        llm_config: dict | Qwen2Config | None = None,
        vit_config: dict | SiglipVisionConfig | None = None,
        vae_config: dict | None = None,
        latent_patch_size: int = 2,
        max_latent_size: int = 32,
        vit_max_num_patch_per_side: int = 70,
        connector_act: str = "gelu_pytorch_tanh",
        interpolate_pos: bool = False,
        timestep_shift: float = 1.0,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.visual_gen = visual_gen
        self.visual_und = visual_und

        # Dict sub-configs are expanded into config objects; anything falsy
        # (e.g. None) is replaced by a default-constructed config.
        self.llm_config = (
            Qwen2Config(**llm_config)
            if isinstance(llm_config, dict)
            else llm_config or Qwen2Config()
        )
        self.vit_config = (
            SiglipVisionConfig(**vit_config)
            if isinstance(vit_config, dict)
            else vit_config or SiglipVisionConfig()
        )

        # The VAE config stays a plain dict; a falsy value selects the default.
        self.vae_config = vae_config or {"z_channels": 16, "downsample": 8}

        # Remaining options are stored verbatim.
        self.latent_patch_size = latent_patch_size
        self.max_latent_size = max_latent_size
        self.vit_max_num_patch_per_side = vit_max_num_patch_per_side
        self.connector_act = connector_act
        self.interpolate_pos = interpolate_pos
        self.timestep_shift = timestep_shift

    @property
    def hidden_size(self) -> int:
        """Hidden size, taken from the language-model sub-config."""
        return self.llm_config.hidden_size
......@@ -8,6 +8,7 @@ reasons:
- There is a need to override the existing processor to support vLLM.
"""
from vllm.transformers_utils.processors.bagel import BagelProcessor
from vllm.transformers_utils.processors.deepseek_vl2 import DeepseekVLV2Processor
from vllm.transformers_utils.processors.hunyuan_vl import HunYuanVLProcessor
from vllm.transformers_utils.processors.hunyuan_vl_image import HunYuanVLImageProcessor
......@@ -15,6 +16,7 @@ from vllm.transformers_utils.processors.ovis import OvisProcessor
from vllm.transformers_utils.processors.ovis2_5 import Ovis2_5Processor
__all__ = [
"BagelProcessor",
"DeepseekVLV2Processor",
"HunYuanVLProcessor",
"HunYuanVLImageProcessor",
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright 2025 Bytedance Ltd. and/or its affiliates.
"""BAGEL processor for image and text inputs."""
from transformers import AutoProcessor
from transformers.image_utils import ImageInput
from transformers.processing_utils import ProcessorMixin
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
class BagelProcessor(ProcessorMixin):
    """
    BAGEL processor that wraps a SigLIP image processor and a Qwen2
    tokenizer behind a single callable.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "SiglipImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __call__(
        self,
        text: TextInput
        | PreTokenizedInput
        | list[TextInput]
        | list[PreTokenizedInput] = None,
        images: ImageInput = None,
        **kwargs,
    ):
        """
        Prepare text and/or images for the model.

        Images, when given, go through the image processor (with
        ``return_tensors`` defaulting to ``"pt"``); text, when given, goes
        through the tokenizer. With both present, the image processor's
        ``"pixel_values"`` entry is added to the tokenizer output, which is
        returned. With only one present, that component's output is returned;
        with neither, the result is ``None``.
        """
        pixel_values = None
        if images is not None:
            # Copy so the tokenizer call below still sees the caller's kwargs.
            image_kwargs = dict(kwargs)
            image_kwargs.setdefault("return_tensors", "pt")
            pixel_values = self.image_processor(images, **image_kwargs)

        text_inputs = None
        if text is not None:
            text_inputs = self.tokenizer(text, **kwargs)

        if pixel_values is None:
            return text_inputs
        if text_inputs is None:
            return pixel_values

        # Both modalities present: attach the image tensor to the text output.
        text_inputs["pixel_values"] = pixel_values["pixel_values"]
        return text_inputs

    def batch_decode(self, *args, **kwargs):
        """
        Forward all arguments to the tokenizer's ``batch_decode``.
        """
        decoded = self.tokenizer.batch_decode(*args, **kwargs)
        return decoded

    def decode(self, *args, **kwargs):
        """
        Forward all arguments to the tokenizer's ``decode``.
        """
        decoded = self.tokenizer.decode(*args, **kwargs)
        return decoded

    @property
    def model_input_names(self):
        """Tokenizer input names followed by image-processor input names,
        with duplicates removed and first-seen order kept."""
        text_names = self.tokenizer.model_input_names
        image_names = self.image_processor.model_input_names
        combined = text_names + image_names
        # dict.fromkeys deduplicates while preserving insertion order.
        return list(dict.fromkeys(combined))
# Register BagelProcessor with AutoProcessor under the key "BagelProcessor"
# as an import-time side effect of this module.
AutoProcessor.register("BagelProcessor", BagelProcessor)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment