"benchmarks/vscode:/vscode.git/clone" did not exist on "e23564cb703916efef20d80fd1c32dd76dee0979"
Unverified commit c0817e4d, authored by bsliu, committed by GitHub
Browse files

[Model] Add support for Cheers multimodal model (#38788)


Signed-off-by: bsliu <1187291748@qq.com>
Signed-off-by: 吴炳贤 <wubingxian24@mails.ucas.ac.cn>
parent dfe5e316
...@@ -541,6 +541,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen ...@@ -541,6 +541,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `BeeForConditionalGeneration` | Bee-8B | T + I<sup>E+</sup> | `Open-Bee/Bee-8B-RL`, `Open-Bee/Bee-8B-SFT` | | ✅︎ | | `BeeForConditionalGeneration` | Bee-8B | T + I<sup>E+</sup> | `Open-Bee/Bee-8B-RL`, `Open-Bee/Bee-8B-SFT` | | ✅︎ |
| `Blip2ForConditionalGeneration` | BLIP-2 | T + I<sup>E</sup> | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | ✅︎ | ✅︎ | | `Blip2ForConditionalGeneration` | BLIP-2 | T + I<sup>E</sup> | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | ✅︎ | ✅︎ |
| `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b`, etc. | | ✅︎ | | `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b`, etc. | | ✅︎ |
| `CheersForConditionalGeneration` | Cheers | T + I | `ai9stars/Cheers` | | ✅︎ |
| `Cohere2VisionForConditionalGeneration` | Command A Vision | T + I<sup>+</sup> | `CohereLabs/command-a-vision-07-2025`, etc. | | ✅︎ | | `Cohere2VisionForConditionalGeneration` | Command A Vision | T + I<sup>+</sup> | `CohereLabs/command-a-vision-07-2025`, etc. | | ✅︎ |
| `DeepseekVLV2ForCausalLM` | DeepSeek-VL2 | T + I<sup>+</sup> | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2`, etc. | | ✅︎ | | `DeepseekVLV2ForCausalLM` | DeepSeek-VL2 | T + I<sup>+</sup> | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2`, etc. | | ✅︎ |
| `DeepseekOCRForCausalLM` | DeepSeek-OCR | T + I<sup>+</sup> | `deepseek-ai/DeepSeek-OCR`, etc. | ✅︎ | ✅︎ | | `DeepseekOCRForCausalLM` | DeepSeek-OCR | T + I<sup>+</sup> | `deepseek-ai/DeepSeek-OCR`, etc. | ✅︎ | ✅︎ |
......
...@@ -179,6 +179,33 @@ def run_chameleon(questions: list[str], modality: str) -> ModelRequestData: ...@@ -179,6 +179,33 @@ def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
) )
# Cheers
def run_cheers(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine arguments and prompts for the Cheers example.

    Only the ``"image"`` modality is accepted. Each question is wrapped in
    a system/user/assistant chat layout with a single ``<|image_pad|>``
    placeholder placed directly before the question text.
    """
    assert modality == "image"

    def build_prompt(question: str) -> str:
        # One image placeholder per prompt, matching the per-prompt limit
        # of one item configured on the engine below.
        return (
            "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
            f"<|im_start|>user\n<|image_pad|>{question}<|im_end|>\n"
            "<|im_start|>assistant\n"
        )

    return ModelRequestData(
        engine_args=EngineArgs(
            model="ai9stars/Cheers",
            trust_remote_code=True,
            max_model_len=4096,
            limit_mm_per_prompt={modality: 1},
        ),
        prompts=[build_prompt(question) for question in questions],
    )
def run_command_a_vision(questions: list[str], modality: str) -> ModelRequestData: def run_command_a_vision(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image" assert modality == "image"
...@@ -2140,6 +2167,7 @@ model_example_map = { ...@@ -2140,6 +2167,7 @@ model_example_map = {
"aria": run_aria, "aria": run_aria,
"aya_vision": run_aya_vision, "aya_vision": run_aya_vision,
"bagel": run_bagel, "bagel": run_bagel,
"cheers": run_cheers,
"bee": run_bee, "bee": run_bee,
"blip-2": run_blip2, "blip-2": run_blip2,
"chameleon": run_chameleon, "chameleon": run_chameleon,
......
...@@ -766,6 +766,14 @@ _MULTIMODAL_EXAMPLE_MODELS = { ...@@ -766,6 +766,14 @@ _MULTIMODAL_EXAMPLE_MODELS = {
extras={"6b": "Salesforce/blip2-opt-6.7b"}, extras={"6b": "Salesforce/blip2-opt-6.7b"},
), ),
"ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"), "ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"),
"Cheers": _HfExamplesInfo(
"ai9stars/Cheers",
trust_remote_code=True,
),
"CheersForConditionalGeneration": _HfExamplesInfo(
"ai9stars/Cheers",
trust_remote_code=True,
),
"Cohere2VisionForConditionalGeneration": _HfExamplesInfo( "Cohere2VisionForConditionalGeneration": _HfExamplesInfo(
"CohereLabs/command-a-vision-07-2025" "CohereLabs/command-a-vision-07-2025"
), ),
......
...@@ -1203,6 +1203,7 @@ class ModelConfig: ...@@ -1203,6 +1203,7 @@ class ModelConfig:
"gemma3", "gemma3",
"molmo2", "molmo2",
"paligemma", "paligemma",
"umm",
) )
if not hasattr(self.hf_config, "model_type"): if not hasattr(self.hf_config, "model_type"):
return False return False
......
This diff is collapsed.
...@@ -350,6 +350,8 @@ _MULTIMODAL_MODELS = { ...@@ -350,6 +350,8 @@ _MULTIMODAL_MODELS = {
"chameleon", "chameleon",
"ChameleonForConditionalGeneration", "ChameleonForConditionalGeneration",
), ),
"Cheers": ("cheers", "CheersForConditionalGeneration"),
"CheersForConditionalGeneration": ("cheers", "CheersForConditionalGeneration"),
"Cohere2VisionForConditionalGeneration": ( "Cohere2VisionForConditionalGeneration": (
"cohere2_vision", "cohere2_vision",
"Cohere2VisionForConditionalGeneration", "Cohere2VisionForConditionalGeneration",
......
...@@ -80,6 +80,7 @@ class LazyConfigDict(dict): ...@@ -80,6 +80,7 @@ class LazyConfigDict(dict):
_CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict( _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict(
afmoe="AfmoeConfig", afmoe="AfmoeConfig",
bagel="BagelConfig", bagel="BagelConfig",
umm="CheersConfig",
chatglm="ChatGLMConfig", chatglm="ChatGLMConfig",
colmodernvbert="ColModernVBertConfig", colmodernvbert="ColModernVBertConfig",
colpali="ColPaliConfig", colpali="ColPaliConfig",
......
...@@ -18,6 +18,7 @@ _CLASS_TO_MODULE: dict[str, str] = { ...@@ -18,6 +18,7 @@ _CLASS_TO_MODULE: dict[str, str] = {
"AfmoeConfig": "vllm.transformers_utils.configs.afmoe", "AfmoeConfig": "vllm.transformers_utils.configs.afmoe",
"AXK1Config": "vllm.transformers_utils.configs.AXK1", "AXK1Config": "vllm.transformers_utils.configs.AXK1",
"BagelConfig": "vllm.transformers_utils.configs.bagel", "BagelConfig": "vllm.transformers_utils.configs.bagel",
"CheersConfig": "vllm.transformers_utils.configs.cheers",
"ChatGLMConfig": "vllm.transformers_utils.configs.chatglm", "ChatGLMConfig": "vllm.transformers_utils.configs.chatglm",
"ColModernVBertConfig": "vllm.transformers_utils.configs.colmodernvbert", "ColModernVBertConfig": "vllm.transformers_utils.configs.colmodernvbert",
"ColPaliConfig": "vllm.transformers_utils.configs.colpali", "ColPaliConfig": "vllm.transformers_utils.configs.colpali",
...@@ -75,6 +76,7 @@ __all__ = [ ...@@ -75,6 +76,7 @@ __all__ = [
"AfmoeConfig", "AfmoeConfig",
"AXK1Config", "AXK1Config",
"BagelConfig", "BagelConfig",
"CheersConfig",
"ChatGLMConfig", "ChatGLMConfig",
"ColModernVBertConfig", "ColModernVBertConfig",
"ColPaliConfig", "ColPaliConfig",
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from transformers import PretrainedConfig, SiglipVisionConfig
from transformers.modeling_rope_utils import rope_config_validation
class CheersTextConfig(PretrainedConfig):
    """Qwen2-based text config with Cheers-specific defaults.

    Derived values set by ``__init__``:

    * ``sliding_window`` is stored as ``None`` unless ``use_sliding_window``
      is true.
    * ``num_key_value_heads`` falls back to ``num_attention_heads`` when
      passed as ``None``.
    * ``layer_types`` (when not supplied) is built per layer: layers with
      index ``>= max_window_layers`` are ``"sliding_attention"`` if a
      sliding window is active, every other layer is ``"full_attention"``.
    * A legacy ``"type"`` key in ``rope_scaling`` is mirrored to
      ``"rope_type"`` before the RoPE configuration is validated.
    """

    model_type = "umm"
    base_config_key = "text_config"

    def __init__(
        self,
        vocab_size=152064,
        hidden_size=3584,
        intermediate_size=18944,
        num_hidden_layers=28,
        num_attention_heads=28,
        num_key_value_heads=4,
        hidden_act="silu",
        max_position_embeddings=131072,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=1000000.0,
        rope_scaling=None,
        use_sliding_window=False,
        sliding_window=131072,
        max_window_layers=28,
        layer_types=None,
        attention_dropout=0.0,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.use_sliding_window = use_sliding_window
        # The window size is only kept when sliding-window attention is on.
        self.sliding_window = sliding_window if self.use_sliding_window else None
        self.max_window_layers = max_window_layers

        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads

        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta

        # Work on a private copy: the legacy-key normalisation below writes
        # into the dict and must not leak into the caller's object (e.g. the
        # nested ``text_config`` dict handed in by the parent config).
        if isinstance(rope_scaling, dict):
            rope_scaling = dict(rope_scaling)
        self.rope_scaling = rope_scaling
        self.attention_dropout = attention_dropout

        # Backward compatibility: mirror the old "type" key to "rope_type".
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
        rope_config_validation(self)

        self.layer_types = layer_types
        if self.layer_types is None:
            self.layer_types = [
                "sliding_attention"
                if self.sliding_window is not None and i >= self.max_window_layers
                else "full_attention"
                for i in range(self.num_hidden_layers)
            ]

        super().__init__(
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
class CheersConfig(PretrainedConfig):
    """Top-level configuration for the Cheers (UMM) model.

    Holds a ``CheersTextConfig``, a ``SiglipVisionConfig`` for the vision
    representation, and two plain dicts for the VAE encoder and decoder.
    The two sub-configs may be passed as ready-made config objects or as
    dicts of keyword arguments; anything missing or falsy is replaced by
    a default.
    """

    model_type = "umm"

    def __init__(
        self,
        text_config: dict | CheersTextConfig | None = None,
        vision_representation_config: dict | SiglipVisionConfig | None = None,
        vae_encoder_config: dict | None = None,
        vae_decoder_config: dict | None = None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Text sub-config: build from a dict, keep a given object, or
        # fall back to defaults.
        if isinstance(text_config, dict):
            text_config = CheersTextConfig(**text_config)
        elif not text_config:
            text_config = CheersTextConfig()
        self.text_config = text_config

        # Vision sub-config: same three-way resolution.
        if isinstance(vision_representation_config, dict):
            vision_representation_config = SiglipVisionConfig(
                **vision_representation_config
            )
        elif not vision_representation_config:
            vision_representation_config = SiglipVisionConfig()
        self.vision_representation_config = vision_representation_config

        # VAE settings stay as plain dicts; a missing or empty value gets
        # the default resolution of 512.
        if not vae_encoder_config:
            vae_encoder_config = {"resolution": 512}
        self.vae_encoder_config = vae_encoder_config

        if not vae_decoder_config:
            vae_decoder_config = {"resolution": 512}
        self.vae_decoder_config = vae_decoder_config

    @property
    def hidden_size(self) -> int:
        """Hidden size of the language model, read from ``text_config``."""
        return self.text_config.hidden_size
...@@ -12,6 +12,7 @@ import importlib ...@@ -12,6 +12,7 @@ import importlib
__all__ = [ __all__ = [
"BagelProcessor", "BagelProcessor",
"CheersProcessor",
"CohereASRProcessor", "CohereASRProcessor",
"DeepseekVLV2Processor", "DeepseekVLV2Processor",
"FireRedASR2Processor", "FireRedASR2Processor",
...@@ -39,6 +40,7 @@ __all__ = [ ...@@ -39,6 +40,7 @@ __all__ = [
_CLASS_TO_MODULE: dict[str, str] = { _CLASS_TO_MODULE: dict[str, str] = {
"BagelProcessor": "vllm.transformers_utils.processors.bagel", "BagelProcessor": "vllm.transformers_utils.processors.bagel",
"CheersProcessor": "vllm.transformers_utils.processors.cheers",
"CohereASRProcessor": "vllm.transformers_utils.processors.cohere_asr", "CohereASRProcessor": "vllm.transformers_utils.processors.cohere_asr",
"DeepseekVLV2Processor": "vllm.transformers_utils.processors.deepseek_vl2", "DeepseekVLV2Processor": "vllm.transformers_utils.processors.deepseek_vl2",
"FireRedASR2Processor": "vllm.transformers_utils.processors.fireredasr2", "FireRedASR2Processor": "vllm.transformers_utils.processors.fireredasr2",
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Cheers (UMM) processor for image and text inputs."""
from transformers import AutoProcessor
from transformers.feature_extraction_utils import BatchFeature
from transformers.image_utils import ImageInput
from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
class CheersProcessorKwargs(ProcessingKwargs, total=False):  # type: ignore[call-arg]
    """Keyword-argument schema for ``CheersProcessor``.

    The only default supplied here asks the image processor to return
    PyTorch tensors.
    """

    _defaults = {"images_kwargs": {"return_tensors": "pt"}}
class CheersProcessor(ProcessorMixin):
    """
    Constructs a Cheers processor which wraps a
    SigLIP image processor and a Qwen2 tokenizer.

    Calling the processor runs the image processor on ``images`` and the
    tokenizer on ``text`` (either may be omitted) and merges both results
    into a single ``BatchFeature``.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __call__(
        self,
        text: TextInput
        | PreTokenizedInput
        | list[TextInput]
        | list[PreTokenizedInput]
        | None = None,
        images: ImageInput | None = None,
        **kwargs: Unpack[CheersProcessorKwargs],
    ):
        """Preprocess images and/or text into one ``BatchFeature``.

        Args:
            text: Prompt(s) forwarded to the tokenizer; skipped when ``None``.
            images: A single image, or a list/tuple of images. ``None`` and
                an empty list/tuple both mean "no images".
            **kwargs: Processing options, merged with the defaults of
                ``CheersProcessorKwargs`` and the tokenizer's init kwargs.

        Returns:
            ``BatchFeature`` containing the image features (if any)
            followed by the tokenizer outputs (if any).
        """
        output_kwargs = self._merge_kwargs(
            CheersProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        if images is None:
            image_inputs = {}
        else:
            image_inputs = self._process_images(
                images, output_kwargs["images_kwargs"]
            )

        if text is None:
            text_inputs = {}
        else:
            text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])

        return BatchFeature(data={**image_inputs, **text_inputs})

    def _process_images(self, images, images_kwargs):
        """Run the image processor on one image or a list/tuple of images."""
        if not isinstance(images, (list, tuple)):
            return self.image_processor(images, **images_kwargs)

        # An empty collection carries no images. Returning early avoids
        # calling torch.cat on an empty list, which raises.
        if not images:
            return {}

        import torch

        # Images are processed one at a time and concatenated along dim 0.
        pixel_value_parts = []
        grid_hw_parts = []
        for image in images:
            features = self.image_processor(image, **images_kwargs)
            pixel_value_parts.append(features["pixel_values"])
            # "grid_hws" is optional in the image processor's output.
            if "grid_hws" in features:
                grid_hw_parts.append(features["grid_hws"])

        merged = {"pixel_values": torch.cat(pixel_value_parts, dim=0)}
        if grid_hw_parts:
            merged["grid_hws"] = torch.cat(grid_hw_parts, dim=0)
        return merged

    def batch_decode(self, *args, **kwargs):
        """Forward all arguments to the tokenizer's ``batch_decode``."""
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """Forward all arguments to the tokenizer's ``decode``."""
        return self.tokenizer.decode(*args, **kwargs)

    @property
    def model_input_names(self):
        """Tokenizer then image-processor input names, de-duplicated in order."""
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
# Register CheersProcessor with transformers' AutoProcessor under the string
# key "CheersProcessor".
# NOTE(review): presumably this lets the processor be resolved by class name
# at load time — confirm against how the loader looks it up.
AutoProcessor.register("CheersProcessor", CheersProcessor)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment