[Model] Add support for Cheers multimodal model (#38788)

Signed-off-by: bsliu <1187291748@qq.com>
Signed-off-by: 吴炳贤 <wubingxian24@mails.ucas.ac.cn>

[Model] Add support for Cheers multimodal model (#38788)
Signed-off-by: bsliu <1187291748@qq.com>
Signed-off-by: 吴炳贤 <wubingxian24@mails.ucas.ac.cn>
c0817e4d · bsliu · GitHub · dfe5e316 · c0817e4d · c0817e4d
Unverified Commit c0817e4d authored Apr 02, 2026 by bsliu Committed by GitHub Apr 02, 2026
11 changed files
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -541,6 +541,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
 | `BeeForConditionalGeneration` | Bee-8B | T + I<sup>E+</sup> | `Open-Bee/Bee-8B-RL`, `Open-Bee/Bee-8B-SFT` | | ✅︎ |
 | `Blip2ForConditionalGeneration` | BLIP-2 | T + I<sup>E</sup> | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | ✅︎ | ✅︎ |
 | `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b`, etc. | | ✅︎ |
+| `CheersForConditionalGeneration` | Cheers | T + I | `ai9stars/Cheers` | | ✅︎ |
 | `Cohere2VisionForConditionalGeneration` | Command A Vision | T + I<sup>+</sup> | `CohereLabs/command-a-vision-07-2025`, etc. | | ✅︎ |
 | `DeepseekVLV2ForCausalLM` | DeepSeek-VL2 | T + I<sup>+</sup> | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2`, etc. | | ✅︎ |
 | `DeepseekOCRForCausalLM` | DeepSeek-OCR | T + I<sup>+</sup> | `deepseek-ai/DeepSeek-OCR`, etc. | ✅︎ | ✅︎ |

--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -179,6 +179,33 @@ def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
    )
+# Cheers
+def run_cheers(questions: list[str], modality: str) -> ModelRequestData:
+    """Build the engine arguments and prompts for the Cheers example.
+
+    Each question is wrapped in a three-turn chat template (system, user,
+    assistant) in which the user turn starts with a single ``<|image_pad|>``
+    placeholder. Only the ``"image"`` modality is accepted.
+    """
+    assert modality == "image"
+    # The system and assistant turns are the same for every question.
+    system_turn = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+    assistant_turn = "<|im_start|>assistant\n"
+    prompts = []
+    for question in questions:
+        user_turn = f"<|im_start|>user\n<|image_pad|>{question}<|im_end|>\n"
+        prompts.append(system_turn + user_turn + assistant_turn)
+    return ModelRequestData(
+        engine_args=EngineArgs(
+            model="ai9stars/Cheers",
+            trust_remote_code=True,
+            max_model_len=4096,
+            # One image per prompt, matching the single placeholder above.
+            limit_mm_per_prompt={modality: 1},
+        ),
+        prompts=prompts,
+    )
 def run_command_a_vision(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
@@ -2140,6 +2167,7 @@ model_example_map = {
    "aria": run_aria,
    "aya_vision": run_aya_vision,
    "bagel": run_bagel,
+    "cheers": run_cheers,
    "bee": run_bee,
    "blip-2": run_blip2,
    "chameleon": run_chameleon,

--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -766,6 +766,14 @@ _MULTIMODAL_EXAMPLE_MODELS = {
        extras={"6b": "Salesforce/blip2-opt-6.7b"},
    ),
    "ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"),
+    "Cheers": _HfExamplesInfo(
+        "ai9stars/Cheers",
+        trust_remote_code=True,
+    ),
+    "CheersForConditionalGeneration": _HfExamplesInfo(
+        "ai9stars/Cheers",
+        trust_remote_code=True,
+    ),
    "Cohere2VisionForConditionalGeneration": _HfExamplesInfo(
        "CohereLabs/command-a-vision-07-2025"
    ),

--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -1203,6 +1203,7 @@ class ModelConfig:
            "gemma3",
            "molmo2",
            "paligemma",
+            "umm",
        )
        if not hasattr(self.hf_config, "model_type"):
            return False

--- a/vllm/model_executor/models/cheers.py
+++ b/vllm/model_executor/models/cheers.py
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -350,6 +350,8 @@ _MULTIMODAL_MODELS = {
        "chameleon",
        "ChameleonForConditionalGeneration",
    ),
+    "Cheers": ("cheers", "CheersForConditionalGeneration"),
+    "CheersForConditionalGeneration": ("cheers", "CheersForConditionalGeneration"),
    "Cohere2VisionForConditionalGeneration": (
        "cohere2_vision",
        "Cohere2VisionForConditionalGeneration",

--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -80,6 +80,7 @@ class LazyConfigDict(dict):
 _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict(
    afmoe="AfmoeConfig",
    bagel="BagelConfig",
+    umm="CheersConfig",
    chatglm="ChatGLMConfig",
    colmodernvbert="ColModernVBertConfig",
    colpali="ColPaliConfig",

--- a/vllm/transformers_utils/configs/__init__.py
+++ b/vllm/transformers_utils/configs/__init__.py
@@ -18,6 +18,7 @@ _CLASS_TO_MODULE: dict[str, str] = {
    "AfmoeConfig": "vllm.transformers_utils.configs.afmoe",
    "AXK1Config": "vllm.transformers_utils.configs.AXK1",
    "BagelConfig": "vllm.transformers_utils.configs.bagel",
+    "CheersConfig": "vllm.transformers_utils.configs.cheers",
    "ChatGLMConfig": "vllm.transformers_utils.configs.chatglm",
    "ColModernVBertConfig": "vllm.transformers_utils.configs.colmodernvbert",
    "ColPaliConfig": "vllm.transformers_utils.configs.colpali",
@@ -75,6 +76,7 @@ __all__ = [
    "AfmoeConfig",
    "AXK1Config",
    "BagelConfig",
+    "CheersConfig",
    "ChatGLMConfig",
    "ColModernVBertConfig",
    "ColPaliConfig",

--- a/vllm/transformers_utils/configs/cheers.py
+++ b/vllm/transformers_utils/configs/cheers.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from transformers import PretrainedConfig, SiglipVisionConfig
+from transformers.modeling_rope_utils import rope_config_validation
+class CheersTextConfig(PretrainedConfig):
+    """Qwen2-based text config with Cheers-specific defaults.
+
+    Stores the language-model hyperparameters given to ``__init__`` as
+    attributes, derives ``sliding_window`` and ``layer_types`` from them,
+    and forwards ``tie_word_embeddings`` plus any extra keyword arguments
+    to ``PretrainedConfig.__init__``.
+    """
+    # Same model_type string as the top-level CheersConfig in this file.
+    model_type = "umm"
+    # NOTE(review): presumably the key under which this sub-config is nested
+    # in the parent config - confirm against transformers' PretrainedConfig.
+    base_config_key = "text_config"
+    def __init__(
+        self,
+        vocab_size=152064,
+        hidden_size=3584,
+        intermediate_size=18944,
+        num_hidden_layers=28,
+        num_attention_heads=28,
+        num_key_value_heads=4,
+        hidden_act="silu",
+        max_position_embeddings=131072,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=1000000.0,
+        rope_scaling=None,
+        use_sliding_window=False,
+        sliding_window=131072,
+        max_window_layers=28,
+        layer_types=None,
+        attention_dropout=0.0,
+        **kwargs,
+    ):
+        """Initialize the text config.
+
+        Most arguments are stored unchanged under the same attribute name.
+        The ones with extra handling are:
+
+        Args:
+            num_key_value_heads: Falls back to ``num_attention_heads`` when
+                ``None``.
+            sliding_window: Stored only when ``use_sliding_window`` is
+                truthy; otherwise the attribute is set to ``None``.
+            rope_scaling: If it is a mapping with a ``"type"`` key, that
+                value is also written to ``"rope_type"`` on the same
+                (caller-supplied) object.
+            layer_types: When ``None``, a per-layer list is generated from
+                ``sliding_window`` and ``max_window_layers``.
+            tie_word_embeddings: Forwarded to ``PretrainedConfig.__init__``.
+            **kwargs: Forwarded to ``PretrainedConfig.__init__``.
+        """
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.use_sliding_window = use_sliding_window
+        # The window size is dropped entirely when sliding attention is off.
+        self.sliding_window = sliding_window if self.use_sliding_window else None
+        self.max_window_layers = max_window_layers
+        # No explicit KV head count: use one KV head per attention head.
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_dropout = attention_dropout
+        # Mirror the "type" key to "rope_type" before validation runs.
+        # This writes into the rope_scaling object that was passed in.
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        # Called after the attributes above are set, since it receives the
+        # config itself. NOTE(review): exact checks live in transformers.
+        rope_config_validation(self)
+        self.layer_types = layer_types
+        # Default layout: layers at index >= max_window_layers use sliding
+        # attention, but only when a window is configured; all other layers
+        # use full attention. With the defaults above sliding_window is
+        # None, so every layer is "full_attention".
+        if self.layer_types is None:
+            self.layer_types = [
+                "sliding_attention"
+                if self.sliding_window is not None and i >= self.max_window_layers
+                else "full_attention"
+                for i in range(self.num_hidden_layers)
+            ]
+        # Base-class initialization runs last and receives the remaining
+        # keyword arguments.
+        super().__init__(
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+class CheersConfig(PretrainedConfig):
+    """Top-level configuration for the Cheers (UMM) model.
+
+    Holds a ``CheersTextConfig``, a ``SiglipVisionConfig`` and two plain
+    dicts of VAE encoder/decoder options. Sub-configs given as dicts are
+    expanded into config objects; missing (falsy) values are replaced by
+    defaults.
+    """
+    model_type = "umm"
+    def __init__(
+        self,
+        text_config: dict | CheersTextConfig | None = None,
+        vision_representation_config: dict | SiglipVisionConfig | None = None,
+        vae_encoder_config: dict | None = None,
+        vae_decoder_config: dict | None = None,
+        **kwargs,
+    ):
+        """Initialize the config; extra ``kwargs`` go to the base class."""
+        super().__init__(**kwargs)
+        # Text sub-config: expand a dict, default a falsy value, otherwise
+        # keep the object that was passed in.
+        if isinstance(text_config, dict):
+            text_config = CheersTextConfig(**text_config)
+        elif not text_config:
+            text_config = CheersTextConfig()
+        self.text_config = text_config
+        # Vision sub-config: same normalization with SiglipVisionConfig.
+        vision_cfg = vision_representation_config
+        if isinstance(vision_cfg, dict):
+            vision_cfg = SiglipVisionConfig(**vision_cfg)
+        elif not vision_cfg:
+            vision_cfg = SiglipVisionConfig()
+        self.vision_representation_config = vision_cfg
+        # VAE options stay plain dicts; each default is a fresh literal so
+        # the encoder and decoder never share one dict object.
+        self.vae_encoder_config = (
+            vae_encoder_config if vae_encoder_config else {"resolution": 512}
+        )
+        self.vae_decoder_config = (
+            vae_decoder_config if vae_decoder_config else {"resolution": 512}
+        )
+    @property
+    def hidden_size(self) -> int:
+        """Hidden size of the language model, read from ``text_config``."""
+        return self.text_config.hidden_size
--- a/vllm/transformers_utils/processors/__init__.py
+++ b/vllm/transformers_utils/processors/__init__.py
@@ -12,6 +12,7 @@ import importlib
 __all__ = [
    "BagelProcessor",
+    "CheersProcessor",
    "CohereASRProcessor",
    "DeepseekVLV2Processor",
    "FireRedASR2Processor",
@@ -39,6 +40,7 @@ __all__ = [
 _CLASS_TO_MODULE: dict[str, str] = {
    "BagelProcessor": "vllm.transformers_utils.processors.bagel",
+    "CheersProcessor": "vllm.transformers_utils.processors.cheers",
    "CohereASRProcessor": "vllm.transformers_utils.processors.cohere_asr",
    "DeepseekVLV2Processor": "vllm.transformers_utils.processors.deepseek_vl2",
    "FireRedASR2Processor": "vllm.transformers_utils.processors.fireredasr2",

--- a/vllm/transformers_utils/processors/cheers.py
+++ b/vllm/transformers_utils/processors/cheers.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Cheers (UMM) processor for image and text inputs."""
+from transformers import AutoProcessor
+from transformers.feature_extraction_utils import BatchFeature
+from transformers.image_utils import ImageInput
+from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
+from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
+class CheersProcessorKwargs(ProcessingKwargs, total=False):  # type: ignore[call-arg]
+    """Keyword-argument schema for ``CheersProcessor`` calls.
+
+    The only default declared here makes the image processor return
+    PyTorch tensors.
+    """
+    _defaults = {"images_kwargs": {"return_tensors": "pt"}}
+class CheersProcessor(ProcessorMixin):
+    """
+    Constructs a Cheers processor which wraps a
+    SigLIP image processor and a Qwen2 tokenizer.
+    """
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = "AutoTokenizer"
+    def __call__(
+        self,
+        text: TextInput
+        | PreTokenizedInput
+        | list[TextInput]
+        | list[PreTokenizedInput]
+        | None = None,
+        images: ImageInput | None = None,
+        **kwargs: Unpack[CheersProcessorKwargs],
+    ):
+        """Run the image processor and the tokenizer and merge their outputs.
+
+        Args:
+            text: Prompt(s) to tokenize. Skipped when ``None``.
+            images: One image, or a list/tuple of images. A list/tuple is
+                processed one image at a time and the per-image
+                ``pixel_values`` (and ``grid_hws``, when the image
+                processor returns it) are concatenated along dim 0.
+                ``None`` or an empty list/tuple means no image input.
+            **kwargs: Call-time options, merged with the defaults of
+                ``CheersProcessorKwargs`` and split per modality.
+
+        Returns:
+            A ``BatchFeature`` with the image-processor outputs and the
+            tokenizer outputs. On a key collision the tokenizer value wins.
+        """
+        output_kwargs = self._merge_kwargs(
+            CheersProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+        images_kwargs = output_kwargs["images_kwargs"]
+        image_inputs = {}
+        if isinstance(images, (list, tuple)):
+            # An empty sequence is treated as "no images": torch.cat raises
+            # on an empty list of tensors.
+            if images:
+                import torch
+                results = [
+                    self.image_processor(img, **images_kwargs) for img in images
+                ]
+                # Assumes per-image outputs are tensors ("return_tensors"
+                # defaults to "pt" in CheersProcessorKwargs).
+                image_inputs = {
+                    "pixel_values": torch.cat(
+                        [res["pixel_values"] for res in results], dim=0
+                    ),
+                }
+                grid_hws = [res["grid_hws"] for res in results if "grid_hws" in res]
+                if grid_hws:
+                    image_inputs["grid_hws"] = torch.cat(grid_hws, dim=0)
+        elif images is not None:
+            # Single image: pass the image-processor output through as is.
+            image_inputs = self.image_processor(images, **images_kwargs)
+        text_inputs = (
+            self.tokenizer(text, **output_kwargs["text_kwargs"])
+            if text is not None
+            else {}
+        )
+        return BatchFeature(data={**image_inputs, **text_inputs})
+    def batch_decode(self, *args, **kwargs):
+        """Forward all arguments to the tokenizer's ``batch_decode``."""
+        return self.tokenizer.batch_decode(*args, **kwargs)
+    def decode(self, *args, **kwargs):
+        """Forward all arguments to the tokenizer's ``decode``."""
+        return self.tokenizer.decode(*args, **kwargs)
+    @property
+    def model_input_names(self):
+        """Tokenizer input names followed by image-processor input names.
+
+        Duplicates are removed while keeping first-seen order.
+        """
+        tokenizer_input_names = self.tokenizer.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
+        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+# Register the class with transformers' AutoProcessor under the key
+# "CheersProcessor". This is a module-level statement, so it runs whenever
+# this module is imported.
+AutoProcessor.register("CheersProcessor", CheersProcessor)