[Bugfix] Fix prompt format of GLM4V (#14539)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>

[Bugfix] Fix prompt format of GLM4V (#14539)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
f53a0586 · Cyrus Leung · GitHub · b1cc4dfe · f53a0586 · f53a0586
Unverified commit f53a0586, authored by Cyrus Leung on Mar 13, 2025 and committed by GitHub on Mar 13, 2025.
7 changed files
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -254,13 +254,21 @@ VLM_TEST_SETTINGS = {
    "glm4v": VLMTestInfo(
        models=["THUDM/glm-4v-9b"],
        test_type=VLMTestType.IMAGE,
-        prompt_formatter=identity,
+        prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>",  # noqa: E501
-        img_idx_to_prompt=lambda idx: "",
+        single_image_prompts=IMAGE_ASSETS.prompts({
+            "stop_sign": "<|begin_of_image|><|endoftext|><|end_of_image|>What's the content in the center of the image?",  # noqa: E501
+            "cherry_blossom": "<|begin_of_image|><|endoftext|><|end_of_image|>What is the season?",  # noqa: E501
+        }),
        max_model_len=2048,
        max_num_seqs=2,
        dtype="bfloat16",
        get_stop_token_ids=lambda tok: [151329, 151336, 151338],
-        patch_hf_runner=model_utils.glm_patch_hf_runner,
+        patch_hf_runner=model_utils.glm4v_patch_hf_runner,
+        # The image embeddings match with HF but the outputs of the language
+        # decoder are only consistent up to 2 decimal places.
+        # So, we need to reduce the number of tokens for the test to pass.
+        max_tokens=8,
+        num_logprobs=10,
        marks=[large_gpu_mark(min_gb=32)],
    ),
    "h2ovl": VLMTestInfo(

--- a/tests/models/decoder_only/vision_language/vlm_utils/core.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/core.py
@@ -61,7 +61,9 @@ def run_test(
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).
-    vllm_runner_kwargs_: dict[str, Any] = {}
+    vllm_runner_kwargs_: dict[str, Any] = {
+        "disable_mm_preprocessor_cache": True,
+    }
    if model_info.tokenizer:
        vllm_runner_kwargs_["tokenizer"] = model_info.tokenizer
    if model_info.tokenizer_mode:

--- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
@@ -316,8 +316,8 @@ def gemma3_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    return hf_model
-def glm_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
+def glm4v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
-    """Patches and returns an instance of the HfRunner to use for GLM4."""
+    """Patches and returns an instance of the HfRunner to use for GLM4V."""
    hf_processor = hf_model.processor
    patch_padding_side(hf_processor)
@@ -325,12 +325,20 @@ def glm_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
        if images is None:
            return hf_processor(*args, **kwargs)
+        images = [images] if isinstance(images, Image) else images
+        contents = re.findall(
+            r"<\|begin_of_image\|><\|endoftext\|><\|end_of_image\|>(.*?)<\|assistant\|>",
+            text,
+        )
+        assert len(contents) == len(images)
        return hf_processor.apply_chat_template(
            [{
                "role": "user",
-                "image": images,
+                "image": image,
-                "content": text
+                "content": content
-            }],
+            } for image, content in zip(images, contents)],
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,

--- a/vllm/config.py
+++ b/vllm/config.py
@@ -286,14 +286,18 @@ class ModelConfig:
        if rope_scaling is not None:
            hf_override: dict[str, Any] = {"rope_scaling": rope_scaling}
            hf_overrides_kw.update(hf_override)
-            msg = ("`--rope-scaling` will be removed in a future release. "
+            hf_overrides_str = json.dumps(hf_override)
-                   f"'Please instead use `--hf-overrides '{hf_override!r}'`")
+            msg = (
+                "`--rope-scaling` will be removed in a future release. "
+                f"'Please instead use `--hf-overrides '{hf_overrides_str}'`")
            warnings.warn(DeprecationWarning(msg), stacklevel=2)
        if rope_theta is not None:
            hf_override = {"rope_theta": rope_theta}
            hf_overrides_kw.update(hf_override)
-            msg = ("`--rope-theta` will be removed in a future release. "
+            hf_overrides_str = json.dumps(hf_override)
-                   f"'Please instead use `--hf-overrides '{hf_override!r}'`")
+            msg = (
+                "`--rope-theta` will be removed in a future release. "
+                f"'Please instead use `--hf-overrides '{hf_overrides_str}'`")
            warnings.warn(DeprecationWarning(msg), stacklevel=2)
        self.maybe_pull_model_tokenizer_for_s3(model, tokenizer)

--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -403,7 +403,9 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
        hf_config = self._model_config.hf_config
        model_type = hf_config.model_type
-        if modality in ["image", "image_embeds"]:
+        if modality in ("image", "image_embeds"):
+            if model_type == "chatglm":
+                return "<|begin_of_image|><|endoftext|><|end_of_image|>"
            if model_type == "phi3_v":
                # Workaround since this token is not defined in the tokenizer
                return f"<|image_{current_count}|>"
@@ -411,8 +413,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
                return "<|endoftext10|>"  # 200010 (see vocab.json in hf model)
            if model_type in ("minicpmo", "minicpmv"):
                return "(<image>./</image>)"
-            if model_type in ("blip-2", "chatglm", "fuyu", "paligemma",
+            if model_type in ("blip-2", "fuyu", "paligemma", "pixtral"):
-                              "pixtral"):
                # These models do not use image tokens in the prompt
                return None
            if model_type == "qwen":

--- a/vllm/model_executor/models/chatglm.py
+++ b/vllm/model_executor/models/chatglm.py
@@ -2,6 +2,7 @@
 # Adapted from
 # https://github.com/THUDM/ChatGLM2-6B
 """Inference-only ChatGLM model compatible with THUDM weights."""
+import json
 from typing import Iterable, Optional, Set, Tuple, Union
 import torch
@@ -463,7 +464,7 @@ class ChatGLMForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP):
                "The configuration of this model indicates that it supports "
                "vision inputs, but you instantiated the text-only version "
                "of this model. Please use the vision model by setting "
-                f"`--hf-overrides {hf_overrides!r}`")
+                f"`--hf-overrides '{json.dumps(hf_overrides)}'`")
        super().__init__(vllm_config=vllm_config, prefix=prefix)

--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -5,7 +5,7 @@
 # Copyright (c) Alibaba Cloud.
 # LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE
 """Inference-only QWen model compatible with HuggingFace weights."""
+import json
 from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union
 import torch
@@ -354,7 +354,7 @@ class QWenLMHeadModel(QWenBaseModel, SupportsPP, SupportsLoRA):
                "The configuration of this model indicates that it supports "
                "vision inputs, but you instantiated the text-only version "
                "of this model. Please use the vision model by setting "
-                f"`--hf-overrides {hf_overrides!r}`")
+                f"`--hf-overrides '{json.dumps(hf_overrides)}'`")
        super().__init__(vllm_config=vllm_config, prefix=prefix)