Unverified Commit f53a0586 authored by Cyrus Leung's avatar Cyrus Leung Committed by GitHub
Browse files

[Bugfix] Fix prompt format of GLM4V (#14539)


Signed-off-by: default avatarDarkLight1337 <tlleungac@connect.ust.hk>
parent b1cc4dfe
...@@ -254,13 +254,21 @@ VLM_TEST_SETTINGS = { ...@@ -254,13 +254,21 @@ VLM_TEST_SETTINGS = {
"glm4v": VLMTestInfo( "glm4v": VLMTestInfo(
models=["THUDM/glm-4v-9b"], models=["THUDM/glm-4v-9b"],
test_type=VLMTestType.IMAGE, test_type=VLMTestType.IMAGE,
prompt_formatter=identity, prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>", # noqa: E501
img_idx_to_prompt=lambda idx: "", single_image_prompts=IMAGE_ASSETS.prompts({
"stop_sign": "<|begin_of_image|><|endoftext|><|end_of_image|>What's the content in the center of the image?", # noqa: E501
"cherry_blossom": "<|begin_of_image|><|endoftext|><|end_of_image|>What is the season?", # noqa: E501
}),
max_model_len=2048, max_model_len=2048,
max_num_seqs=2, max_num_seqs=2,
dtype="bfloat16", dtype="bfloat16",
get_stop_token_ids=lambda tok: [151329, 151336, 151338], get_stop_token_ids=lambda tok: [151329, 151336, 151338],
patch_hf_runner=model_utils.glm_patch_hf_runner, patch_hf_runner=model_utils.glm4v_patch_hf_runner,
# The image embeddings match with HF but the outputs of the language
# decoder are only consistent up to 2 decimal places.
# So, we need to reduce the number of tokens for the test to pass.
max_tokens=8,
num_logprobs=10,
marks=[large_gpu_mark(min_gb=32)], marks=[large_gpu_mark(min_gb=32)],
), ),
"h2ovl": VLMTestInfo( "h2ovl": VLMTestInfo(
......
...@@ -61,7 +61,9 @@ def run_test( ...@@ -61,7 +61,9 @@ def run_test(
# if we run HF first, the cuda initialization will be done and it # if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method). # will hurt multiprocessing backend with fork method (the default method).
vllm_runner_kwargs_: dict[str, Any] = {} vllm_runner_kwargs_: dict[str, Any] = {
"disable_mm_preprocessor_cache": True,
}
if model_info.tokenizer: if model_info.tokenizer:
vllm_runner_kwargs_["tokenizer"] = model_info.tokenizer vllm_runner_kwargs_["tokenizer"] = model_info.tokenizer
if model_info.tokenizer_mode: if model_info.tokenizer_mode:
......
...@@ -316,8 +316,8 @@ def gemma3_patch_hf_runner(hf_model: HfRunner) -> HfRunner: ...@@ -316,8 +316,8 @@ def gemma3_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
return hf_model return hf_model
def glm_patch_hf_runner(hf_model: HfRunner) -> HfRunner: def glm4v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
"""Patches and returns an instance of the HfRunner to use for GLM4.""" """Patches and returns an instance of the HfRunner to use for GLM4V."""
hf_processor = hf_model.processor hf_processor = hf_model.processor
patch_padding_side(hf_processor) patch_padding_side(hf_processor)
...@@ -325,12 +325,20 @@ def glm_patch_hf_runner(hf_model: HfRunner) -> HfRunner: ...@@ -325,12 +325,20 @@ def glm_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
if images is None: if images is None:
return hf_processor(*args, **kwargs) return hf_processor(*args, **kwargs)
images = [images] if isinstance(images, Image) else images
contents = re.findall(
r"<\|begin_of_image\|><\|endoftext\|><\|end_of_image\|>(.*?)<\|assistant\|>",
text,
)
assert len(contents) == len(images)
return hf_processor.apply_chat_template( return hf_processor.apply_chat_template(
[{ [{
"role": "user", "role": "user",
"image": images, "image": image,
"content": text "content": content
}], } for image, content in zip(images, contents)],
add_generation_prompt=True, add_generation_prompt=True,
tokenize=True, tokenize=True,
return_dict=True, return_dict=True,
......
...@@ -286,14 +286,18 @@ class ModelConfig: ...@@ -286,14 +286,18 @@ class ModelConfig:
if rope_scaling is not None: if rope_scaling is not None:
hf_override: dict[str, Any] = {"rope_scaling": rope_scaling} hf_override: dict[str, Any] = {"rope_scaling": rope_scaling}
hf_overrides_kw.update(hf_override) hf_overrides_kw.update(hf_override)
msg = ("`--rope-scaling` will be removed in a future release. " hf_overrides_str = json.dumps(hf_overrides)
f"'Please instead use `--hf-overrides '{hf_override!r}'`") msg = (
"`--rope-scaling` will be removed in a future release. "
f"'Please instead use `--hf-overrides '{hf_overrides_str}'`")
warnings.warn(DeprecationWarning(msg), stacklevel=2) warnings.warn(DeprecationWarning(msg), stacklevel=2)
if rope_theta is not None: if rope_theta is not None:
hf_override = {"rope_theta": rope_theta} hf_override = {"rope_theta": rope_theta}
hf_overrides_kw.update(hf_override) hf_overrides_kw.update(hf_override)
msg = ("`--rope-theta` will be removed in a future release. " hf_overrides_str = json.dumps(hf_overrides)
f"'Please instead use `--hf-overrides '{hf_override!r}'`") msg = (
"`--rope-theta` will be removed in a future release. "
f"'Please instead use `--hf-overrides '{hf_overrides_str}'`")
warnings.warn(DeprecationWarning(msg), stacklevel=2) warnings.warn(DeprecationWarning(msg), stacklevel=2)
self.maybe_pull_model_tokenizer_for_s3(model, tokenizer) self.maybe_pull_model_tokenizer_for_s3(model, tokenizer)
......
...@@ -403,7 +403,9 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): ...@@ -403,7 +403,9 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
hf_config = self._model_config.hf_config hf_config = self._model_config.hf_config
model_type = hf_config.model_type model_type = hf_config.model_type
if modality in ["image", "image_embeds"]: if modality in ("image", "image_embeds"):
if model_type == "chatglm":
return "<|begin_of_image|><|endoftext|><|end_of_image|>"
if model_type == "phi3_v": if model_type == "phi3_v":
# Workaround since this token is not defined in the tokenizer # Workaround since this token is not defined in the tokenizer
return f"<|image_{current_count}|>" return f"<|image_{current_count}|>"
...@@ -411,8 +413,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): ...@@ -411,8 +413,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
return "<|endoftext10|>" # 200010 (see vocab.json in hf model) return "<|endoftext10|>" # 200010 (see vocab.json in hf model)
if model_type in ("minicpmo", "minicpmv"): if model_type in ("minicpmo", "minicpmv"):
return "(<image>./</image>)" return "(<image>./</image>)"
if model_type in ("blip-2", "chatglm", "fuyu", "paligemma", if model_type in ("blip-2", "fuyu", "paligemma", "pixtral"):
"pixtral"):
# These models do not use image tokens in the prompt # These models do not use image tokens in the prompt
return None return None
if model_type == "qwen": if model_type == "qwen":
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
# Adapted from # Adapted from
# https://github.com/THUDM/ChatGLM2-6B # https://github.com/THUDM/ChatGLM2-6B
"""Inference-only ChatGLM model compatible with THUDM weights.""" """Inference-only ChatGLM model compatible with THUDM weights."""
import json
from typing import Iterable, Optional, Set, Tuple, Union from typing import Iterable, Optional, Set, Tuple, Union
import torch import torch
...@@ -463,7 +464,7 @@ class ChatGLMForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP): ...@@ -463,7 +464,7 @@ class ChatGLMForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP):
"The configuration of this model indicates that it supports " "The configuration of this model indicates that it supports "
"vision inputs, but you instantiated the text-only version " "vision inputs, but you instantiated the text-only version "
"of this model. Please use the vision model by setting " "of this model. Please use the vision model by setting "
f"`--hf-overrides {hf_overrides!r}`") f"`--hf-overrides '{json.dumps(hf_overrides)}'`")
super().__init__(vllm_config=vllm_config, prefix=prefix) super().__init__(vllm_config=vllm_config, prefix=prefix)
......
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
# Copyright (c) Alibaba Cloud. # Copyright (c) Alibaba Cloud.
# LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE # LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE
"""Inference-only QWen model compatible with HuggingFace weights.""" """Inference-only QWen model compatible with HuggingFace weights."""
import json
from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union
import torch import torch
...@@ -354,7 +354,7 @@ class QWenLMHeadModel(QWenBaseModel, SupportsPP, SupportsLoRA): ...@@ -354,7 +354,7 @@ class QWenLMHeadModel(QWenBaseModel, SupportsPP, SupportsLoRA):
"The configuration of this model indicates that it supports " "The configuration of this model indicates that it supports "
"vision inputs, but you instantiated the text-only version " "vision inputs, but you instantiated the text-only version "
"of this model. Please use the vision model by setting " "of this model. Please use the vision model by setting "
f"`--hf-overrides {hf_overrides!r}`") f"`--hf-overrides '{json.dumps(hf_overrides)}'`")
super().__init__(vllm_config=vllm_config, prefix=prefix) super().__init__(vllm_config=vllm_config, prefix=prefix)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment