Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
f53a0586
Unverified
Commit
f53a0586
authored
Mar 13, 2025
by
Cyrus Leung
Committed by
GitHub
Mar 13, 2025
Browse files
[Bugfix] Fix prompt format of GLM4V (#14539)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
b1cc4dfe
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
43 additions
and
19 deletions
+43
-19
tests/models/decoder_only/vision_language/test_models.py
tests/models/decoder_only/vision_language/test_models.py
+11
-3
tests/models/decoder_only/vision_language/vlm_utils/core.py
tests/models/decoder_only/vision_language/vlm_utils/core.py
+3
-1
tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
...els/decoder_only/vision_language/vlm_utils/model_utils.py
+13
-5
vllm/config.py
vllm/config.py
+8
-4
vllm/entrypoints/chat_utils.py
vllm/entrypoints/chat_utils.py
+4
-3
vllm/model_executor/models/chatglm.py
vllm/model_executor/models/chatglm.py
+2
-1
vllm/model_executor/models/qwen.py
vllm/model_executor/models/qwen.py
+2
-2
No files found.
tests/models/decoder_only/vision_language/test_models.py
View file @
f53a0586
...
...
@@ -254,13 +254,21 @@ VLM_TEST_SETTINGS = {
"glm4v"
:
VLMTestInfo
(
models
=
[
"THUDM/glm-4v-9b"
],
test_type
=
VLMTestType
.
IMAGE
,
prompt_formatter
=
identity
,
img_idx_to_prompt
=
lambda
idx
:
""
,
prompt_formatter
=
lambda
img_prompt
:
f
"<|user|>
\n
{
img_prompt
}
<|assistant|>"
,
# noqa: E501
single_image_prompts
=
IMAGE_ASSETS
.
prompts
({
"stop_sign"
:
"<|begin_of_image|><|endoftext|><|end_of_image|>What's the content in the center of the image?"
,
# noqa: E501
"cherry_blossom"
:
"<|begin_of_image|><|endoftext|><|end_of_image|>What is the season?"
,
# noqa: E501
}),
max_model_len
=
2048
,
max_num_seqs
=
2
,
dtype
=
"bfloat16"
,
get_stop_token_ids
=
lambda
tok
:
[
151329
,
151336
,
151338
],
patch_hf_runner
=
model_utils
.
glm_patch_hf_runner
,
patch_hf_runner
=
model_utils
.
glm4v_patch_hf_runner
,
# The image embeddings match with HF but the outputs of the language
# decoder are only consistent up to 2 decimal places.
# So, we need to reduce the number of tokens for the test to pass.
max_tokens
=
8
,
num_logprobs
=
10
,
marks
=
[
large_gpu_mark
(
min_gb
=
32
)],
),
"h2ovl"
:
VLMTestInfo
(
...
...
tests/models/decoder_only/vision_language/vlm_utils/core.py
View file @
f53a0586
...
...
@@ -61,7 +61,9 @@ def run_test(
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
vllm_runner_kwargs_
:
dict
[
str
,
Any
]
=
{}
vllm_runner_kwargs_
:
dict
[
str
,
Any
]
=
{
"disable_mm_preprocessor_cache"
:
True
,
}
if
model_info
.
tokenizer
:
vllm_runner_kwargs_
[
"tokenizer"
]
=
model_info
.
tokenizer
if
model_info
.
tokenizer_mode
:
...
...
tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
View file @
f53a0586
...
...
@@ -316,8 +316,8 @@ def gemma3_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
return
hf_model
def
glm_patch_hf_runner
(
hf_model
:
HfRunner
)
->
HfRunner
:
"""Patches and returns an instance of the HfRunner to use for GLM4."""
def
glm
4v
_patch_hf_runner
(
hf_model
:
HfRunner
)
->
HfRunner
:
"""Patches and returns an instance of the HfRunner to use for GLM4
V
."""
hf_processor
=
hf_model
.
processor
patch_padding_side
(
hf_processor
)
...
...
@@ -325,12 +325,20 @@ def glm_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
if
images
is
None
:
return
hf_processor
(
*
args
,
**
kwargs
)
images
=
[
images
]
if
isinstance
(
images
,
Image
)
else
images
contents
=
re
.
findall
(
r
"<\|begin_of_image\|><\|endoftext\|><\|end_of_image\|>(.*?)<\|assistant\|>"
,
text
,
)
assert
len
(
contents
)
==
len
(
images
)
return
hf_processor
.
apply_chat_template
(
[{
"role"
:
"user"
,
"image"
:
image
s
,
"content"
:
te
x
t
}],
"image"
:
image
,
"content"
:
con
te
n
t
}
for
image
,
content
in
zip
(
images
,
contents
)
],
add_generation_prompt
=
True
,
tokenize
=
True
,
return_dict
=
True
,
...
...
vllm/config.py
View file @
f53a0586
...
...
@@ -286,14 +286,18 @@ class ModelConfig:
if
rope_scaling
is
not
None
:
hf_override
:
dict
[
str
,
Any
]
=
{
"rope_scaling"
:
rope_scaling
}
hf_overrides_kw
.
update
(
hf_override
)
msg
=
(
"`--rope-scaling` will be removed in a future release. "
f
"'Please instead use `--hf-overrides '
{
hf_override
!
r
}
'`"
)
hf_overrides_str
=
json
.
dumps
(
hf_overrides
)
msg
=
(
"`--rope-scaling` will be removed in a future release. "
f
"'Please instead use `--hf-overrides '
{
hf_overrides_str
}
'`"
)
warnings
.
warn
(
DeprecationWarning
(
msg
),
stacklevel
=
2
)
if
rope_theta
is
not
None
:
hf_override
=
{
"rope_theta"
:
rope_theta
}
hf_overrides_kw
.
update
(
hf_override
)
msg
=
(
"`--rope-theta` will be removed in a future release. "
f
"'Please instead use `--hf-overrides '
{
hf_override
!
r
}
'`"
)
hf_overrides_str
=
json
.
dumps
(
hf_overrides
)
msg
=
(
"`--rope-theta` will be removed in a future release. "
f
"'Please instead use `--hf-overrides '
{
hf_overrides_str
}
'`"
)
warnings
.
warn
(
DeprecationWarning
(
msg
),
stacklevel
=
2
)
self
.
maybe_pull_model_tokenizer_for_s3
(
model
,
tokenizer
)
...
...
vllm/entrypoints/chat_utils.py
View file @
f53a0586
...
...
@@ -403,7 +403,9 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
hf_config
=
self
.
_model_config
.
hf_config
model_type
=
hf_config
.
model_type
if
modality
in
[
"image"
,
"image_embeds"
]:
if
modality
in
(
"image"
,
"image_embeds"
):
if
model_type
==
"chatglm"
:
return
"<|begin_of_image|><|endoftext|><|end_of_image|>"
if
model_type
==
"phi3_v"
:
# Workaround since this token is not defined in the tokenizer
return
f
"<|image_
{
current_count
}
|>"
...
...
@@ -411,8 +413,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
return
"<|endoftext10|>"
# 200010 (see vocab.json in hf model)
if
model_type
in
(
"minicpmo"
,
"minicpmv"
):
return
"(<image>./</image>)"
if
model_type
in
(
"blip-2"
,
"chatglm"
,
"fuyu"
,
"paligemma"
,
"pixtral"
):
if
model_type
in
(
"blip-2"
,
"fuyu"
,
"paligemma"
,
"pixtral"
):
# These models do not use image tokens in the prompt
return
None
if
model_type
==
"qwen"
:
...
...
vllm/model_executor/models/chatglm.py
View file @
f53a0586
...
...
@@ -2,6 +2,7 @@
# Adapted from
# https://github.com/THUDM/ChatGLM2-6B
"""Inference-only ChatGLM model compatible with THUDM weights."""
import
json
from
typing
import
Iterable
,
Optional
,
Set
,
Tuple
,
Union
import
torch
...
...
@@ -463,7 +464,7 @@ class ChatGLMForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP):
"The configuration of this model indicates that it supports "
"vision inputs, but you instantiated the text-only version "
"of this model. Please use the vision model by setting "
f
"`--hf-overrides
{
hf_overrides
!
r
}
`"
)
f
"`--hf-overrides
'
{
json
.
dumps
(
hf_overrides
)
}
'
`"
)
super
().
__init__
(
vllm_config
=
vllm_config
,
prefix
=
prefix
)
...
...
vllm/model_executor/models/qwen.py
View file @
f53a0586
...
...
@@ -5,7 +5,7 @@
# Copyright (c) Alibaba Cloud.
# LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE
"""Inference-only QWen model compatible with HuggingFace weights."""
import
json
from
typing
import
Any
,
Dict
,
Iterable
,
Optional
,
Set
,
Tuple
,
Union
import
torch
...
...
@@ -354,7 +354,7 @@ class QWenLMHeadModel(QWenBaseModel, SupportsPP, SupportsLoRA):
"The configuration of this model indicates that it supports "
"vision inputs, but you instantiated the text-only version "
"of this model. Please use the vision model by setting "
f
"`--hf-overrides
{
hf_overrides
!
r
}
`"
)
f
"`--hf-overrides
'
{
json
.
dumps
(
hf_overrides
)
}
'
`"
)
super
().
__init__
(
vllm_config
=
vllm_config
,
prefix
=
prefix
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment