sglang / Commits / bcc213df

Commit bcc213df (unverified), authored Feb 16, 2025 by Mick, committed via GitHub on Feb 16, 2025
Parent: 39416e39

Model: Support Qwen 2.5 vl (#3258)

Showing 11 changed files with 1999 additions and 261 deletions (+1999, -261)
docs/references/supported_models.md             +2     -2
python/sglang/lang/chat_template.py             +8     -0
python/sglang/srt/configs/__init__.py           +6     -3
python/sglang/srt/configs/model_config.py       +1     -0
python/sglang/srt/configs/qwen2_5_vl_config.py  +1003  -0
python/sglang/srt/configs/qwen2vl.py            +0     -130
python/sglang/srt/hf_transformers_utils.py      +2     -3
python/sglang/srt/managers/image_processor.py   +217   -122
python/sglang/srt/models/qwen2_5_vl.py          +722   -0
python/sglang/srt/models/qwen2_vl.py            +2     -1
test/srt/test_vision_openai_server.py           +36    -0
docs/references/supported_models.md

@@ -4,7 +4,7 @@
 - Llama / Llama 2 / Llama 3 / Llama 3.1 / Llama 3.2
 - Mistral / Mixtral / Mistral NeMo / Mistral Small 3
 - Gemma / Gemma 2
-- Qwen / Qwen 2 / Qwen 2 MoE / Qwen 2 VL
+- Qwen / Qwen 2 / Qwen 2 MoE / Qwen 2 VL / Qwen 2.5 VL
 - DeepSeek / DeepSeek 2 / [DeepSeek 3](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3)
 - OLMoE
 - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)

@@ -54,7 +54,7 @@ To support a new model in SGLang, you only need to add a single file under [SGLa
 You can learn from existing model implementations and create new files for the new models.
 For most models, you should be able to find a similar model to start with (e.g., starting from Llama).

-## How to Support a New vision LLM
+## How to Support a New vLM

 To support a new vision-language model (vLM) in SGLang, there are several key components in addition to the standard LLM.
python/sglang/lang/chat_template.py

@@ -427,6 +427,8 @@ def match_chat_ml(model_path: str):
    if "tinyllama" in model_path:
        return get_chat_template("chatml")
    # Now the suffix for qwen2 chat model is "instruct"
+   if "qwen" in model_path and "vl" in model_path:
+       return get_chat_template("qwen2-vl")
    if "qwen" in model_path:
        if "vl" in model_path:
            return get_chat_template("qwen2-vl")

@@ -443,6 +445,12 @@ def match_chat_ml(model_path: str):
        return get_chat_template("chatml-llava")


+@register_chat_template_matching_function
+def match_chat_minicpm(model_path: str):
+    if "minicpm" in model_path:
+        return get_chat_template("minicpmv")
+
+
@register_chat_template_matching_function
def match_chat_yi(model_path: str):
    model_path = model_path.lower()
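Aside (not part of the diff): registering a chat template matcher for another model family follows the same pattern as the match_chat_minicpm addition above. A minimal sketch, assuming both helpers are importable from sglang.lang.chat_template as they are used in this file; the "my-vlm" name and template are purely illustrative and not real entries:

from sglang.lang.chat_template import (
    get_chat_template,
    register_chat_template_matching_function,
)


@register_chat_template_matching_function
def match_chat_my_vlm(model_path: str):
    # Hypothetical: route any checkpoint whose path contains "my-vlm"
    # to an already-registered template name.
    if "my-vlm" in model_path.lower():
        return get_chat_template("qwen2-vl")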
python/sglang/srt/configs/__init__.py

from sglang.srt.configs.chatglm import ChatGLMConfig
from sglang.srt.configs.dbrx import DbrxConfig
from sglang.srt.configs.exaone import ExaoneConfig
-from sglang.srt.configs.qwen2vl import Qwen2VLConfig, Qwen2VLVisionConfig
+from sglang.srt.configs.qwen2_5_vl_config import (
+    Qwen2_5_VLConfig,
+    Qwen2_5_VLVisionConfig,
+)

__all__ = [
    "ExaoneConfig",
-   "Qwen2VLConfig",
-   "Qwen2VLVisionConfig",
    "ChatGLMConfig",
    "DbrxConfig",
+   "Qwen2_5_VLConfig",
+   "Qwen2_5_VLVisionConfig",
]
python/sglang/srt/configs/model_config.py

@@ -403,6 +403,7 @@ def is_multimodal_model(model_architectures: List[str]):
        or "LlavaVidForCausalLM" in model_architectures
        or "MllamaForConditionalGeneration" in model_architectures
        or "Qwen2VLForConditionalGeneration" in model_architectures
+       or "Qwen2_5_VLForConditionalGeneration" in model_architectures
        or "MiniCPMV" in model_architectures
    ):
        return True
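Aside (not part of the diff): a quick sanity check of the effect of this one-line addition, assuming is_multimodal_model is importable from this module as shown in the hunk header; the architecture strings normally come from a checkpoint's config.json:

from sglang.srt.configs.model_config import is_multimodal_model

# Qwen2.5-VL checkpoints declare this architecture, so they are now treated as multimodal.
assert is_multimodal_model(["Qwen2_5_VLForConditionalGeneration"])
# A text-only architecture should not match any of the listed multimodal names.
assert not is_multimodal_model(["Qwen2ForCausalLM"])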
python/sglang/srt/configs/qwen2_5_vl_config.py (new file, 0 → 100644)

This diff is collapsed.
python/sglang/srt/configs/qwen2vl.py (deleted, 100644 → 0)

# coding=utf-8
# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team.
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Qwen2VL model configuration"""

import os
from typing import Union

from transformers import PretrainedConfig


class Qwen2VLVisionConfig(PretrainedConfig):
    model_type = "qwen2_vl"

    def __init__(
        self,
        depth=32,
        embed_dim=1280,
        hidden_size=3584,
        hidden_act="quick_gelu",
        mlp_ratio=4,
        num_heads=16,
        in_channels=3,
        patch_size=14,
        spatial_merge_size=2,
        temporal_patch_size=2,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.depth = depth
        self.embed_dim = embed_dim
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.mlp_ratio = mlp_ratio
        self.num_heads = num_heads
        self.in_channels = in_channels
        self.patch_size = patch_size
        self.spatial_merge_size = spatial_merge_size
        self.temporal_patch_size = temporal_patch_size

    @classmethod
    def from_pretrained(
        cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
    ) -> "PretrainedConfig":
        cls._set_token_in_kwargs(kwargs)

        config_dict, kwargs = cls.get_config_dict(
            pretrained_model_name_or_path, **kwargs
        )

        if config_dict.get("model_type") == "qwen2_vl":
            config_dict = config_dict["vision_config"]

        return cls.from_dict(config_dict, **kwargs)


class Qwen2VLConfig(PretrainedConfig):
    model_type = "qwen2_vl"

    def __init__(
        self,
        vocab_size=152064,
        hidden_size=8192,
        intermediate_size=29568,
        num_hidden_layers=80,
        num_attention_heads=64,
        num_key_value_heads=8,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-05,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=1000000.0,
        use_sliding_window=False,
        sliding_window=4096,
        max_window_layers=80,
        attention_dropout=0.0,
        vision_config=None,
        rope_scaling=None,
        **kwargs,
    ):
        if isinstance(vision_config, dict):
            self.vision_config = Qwen2VLVisionConfig(**vision_config)
        elif vision_config is None:
            self.vision_config = Qwen2VLVisionConfig()

        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.use_sliding_window = use_sliding_window
        self.sliding_window = sliding_window
        self.max_window_layers = max_window_layers

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_dropout = attention_dropout
        self.rope_scaling = rope_scaling

        # NOTE(HandH1998): This is necessary for configuring the `rope_type` of qwen2vl models after removing dependencies on vllm.
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            if self.rope_scaling["type"] == "mrope":
                self.rope_scaling["type"] = "default"
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]

        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
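Aside (not part of the diff): the NOTE in the now-deleted config above normalizes rope_scaling so that downstream rotary-embedding code can read a plain rope_type. A small illustration of its effect, using a made-up rope_scaling dict of the shape Qwen2-VL checkpoints typically carry:

# Illustrative only: input/output of the rope_scaling normalization shown above.
rope_scaling = {"type": "mrope", "mrope_section": [16, 24, 24]}

if rope_scaling is not None and "type" in rope_scaling:
    if rope_scaling["type"] == "mrope":
        rope_scaling["type"] = "default"
    rope_scaling["rope_type"] = rope_scaling["type"]

# rope_scaling is now:
# {"type": "default", "mrope_section": [16, 24, 24], "rope_type": "default"}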
python/sglang/srt/hf_transformers_utils.py

@@ -30,16 +30,15 @@ from transformers import (
)
from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES

-from sglang.srt.configs import ChatGLMConfig, DbrxConfig, ExaoneConfig, Qwen2VLConfig
+from sglang.srt.configs import ChatGLMConfig, DbrxConfig, ExaoneConfig, Qwen2_5_VLConfig

_CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
    ChatGLMConfig.model_type: ChatGLMConfig,
    DbrxConfig.model_type: DbrxConfig,
    ExaoneConfig.model_type: ExaoneConfig,
-   Qwen2VLConfig.model_type: Qwen2VLConfig,
+   Qwen2_5_VLConfig.model_type: Qwen2_5_VLConfig,
}

for name, cls in _CONFIG_REGISTRY.items():
    with contextlib.suppress(ValueError):
        AutoConfig.register(name, cls)
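Aside (not part of the diff): the registration loop above uses the standard transformers extension hook, so any config placed in _CONFIG_REGISTRY is resolved by model_type. A minimal sketch of the same mechanism with a made-up config class:

from transformers import AutoConfig, PretrainedConfig


class MyVLMConfig(PretrainedConfig):  # hypothetical, for illustration only
    model_type = "my_vlm"


AutoConfig.register("my_vlm", MyVLMConfig)
# AutoConfig.from_pretrained(...) now resolves checkpoints whose config.json
# declares "model_type": "my_vlm" to MyVLMConfig, which is exactly how the
# Qwen2_5_VLConfig entry in _CONFIG_REGISTRY takes effect.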
python/sglang/srt/managers/image_processor.py

# TODO: also move pad_input_ids into this module
import asyncio
import concurrent.futures
import dataclasses
import logging
import multiprocessing as mp
import os

@@ -8,6 +9,7 @@ from abc import ABC, abstractmethod
from typing import List, Optional, Union

import numpy as np
import PIL
import transformers
from decord import VideoReader, cpu
from PIL import Image

@@ -34,11 +36,22 @@ def init_global_processor(server_args: ServerArgs):
    )


@dataclasses.dataclass
class BaseImageProcessorOutput:
    image_hashes: list[int]
    image_sizes: list[int]
    all_frames: [PIL.Image]
    # input_text, with each frame of video/image represented with a image_token
    input_text: str


class BaseImageProcessor(ABC):
    def __init__(self, hf_config, server_args, _processor):
        self.hf_config = hf_config
        self._processor = _processor
        self.server_args = server_args
        # FIXME: not accurate, model and image specific
        self.NUM_TOKEN_PER_FRAME = 330

        self.executor = concurrent.futures.ProcessPoolExecutor(
            initializer=init_global_processor,

@@ -48,9 +61,128 @@ class BaseImageProcessor(ABC):
        )

    @abstractmethod
-   async def process_images_async(self, image_data, input_text, **kwargs):
+   async def process_images_async(
+       self, image_data, input_text, max_req_input_len, **kwargs
+   ):
        pass

    def get_estimated_frames_list(self, image_data):
        """
        estimate the total frame count from all visual input
        """
        # Before processing inputs
        estimated_frames_list = []
        for image in image_data:
            if isinstance(image, str) and image.startswith("video:"):
                path = image[len("video:") :]
                # Estimate frames for the video
                vr = VideoReader(path, ctx=cpu(0))
                num_frames = len(vr)
            else:
                # For images, each contributes one frame
                num_frames = 1
            estimated_frames_list.append(num_frames)

        return estimated_frames_list

    def encode_video(self, video_path, frame_count_limit=None):
        if not os.path.exists(video_path):
            logger.error(f"Video {video_path} does not exist")
            return []

        if frame_count_limit == 0:
            return []

        def uniform_sample(l, n):
            gap = len(l) / n
            idxs = [int(i * gap + gap / 2) for i in range(n)]
            return [l[i] for i in idxs]

        vr = VideoReader(video_path, ctx=cpu(0))
        sample_fps = round(vr.get_avg_fps() / 1)  # FPS
        frame_idx = [i for i in range(0, len(vr), sample_fps)]
        if frame_count_limit is not None and len(frame_idx) > frame_count_limit:
            frame_idx = uniform_sample(frame_idx, frame_count_limit)
        frames = vr.get_batch(frame_idx).asnumpy()
        frames = [Image.fromarray(v.astype("uint8")) for v in frames]
        return frames
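Aside (not part of the diff): a small worked example of the sampling above, with illustrative numbers. The clip is first thinned to roughly one frame per second, then uniform_sample picks evenly spaced survivors when a frame_count_limit applies:

# Illustrative numbers only: a 300-frame clip at ~30 FPS yields candidate indices
# 0, 30, ..., 270 (10 candidates); with frame_count_limit=4, uniform_sample keeps
# evenly spaced picks from that candidate list.
def uniform_sample(l, n):
    gap = len(l) / n
    idxs = [int(i * gap + gap / 2) for i in range(n)]
    return [l[i] for i in idxs]


candidates = list(range(0, 300, 30))  # [0, 30, 60, ..., 270]
print(uniform_sample(candidates, 4))  # [30, 90, 180, 240]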
    def load_images(
        self,
        max_req_input_len: int,
        input_ids: list,
        image_data,
        image_token: str,
    ) -> BaseImageProcessorOutput:
        """
        Each frame of video/image will be replaced by a single image token
        """
        image_hashes, image_sizes = [], []
        all_frames = []
        new_text_parts = []

        if isinstance(input_ids, list):
            assert len(input_ids) and isinstance(input_ids[0], int)
            input_text = self._processor.tokenizer.decode(input_ids)
        else:
            input_text = input_ids

        text_parts = input_text.split(image_token)

        # roughly calculate the max number of frames under the max_req_input_len limit
        def calculate_max_num_frames() -> int:
            ret = (max_req_input_len - len(input_ids)) // self.NUM_TOKEN_PER_FRAME
            return min(ret, 100)

        MAX_NUM_FRAMES = calculate_max_num_frames()
        estimated_frames_list = self.get_estimated_frames_list(image_data=image_data)
        total_frame_count = sum(estimated_frames_list)
        # a heuristic value, suggesting the maximum fraction of frames to embed from all visual inputs.
        # e.g., 0.1 suggests that 1 frame out of 10 input frames should be used
        scaling_factor = min(1.0, MAX_NUM_FRAMES / total_frame_count)

        # Process each input with allocated frames
        for image_index, (image, estimated_frames) in enumerate(
            zip(image_data, estimated_frames_list)
        ):
            if len(all_frames) >= MAX_NUM_FRAMES:
                frames_to_process = 0
            else:
                frames_to_process = max(1, int(estimated_frames * scaling_factor))

            if frames_to_process == 0:
                frames = []
            else:
                try:
                    if isinstance(image, str) and image.startswith("video:"):
                        path = image[len("video:") :]
                        frames = self.encode_video(
                            path, frame_count_limit=frames_to_process
                        )
                    else:
                        raw_image, _size = load_image(image)
                        frames = [raw_image]
                    if len(frames) == 0:
                        continue
                except FileNotFoundError as e:
                    print(e)
                    return None

                image_sizes += frames[0].size * len(frames)
                image_hashes += [hash(image)] * len(frames)
                all_frames += frames

            new_text_parts.append(text_parts[image_index])
            if frames_to_process != 0:
                new_text_parts.append(image_token * len(frames))
            assert frames_to_process == len(frames)

        new_text_parts.append(text_parts[-1])

        input_text = "".join(new_text_parts)
        return BaseImageProcessorOutput(
            image_hashes, image_sizes, all_frames, input_text
        )
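Aside (not part of the diff): a rough illustration of the frame budgeting performed by load_images above; all numbers are made up:

# Illustrative numbers only: how load_images() allocates frames across inputs.
max_req_input_len = 8192
prompt_len = 1000              # stands in for len(input_ids)
NUM_TOKEN_PER_FRAME = 330      # the heuristic set in BaseImageProcessor.__init__

max_num_frames = min((max_req_input_len - prompt_len) // NUM_TOKEN_PER_FRAME, 100)
# (8192 - 1000) // 330 = 21 frames of budget

estimated_frames_list = [1, 60]  # one still image plus a 60-frame video
scaling_factor = min(1.0, max_num_frames / sum(estimated_frames_list))
# 21 / 61 ≈ 0.344

frames_per_input = [max(1, int(n * scaling_factor)) for n in estimated_frames_list]
# -> [1, 20]: the image keeps its single frame, the video is subsampled to ~20 frames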
class DummyImageProcessor(BaseImageProcessor):
    def __init__(self):

@@ -248,9 +380,9 @@ class MiniCPMVImageProcessor(BaseImageProcessor):
            text=input_text, images=images, return_tensors="pt"
        )
        return {
-           "input_ids": result["input_ids"],
-           "pixel_values": result["pixel_values"],
-           "tgt_sizes": result["tgt_sizes"],
+           "input_ids": result.input_ids,
+           "pixel_values": result.pixel_values,
+           "tgt_sizes": result.tgt_sizes,
        }

    async def _process_images(self, images, input_text):

@@ -278,124 +410,20 @@ class MiniCPMVImageProcessor(BaseImageProcessor):
    ):
        if not image_data:
            return None
        if not isinstance(image_data, list):
            image_data = [image_data]

-       image_hashes, image_sizes = [], []
-       all_frames = []
-
-       # roughly calculate the max number of frames under the max_req_input_len limit
-       def calculate_max_num_frames() -> int:
-           # Model-specific
-           NUM_TOKEN_PER_FRAME = 330
-
-           ret = (max_req_input_len - len(input_ids)) // NUM_TOKEN_PER_FRAME
-           return min(ret, 100)
-
-       MAX_NUM_FRAMES = calculate_max_num_frames()
-       # print(f"MAX_NUM_FRAMES: {MAX_NUM_FRAMES}")
-
-       def get_estimated_frames_list():
-           """
-           estimate the total frame count from all visual input
-           """
-           # Before processing inputs
-           estimated_frames_list = []
-           for image in image_data:
-               if isinstance(image, str) and image.startswith("video:"):
-                   path = image[len("video:") :]
-                   # Estimate frames for the video
-                   vr = VideoReader(path, ctx=cpu(0))
-                   num_frames = len(vr)
-               else:
-                   # For images, each contributes one frame
-                   num_frames = 1
-               estimated_frames_list.append(num_frames)
-
-           return estimated_frames_list
-
-       estimated_frames_list = get_estimated_frames_list()
-       total_frame_count = sum(estimated_frames_list)
-       scaling_factor = min(1.0, MAX_NUM_FRAMES / total_frame_count)
-
-       def encode_video(video_path, frame_count_limit=None):
-           if not os.path.exists(video_path):
-               logger.error(f"Video {video_path} does not exist")
-               return []
-
-           if frame_count_limit == 0:
-               return []
-
-           def uniform_sample(l, n):
-               gap = len(l) / n
-               idxs = [int(i * gap + gap / 2) for i in range(n)]
-               return [l[i] for i in idxs]
-
-           vr = VideoReader(video_path, ctx=cpu(0))
-           sample_fps = round(vr.get_avg_fps() / 1)  # FPS
-           frame_idx = [i for i in range(0, len(vr), sample_fps)]
-           if frame_count_limit is not None and len(frame_idx) > frame_count_limit:
-               frame_idx = uniform_sample(frame_idx, frame_count_limit)
-           frames = vr.get_batch(frame_idx).asnumpy()
-           frames = [Image.fromarray(v.astype("uint8")) for v in frames]
-           return frames
-
-       if isinstance(input_ids, list):
-           assert len(input_ids) and isinstance(input_ids[0], int)
-           input_text = self._processor.tokenizer.decode(input_ids)
-       else:
-           input_text = input_ids
-
-       # MiniCPMV requires each frame of video as a single image token
-       text_parts = input_text.split(self.IMAGE_TOKEN)
-       new_text_parts = []
-
-       # Process each input with allocated frames
-       for image_index, (image, estimated_frames) in enumerate(
-           zip(image_data, estimated_frames_list)
-       ):
-           if len(all_frames) >= MAX_NUM_FRAMES:
-               frames_to_process = 0
-           else:
-               frames_to_process = max(1, int(estimated_frames * scaling_factor))
-
-           if frames_to_process == 0:
-               frames = []
-           else:
-               try:
-                   if isinstance(image, str) and image.startswith("video:"):
-                       path = image[len("video:") :]
-                       frames = encode_video(path, frame_count_limit=frames_to_process)
-                   else:
-                       raw_image, _size = load_image(image)
-                       frames = [raw_image]
-                   if len(frames) == 0:
-                       continue
-               except FileNotFoundError as e:
-                   print(e)
+       base_output = self.load_images(
+           max_req_input_len, input_ids, image_data, self.IMAGE_TOKEN
+       )
+       if base_output is None:
+           return None

-               image_sizes += frames[0].size * len(frames)
-               image_hashes += [hash(image)] * len(frames)
-               all_frames += frames
-
-           assert frames_to_process == len(frames)
-
-           new_text_parts.append(text_parts[image_index])
-           if frames_to_process != 0:
-               new_text_parts.append(self.IMAGE_TOKEN * len(frames))
-
-       new_text_parts.append(text_parts[-1])
-       input_text = "".join(new_text_parts)
-
-       if len(all_frames) == 0:
+       if len(base_output.all_frames) == 0:
            return None
-       res = await self._process_images(images=all_frames, input_text=input_text)
-       pixel_values = res["pixel_values"]
-       tgt_sizes = res["tgt_sizes"]
-       input_ids = res["input_ids"]
+       res = await self._process_images(
+           images=base_output.all_frames, input_text=base_output.input_text
+       )

        # Collect special token ids
        tokenizer = self._processor.tokenizer

@@ -405,10 +433,10 @@ class MiniCPMVImageProcessor(BaseImageProcessor):
        slice_start_id = [tokenizer.slice_start_id]
        slice_end_id = [tokenizer.slice_end_id]
        return {
-           "input_ids": input_ids.flatten().tolist(),
-           "pixel_values": pixel_values,
-           "tgt_sizes": tgt_sizes,
-           "image_hashes": image_hashes,
+           "input_ids": res["input_ids"].flatten().tolist(),
+           "pixel_values": res["pixel_values"],
+           "tgt_sizes": res["tgt_sizes"],
+           "image_hashes": base_output.image_hashes,
            "modalities": request_obj.modalities or ["image"],
            "im_start_id": im_start_id,
            "im_end_id": im_end_id,

@@ -536,13 +564,80 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
        }


class Qwen2_5VLImageProcessor(BaseImageProcessor):
    def __init__(self, hf_config, server_args, _processor):
        super().__init__(hf_config, server_args, _processor)
        self.IMAGE_TOKEN = "<|vision_start|><|image_pad|><|vision_end|>"
        self.IM_START_TOKEN_ID = hf_config.vision_start_token_id
        self.IM_END_TOKEN_ID = hf_config.vision_end_token_id
        self.NUM_TOKEN_PER_FRAME = 770

    @staticmethod
    def _process_images_task(images, input_text):
        result = global_processor.__call__(
            text=input_text, images=images, return_tensors="pt"
        )
        return {
            "input_ids": result.input_ids,
            "pixel_values": result.pixel_values,
            "image_grid_thws": result.image_grid_thw,
        }

    async def _process_images(self, images, input_text) -> dict:
        if self.executor is not None:
            loop = asyncio.get_event_loop()
            return await loop.run_in_executor(
                self.executor,
                Qwen2_5VLImageProcessor._process_images_task,
                images,
                input_text,
            )
        else:
            return self._process_images_task(images, input_text)

    async def process_images_async(
        self,
        image_data: List[Union[str, bytes]],
        input_ids,
        request_obj,
        max_req_input_len,
        *args,
        **kwargs,
    ):
        if not image_data:
            return None
        if isinstance(image_data, str):
            image_data = [image_data]

        image_token = self.IMAGE_TOKEN
        base_output = self.load_images(
            max_req_input_len, input_ids, image_data, image_token
        )
        ret = await self._process_images(base_output.all_frames, base_output.input_text)
        return {
            "input_ids": ret["input_ids"].flatten().tolist(),
            "pixel_values": ret["pixel_values"],
            "image_hashes": base_output.image_hashes,
            "modalities": request_obj.modalities or ["image"],
            "image_grid_thws": ret["image_grid_thws"],
            "im_start_id": self.IM_START_TOKEN_ID,
            "im_end_id": self.IM_END_TOKEN_ID,
        }


def get_image_processor(
    hf_config, server_args: ServerArgs, processor
) -> BaseImageProcessor:
    if "MllamaForConditionalGeneration" in hf_config.architectures:
        return MllamaImageProcessor(hf_config, server_args, processor)
    elif "Qwen2VLForConditionalGeneration" in hf_config.architectures:
-       return Qwen2VLImageProcessor(hf_config, server_args, processor.image_processor)
+       return Qwen2VLImageProcessor(hf_config, server_args, processor)
+   elif "Qwen2_5_VLForConditionalGeneration" in hf_config.architectures:
+       return Qwen2_5VLImageProcessor(hf_config, server_args, processor)
    elif "MiniCPMV" in hf_config.architectures:
        return MiniCPMVImageProcessor(hf_config, server_args, processor)
    else:
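Aside (not part of the diff): a rough sketch of how the Qwen2.5-VL placeholder string used by Qwen2_5VLImageProcessor interacts with load_images; the prompt text and frame count are illustrative:

# Illustrative only. Qwen2_5VLImageProcessor splits the prompt on this placeholder
# and re-inserts one placeholder per decoded frame before handing the text to the
# HuggingFace processor.
IMAGE_TOKEN = "<|vision_start|><|image_pad|><|vision_end|>"

prompt = f"Describe the clip. {IMAGE_TOKEN}"
decoded_frames = 3  # e.g., a short video subsampled to 3 frames

expanded = prompt.replace(IMAGE_TOKEN, IMAGE_TOKEN * decoded_frames, 1)
# expanded now contains three <|vision_start|><|image_pad|><|vision_end|> groups,
# one per frame, mirroring what load_images() assembles via new_text_parts.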
python/sglang/srt/models/qwen2_5_vl.py (new file, 0 → 100644)

This diff is collapsed.
python/sglang/srt/models/qwen2_vl.py

@@ -31,8 +31,9 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
+from transformers import Qwen2VLConfig
+from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLVisionConfig

-from sglang.srt.configs import Qwen2VLConfig, Qwen2VLVisionConfig
from sglang.srt.hf_transformers_utils import get_processor
from sglang.srt.layers.activation import QuickGELU
from sglang.srt.layers.attention.vision import VisionAttention
test/srt/test_vision_openai_server.py

@@ -252,6 +252,18 @@ class TestOpenAIVisionServer(unittest.TestCase):
        print("-" * 30)

        # Add assertions to validate the video response
+       assert "iPod" in video_response or "device" in video_response, video_response
+       assert (
+           "man" in video_response
+           or "person" in video_response
+           or "individual" in video_response
+       ), video_response
+       assert (
+           "present" in video_response
+           or "examine" in video_response
+           or "display" in video_response
+       )
+       assert "black" in video_response or "dark" in video_response
        self.assertIsNotNone(video_response)
        self.assertGreater(len(video_response), 0)

@@ -366,6 +378,30 @@ class TestQWen2VLServer(TestOpenAIVisionServer):
        cls.base_url += "/v1"


+class TestQWen2_5_VLServer(TestOpenAIVisionServer):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "Qwen/Qwen2.5-VL-7B-Instruct"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            api_key=cls.api_key,
+            other_args=[
+                "--chat-template",
+                "qwen2-vl",
+                # FIXME: workaround to chunked prefill within image embeds
+                "--chunked-prefill-size",
+                "10000",
+                "--mem-fraction-static",
+                "0.4",
+            ],
+        )
+        cls.base_url += "/v1"
+
+
class TestQWen2VLServerContextLengthIssue(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
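Aside (not part of the diff): outside the test harness, the same server can be exercised with any OpenAI-compatible client. A minimal sketch; the URL, API key, and image URL are placeholders, and the server is assumed to have been launched roughly as in setUpClass above:

# Assumes a server started along the lines of:
#   python -m sglang.launch_server --model-path Qwen/Qwen2.5-VL-7B-Instruct \
#       --chat-template qwen2-vl --chunked-prefill-size 10000
import openai

client = openai.OpenAI(base_url="http://127.0.0.1:30000/v1", api_key="sk-123456")
response = client.chat.completions.create(
    model="Qwen/Qwen2.5-VL-7B-Instruct",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What is in this image?"},
                {
                    "type": "image_url",
                    "image_url": {"url": "https://example.com/cat.png"},
                },
            ],
        }
    ],
)
print(response.choices[0].message.content)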