change / sglang · Commits · bcc213df

Unverified commit bcc213df, authored Feb 16, 2025 by Mick, committed via GitHub on Feb 16, 2025. Parent: 39416e39.

**Model: Support Qwen 2.5 vl (#3258)**

Showing 11 changed files with 1999 additions and 261 deletions (+1999 −261).
| Changed file | Additions | Deletions |
| --- | --- | --- |
| docs/references/supported_models.md | +2 | −2 |
| python/sglang/lang/chat_template.py | +8 | −0 |
| python/sglang/srt/configs/__init__.py | +6 | −3 |
| python/sglang/srt/configs/model_config.py | +1 | −0 |
| python/sglang/srt/configs/qwen2_5_vl_config.py | +1003 | −0 |
| python/sglang/srt/configs/qwen2vl.py | +0 | −130 |
| python/sglang/srt/hf_transformers_utils.py | +2 | −3 |
| python/sglang/srt/managers/image_processor.py | +217 | −122 |
| python/sglang/srt/models/qwen2_5_vl.py | +722 | −0 |
| python/sglang/srt/models/qwen2_vl.py | +2 | −1 |
| test/srt/test_vision_openai_server.py | +36 | −0 |
## docs/references/supported_models.md (+2, −2)

```diff
@@ -4,7 +4,7 @@
 - Llama / Llama 2 / Llama 3 / Llama 3.1 / Llama 3.2
 - Mistral / Mixtral / Mistral NeMo / Mistral Small 3
 - Gemma / Gemma 2
-- Qwen / Qwen 2 / Qwen 2 MoE / Qwen 2 VL
+- Qwen / Qwen 2 / Qwen 2 MoE / Qwen 2 VL / Qwen 2.5 VL
 - DeepSeek / DeepSeek 2 / [DeepSeek 3](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3)
 - OLMoE
 - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
```

```diff
@@ -54,7 +54,7 @@ To support a new model in SGLang, you only need to add a single file under [SGLa
 You can learn from existing model implementations and create new files for the new models.
 For most models, you should be able to find a similar model to start with (e.g., starting from Llama).

-## How to Support a New vision LLM
+## How to Support a New vLM

 To support a new vision-language model (vLM) in SGLang, there are several key components in addition to the standard LLM.
```
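For orientation, the "single file" the docs refer to lives under `python/sglang/srt/models/`. The skeleton below is a hypothetical sketch of that shape only (class and method names are placeholders, not part of this commit); SGLang discovers the implementation through the module-level `EntryClass` alias that existing model files end with.

```python
# Hypothetical skeleton of a new model file under python/sglang/srt/models/.
from torch import nn


class MyNewModelForCausalLM(nn.Module):
    def __init__(self, config, quant_config=None):
        super().__init__()
        self.config = config
        # Build layers here, usually by adapting a similar existing model.

    def forward(self, input_ids, positions, forward_batch):
        # Compute logits for the scheduled batch.
        raise NotImplementedError

    def load_weights(self, weights):
        # Map checkpoint tensors onto the modules built above.
        raise NotImplementedError


# SGLang picks the model class up via this module-level alias.
EntryClass = MyNewModelForCausalLM
```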
## python/sglang/lang/chat_template.py (+8, −0)

```diff
@@ -427,6 +427,8 @@ def match_chat_ml(model_path: str):
     if "tinyllama" in model_path:
         return get_chat_template("chatml")
     # Now the suffix for qwen2 chat model is "instruct"
+    if "qwen" in model_path and "vl" in model_path:
+        return get_chat_template("qwen2-vl")
     if "qwen" in model_path:
         if "vl" in model_path:
             return get_chat_template("qwen2-vl")
```

```diff
@@ -443,6 +445,12 @@ def match_chat_ml(model_path: str):
         return get_chat_template("chatml-llava")


+@register_chat_template_matching_function
+def match_chat_minicpm(model_path: str):
+    if "minicpm" in model_path:
+        return get_chat_template("minicpmv")
+
+
 @register_chat_template_matching_function
 def match_chat_yi(model_path: str):
     model_path = model_path.lower()
```
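The matching rules above are plain lowercase-substring checks on the model path. A minimal standalone sketch of the behavior these hunks encode (simplified; the real functions live in `python/sglang/lang/chat_template.py` and return template objects via `get_chat_template`):

```python
# Simplified re-statement of the template matching; not the actual API.
def pick_template(model_path: str) -> str:
    model_path = model_path.lower()
    if "qwen" in model_path and "vl" in model_path:
        return "qwen2-vl"
    if "minicpm" in model_path:
        return "minicpmv"
    return "chatml"


assert pick_template("Qwen/Qwen2.5-VL-7B-Instruct") == "qwen2-vl"
assert pick_template("openbmb/MiniCPM-V-2_6") == "minicpmv"
```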
## python/sglang/srt/configs/__init__.py (+6, −3)

```diff
 from sglang.srt.configs.chatglm import ChatGLMConfig
 from sglang.srt.configs.dbrx import DbrxConfig
 from sglang.srt.configs.exaone import ExaoneConfig
-from sglang.srt.configs.qwen2vl import Qwen2VLConfig, Qwen2VLVisionConfig
+from sglang.srt.configs.qwen2_5_vl_config import (
+    Qwen2_5_VLConfig,
+    Qwen2_5_VLVisionConfig,
+)

 __all__ = [
     "ExaoneConfig",
-    "Qwen2VLConfig",
-    "Qwen2VLVisionConfig",
     "ChatGLMConfig",
     "DbrxConfig",
+    "Qwen2_5_VLConfig",
+    "Qwen2_5_VLVisionConfig",
 ]
```
## python/sglang/srt/configs/model_config.py (+1, −0)

```diff
@@ -403,6 +403,7 @@ def is_multimodal_model(model_architectures: List[str]):
         or "LlavaVidForCausalLM" in model_architectures
         or "MllamaForConditionalGeneration" in model_architectures
         or "Qwen2VLForConditionalGeneration" in model_architectures
+        or "Qwen2_5_VLForConditionalGeneration" in model_architectures
         or "MiniCPMV" in model_architectures
     ):
         return True
```
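The strings compared here come from the `architectures` field of a checkpoint's Hugging Face `config.json`. A quick way to inspect that value for the newly supported model (a sketch; assumes a transformers version, or the SGLang registry shown below, that knows the `qwen2_5_vl` model type):

```python
from transformers import AutoConfig

# Qwen2.5-VL checkpoints declare this architecture in config.json,
# which is exactly what is_multimodal_model() matches on.
cfg = AutoConfig.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
print(cfg.architectures)  # expected: ['Qwen2_5_VLForConditionalGeneration']
```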
## python/sglang/srt/configs/qwen2_5_vl_config.py (new file, +1003)

Vendored configuration classes for Qwen2.5-VL (`Qwen2_5_VLConfig`, `Qwen2_5_VLVisionConfig`; see the `__init__.py` import above). The diff is collapsed in the web view and is not reproduced here.
## python/sglang/srt/configs/qwen2vl.py (deleted, −130)

This file (shown below as of parent 39416e39) is removed; Qwen2-VL's config now comes from `transformers` instead (see `python/sglang/srt/models/qwen2_vl.py` further down).

```python
# coding=utf-8
# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team.
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Qwen2VL model configuration"""

import os
from typing import Union

from transformers import PretrainedConfig


class Qwen2VLVisionConfig(PretrainedConfig):
    model_type = "qwen2_vl"

    def __init__(
        self,
        depth=32,
        embed_dim=1280,
        hidden_size=3584,
        hidden_act="quick_gelu",
        mlp_ratio=4,
        num_heads=16,
        in_channels=3,
        patch_size=14,
        spatial_merge_size=2,
        temporal_patch_size=2,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.depth = depth
        self.embed_dim = embed_dim
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.mlp_ratio = mlp_ratio
        self.num_heads = num_heads
        self.in_channels = in_channels
        self.patch_size = patch_size
        self.spatial_merge_size = spatial_merge_size
        self.temporal_patch_size = temporal_patch_size

    @classmethod
    def from_pretrained(
        cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
    ) -> "PretrainedConfig":
        cls._set_token_in_kwargs(kwargs)

        config_dict, kwargs = cls.get_config_dict(
            pretrained_model_name_or_path, **kwargs
        )

        if config_dict.get("model_type") == "qwen2_vl":
            config_dict = config_dict["vision_config"]

        return cls.from_dict(config_dict, **kwargs)


class Qwen2VLConfig(PretrainedConfig):
    model_type = "qwen2_vl"

    def __init__(
        self,
        vocab_size=152064,
        hidden_size=8192,
        intermediate_size=29568,
        num_hidden_layers=80,
        num_attention_heads=64,
        num_key_value_heads=8,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-05,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=1000000.0,
        use_sliding_window=False,
        sliding_window=4096,
        max_window_layers=80,
        attention_dropout=0.0,
        vision_config=None,
        rope_scaling=None,
        **kwargs,
    ):
        if isinstance(vision_config, dict):
            self.vision_config = Qwen2VLVisionConfig(**vision_config)
        elif vision_config is None:
            self.vision_config = Qwen2VLVisionConfig()

        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.use_sliding_window = use_sliding_window
        self.sliding_window = sliding_window
        self.max_window_layers = max_window_layers

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_dropout = attention_dropout
        self.rope_scaling = rope_scaling

        # NOTE(HandH1998): This is necessary for configuring the `rope_type` of
        # qwen2vl models after removing dependencies on vllm.
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            if self.rope_scaling["type"] == "mrope":
                self.rope_scaling["type"] = "default"
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]

        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
```
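The `rope_scaling` normalization at the bottom of the deleted config is easiest to see on a concrete value. A small sketch (the `mrope_section` numbers mirror what Qwen2-VL checkpoints ship; shown only for illustration):

```python
# Sample rope_scaling as found in a Qwen2-VL config.json.
rope_scaling = {"type": "mrope", "mrope_section": [16, 24, 24]}

# The same normalization the deleted Qwen2VLConfig performed:
if rope_scaling is not None and "type" in rope_scaling:
    if rope_scaling["type"] == "mrope":
        rope_scaling["type"] = "default"
    rope_scaling["rope_type"] = rope_scaling["type"]

print(rope_scaling)
# {'type': 'default', 'mrope_section': [16, 24, 24], 'rope_type': 'default'}
```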
## python/sglang/srt/hf_transformers_utils.py (+2, −3)

```diff
@@ -30,16 +30,15 @@ from transformers import (
 )
 from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES

-from sglang.srt.configs import ChatGLMConfig, DbrxConfig, ExaoneConfig, Qwen2VLConfig
+from sglang.srt.configs import ChatGLMConfig, DbrxConfig, ExaoneConfig, Qwen2_5_VLConfig

 _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
     ChatGLMConfig.model_type: ChatGLMConfig,
     DbrxConfig.model_type: DbrxConfig,
     ExaoneConfig.model_type: ExaoneConfig,
-    Qwen2VLConfig.model_type: Qwen2VLConfig,
+    Qwen2_5_VLConfig.model_type: Qwen2_5_VLConfig,
 }

 for name, cls in _CONFIG_REGISTRY.items():
     with contextlib.suppress(ValueError):
         AutoConfig.register(name, cls)
```
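For context, the registry loop relies on `transformers.AutoConfig.register`, which raises `ValueError` when a `model_type` is already taken; that is why each registration is wrapped in `contextlib.suppress`. A self-contained sketch of the same pattern with a made-up config class:

```python
import contextlib

from transformers import AutoConfig, PretrainedConfig


class MyConfig(PretrainedConfig):
    model_type = "my_model"  # hypothetical model type, for illustration only


# register() raises ValueError if "my_model" is already registered
# (e.g. on double import), so mirror the commit and suppress it.
with contextlib.suppress(ValueError):
    AutoConfig.register(MyConfig.model_type, MyConfig)
```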
## python/sglang/srt/managers/image_processor.py (+217, −122)

```diff
 # TODO: also move pad_input_ids into this module
 import asyncio
 import concurrent.futures
+import dataclasses
 import logging
 import multiprocessing as mp
 import os
@@ -8,6 +9,7 @@ from abc import ABC, abstractmethod
 from typing import List, Optional, Union

 import numpy as np
+import PIL
 import transformers
 from decord import VideoReader, cpu
 from PIL import Image
```

```diff
@@ -34,11 +36,22 @@ def init_global_processor(server_args: ServerArgs):
     )


+@dataclasses.dataclass
+class BaseImageProcessorOutput:
+    image_hashes: list[int]
+    image_sizes: list[int]
+    all_frames: [PIL.Image]
+    # input_text, with each frame of video/image represented with a image_token
+    input_text: str
+
+
 class BaseImageProcessor(ABC):
     def __init__(self, hf_config, server_args, _processor):
         self.hf_config = hf_config
         self._processor = _processor
         self.server_args = server_args
+        # FIXME: not accurate, model and image specific
+        self.NUM_TOKEN_PER_FRAME = 330

         self.executor = concurrent.futures.ProcessPoolExecutor(
             initializer=init_global_processor,
```

```diff
@@ -48,9 +61,128 @@ class BaseImageProcessor(ABC):
     @abstractmethod
-    async def process_images_async(self, image_data, input_text, **kwargs):
+    async def process_images_async(
+        self, image_data, input_text, max_req_input_len, **kwargs
+    ):
         pass
```

The same hunk adds shared helpers to `BaseImageProcessor` (frame estimation, video decoding, and image-token expansion, previously inlined in `MiniCPMVImageProcessor`):

```python
    def get_estimated_frames_list(self, image_data):
        """
        estimate the total frame count from all visual input
        """
        # Before processing inputs
        estimated_frames_list = []
        for image in image_data:
            if isinstance(image, str) and image.startswith("video:"):
                path = image[len("video:") :]
                # Estimate frames for the video
                vr = VideoReader(path, ctx=cpu(0))
                num_frames = len(vr)
            else:
                # For images, each contributes one frame
                num_frames = 1
            estimated_frames_list.append(num_frames)

        return estimated_frames_list

    def encode_video(self, video_path, frame_count_limit=None):
        if not os.path.exists(video_path):
            logger.error(f"Video {video_path} does not exist")
            return []

        if frame_count_limit == 0:
            return []

        def uniform_sample(l, n):
            gap = len(l) / n
            idxs = [int(i * gap + gap / 2) for i in range(n)]
            return [l[i] for i in idxs]

        vr = VideoReader(video_path, ctx=cpu(0))
        sample_fps = round(vr.get_avg_fps() / 1)  # FPS
        frame_idx = [i for i in range(0, len(vr), sample_fps)]
        if frame_count_limit is not None and len(frame_idx) > frame_count_limit:
            frame_idx = uniform_sample(frame_idx, frame_count_limit)
        frames = vr.get_batch(frame_idx).asnumpy()
        frames = [Image.fromarray(v.astype("uint8")) for v in frames]
        return frames

    def load_images(
        self,
        max_req_input_len: int,
        input_ids: list,
        image_data,
        image_token: str,
    ) -> BaseImageProcessorOutput:
        """
        Each frame of video/image will be replaced by a single image token
        """
        image_hashes, image_sizes = [], []
        all_frames = []
        new_text_parts = []

        if isinstance(input_ids, list):
            assert len(input_ids) and isinstance(input_ids[0], int)
            input_text = self._processor.tokenizer.decode(input_ids)
        else:
            input_text = input_ids

        text_parts = input_text.split(image_token)

        # roughly calculate the max number of frames under the max_req_input_len limit
        def calculate_max_num_frames() -> int:
            ret = (max_req_input_len - len(input_ids)) // self.NUM_TOKEN_PER_FRAME
            return min(ret, 100)

        MAX_NUM_FRAMES = calculate_max_num_frames()
        estimated_frames_list = self.get_estimated_frames_list(image_data=image_data)
        total_frame_count = sum(estimated_frames_list)
        # a heuristic value, suggesting the maximum fraction of frames to embed
        # from all visual inputs. e.g., 0.1 suggests that 1 frame out of 10
        # input frames should be used
        scaling_factor = min(1.0, MAX_NUM_FRAMES / total_frame_count)

        # Process each input with allocated frames
        for image_index, (image, estimated_frames) in enumerate(
            zip(image_data, estimated_frames_list)
        ):
            if len(all_frames) >= MAX_NUM_FRAMES:
                frames_to_process = 0
            else:
                frames_to_process = max(1, int(estimated_frames * scaling_factor))

            if frames_to_process == 0:
                frames = []
            else:
                try:
                    if isinstance(image, str) and image.startswith("video:"):
                        path = image[len("video:") :]
                        frames = self.encode_video(
                            path, frame_count_limit=frames_to_process
                        )
                    else:
                        raw_image, _size = load_image(image)
                        frames = [raw_image]
                    if len(frames) == 0:
                        continue
                except FileNotFoundError as e:
                    print(e)
                    return None

                image_sizes += frames[0].size * len(frames)
                image_hashes += [hash(image)] * len(frames)
                all_frames += frames

            new_text_parts.append(text_parts[image_index])
            if frames_to_process != 0:
                new_text_parts.append(image_token * len(frames))
                assert frames_to_process == len(frames)

        new_text_parts.append(text_parts[-1])
        input_text = "".join(new_text_parts)
        return BaseImageProcessorOutput(
            image_hashes, image_sizes, all_frames, input_text
        )
```
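The frame budget in `load_images` is simple integer arithmetic; worked numbers (illustrative, not from the commit) make the behavior concrete:

```python
# Sketch of the budget math: a 32k request limit, a 1k-token prompt,
# and the base class's NUM_TOKEN_PER_FRAME of 330.
max_req_input_len, prompt_len, NUM_TOKEN_PER_FRAME = 32768, 1000, 330

# calculate_max_num_frames(): budget, capped at 100 frames.
MAX_NUM_FRAMES = min((max_req_input_len - prompt_len) // NUM_TOKEN_PER_FRAME, 100)
print(MAX_NUM_FRAMES)  # 96

# A 600-frame video estimate is then thinned by scaling_factor.
total_frame_count = 600
scaling_factor = min(1.0, MAX_NUM_FRAMES / total_frame_count)
print(round(scaling_factor, 2))  # 0.16 -> roughly 1 frame kept out of 6
```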
Inside `MiniCPMVImageProcessor`, access to the processor result switches from item access to attribute access:

```diff
@@ -248,9 +380,9 @@ class MiniCPMVImageProcessor(BaseImageProcessor):
             text=input_text, images=images, return_tensors="pt"
         )
         return {
-            "input_ids": result["input_ids"],
-            "pixel_values": result["pixel_values"],
-            "tgt_sizes": result["tgt_sizes"],
+            "input_ids": result.input_ids,
+            "pixel_values": result.pixel_values,
+            "tgt_sizes": result.tgt_sizes,
         }

     async def _process_images(self, images, input_text):
```
The bulk of `MiniCPMVImageProcessor.process_images_async` is then replaced by a call to the shared `load_images` helper (the removed lines, elided below, were the inline frame-estimation, video-decoding, and token-splitting logic now hoisted into `BaseImageProcessor` above):

```diff
@@ -278,124 +410,20 @@ class MiniCPMVImageProcessor(BaseImageProcessor):
     ):
         if not image_data:
             return None

         if not isinstance(image_data, list):
             image_data = [image_data]

-        image_hashes, image_sizes = [], []
-        all_frames = []
-        # ... ~110 removed lines: calculate_max_num_frames(),
-        # get_estimated_frames_list(), encode_video(), and the per-input
-        # frame loop, all duplicated by the new base-class helpers ...
-
-        if len(all_frames) == 0:
+        base_output = self.load_images(
+            max_req_input_len, input_ids, image_data, self.IMAGE_TOKEN
+        )
+        if base_output is None:
+            return None
+
+        if len(base_output.all_frames) == 0:
             return None

-        res = await self._process_images(images=all_frames, input_text=input_text)
-        pixel_values = res["pixel_values"]
-        tgt_sizes = res["tgt_sizes"]
-        input_ids = res["input_ids"]
+        res = await self._process_images(
+            images=base_output.all_frames, input_text=base_output.input_text
+        )

         # Collect special token ids
         tokenizer = self._processor.tokenizer
```
```diff
@@ -405,10 +433,10 @@ class MiniCPMVImageProcessor(BaseImageProcessor):
         slice_start_id = [tokenizer.slice_start_id]
         slice_end_id = [tokenizer.slice_end_id]
         return {
-            "input_ids": input_ids.flatten().tolist(),
-            "pixel_values": pixel_values,
-            "tgt_sizes": tgt_sizes,
-            "image_hashes": image_hashes,
+            "input_ids": res["input_ids"].flatten().tolist(),
+            "pixel_values": res["pixel_values"],
+            "tgt_sizes": res["tgt_sizes"],
+            "image_hashes": base_output.image_hashes,
             "modalities": request_obj.modalities or ["image"],
             "im_start_id": im_start_id,
             "im_end_id": im_end_id,
```
The final hunk (`@@ -536,13 +564,80 @@`) adds an image processor for Qwen2.5-VL, built on the new base-class helpers:

```python
class Qwen2_5VLImageProcessor(BaseImageProcessor):
    def __init__(self, hf_config, server_args, _processor):
        super().__init__(hf_config, server_args, _processor)
        self.IMAGE_TOKEN = "<|vision_start|><|image_pad|><|vision_end|>"
        self.IM_START_TOKEN_ID = hf_config.vision_start_token_id
        self.IM_END_TOKEN_ID = hf_config.vision_end_token_id
        self.NUM_TOKEN_PER_FRAME = 770

    @staticmethod
    def _process_images_task(images, input_text):
        result = global_processor.__call__(
            text=input_text, images=images, return_tensors="pt"
        )
        return {
            "input_ids": result.input_ids,
            "pixel_values": result.pixel_values,
            "image_grid_thws": result.image_grid_thw,
        }

    async def _process_images(self, images, input_text) -> dict:
        if self.executor is not None:
            loop = asyncio.get_event_loop()
            return await loop.run_in_executor(
                self.executor,
                Qwen2_5VLImageProcessor._process_images_task,
                images,
                input_text,
            )
        else:
            return self._process_images_task(images, input_text)

    async def process_images_async(
        self,
        image_data: List[Union[str, bytes]],
        input_ids,
        request_obj,
        max_req_input_len,
        *args,
        **kwargs,
    ):
        if not image_data:
            return None
        if isinstance(image_data, str):
            image_data = [image_data]

        image_token = self.IMAGE_TOKEN
        base_output = self.load_images(
            max_req_input_len, input_ids, image_data, image_token
        )

        ret = await self._process_images(
            base_output.all_frames, base_output.input_text
        )
        return {
            "input_ids": ret["input_ids"].flatten().tolist(),
            "pixel_values": ret["pixel_values"],
            "image_hashes": base_output.image_hashes,
            "modalities": request_obj.modalities or ["image"],
            "image_grid_thws": ret["image_grid_thws"],
            "im_start_id": self.IM_START_TOKEN_ID,
            "im_end_id": self.IM_END_TOKEN_ID,
        }
```
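The prompt rewriting that `load_images` performs for this processor boils down to replacing each image placeholder with one copy of `IMAGE_TOKEN` per retained frame. A small sketch (the frame count is illustrative):

```python
IMAGE_TOKEN = "<|vision_start|><|image_pad|><|vision_end|>"

prompt = f"Describe this video: {IMAGE_TOKEN} in one sentence."
parts = prompt.split(IMAGE_TOKEN)
frames_decoded = 3  # frames kept for this input after budgeting

# One IMAGE_TOKEN per decoded frame replaces the single placeholder.
expanded = parts[0] + IMAGE_TOKEN * frames_decoded + parts[1]
assert expanded.count("<|image_pad|>") == frames_decoded
```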
The dispatcher at the bottom of the file routes the new architecture to it, and now passes the full `processor` (rather than `processor.image_processor`) to `Qwen2VLImageProcessor`:

```diff
 def get_image_processor(
     hf_config, server_args: ServerArgs, processor
 ) -> BaseImageProcessor:
     if "MllamaForConditionalGeneration" in hf_config.architectures:
         return MllamaImageProcessor(hf_config, server_args, processor)
     elif "Qwen2VLForConditionalGeneration" in hf_config.architectures:
-        return Qwen2VLImageProcessor(hf_config, server_args, processor.image_processor)
+        return Qwen2VLImageProcessor(hf_config, server_args, processor)
+    elif "Qwen2_5_VLForConditionalGeneration" in hf_config.architectures:
+        return Qwen2_5VLImageProcessor(hf_config, server_args, processor)
     elif "MiniCPMV" in hf_config.architectures:
         return MiniCPMVImageProcessor(hf_config, server_args, processor)
     else:
```
## python/sglang/srt/models/qwen2_5_vl.py (new file, +722)

The new model implementation for Qwen2.5-VL. The diff is collapsed in the web view and is not reproduced here.
## python/sglang/srt/models/qwen2_vl.py (+2, −1)

With `sglang.srt.configs.qwen2vl` deleted, the Qwen2-VL model imports its config classes from `transformers`:

```diff
@@ -31,8 +31,9 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange
+from transformers import Qwen2VLConfig
+from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLVisionConfig

-from sglang.srt.configs import Qwen2VLConfig, Qwen2VLVisionConfig
 from sglang.srt.hf_transformers_utils import get_processor
 from sglang.srt.layers.activation import QuickGELU
 from sglang.srt.layers.attention.vision import VisionAttention
```
## test/srt/test_vision_openai_server.py (+36, −0)

```diff
@@ -252,6 +252,18 @@ class TestOpenAIVisionServer(unittest.TestCase):
         print("-" * 30)

         # Add assertions to validate the video response
+        assert "iPod" in video_response or "device" in video_response, video_response
+        assert (
+            "man" in video_response
+            or "person" in video_response
+            or "individual" in video_response
+        ), video_response
+        assert (
+            "present" in video_response
+            or "examine" in video_response
+            or "display" in video_response
+        )
+        assert "black" in video_response or "dark" in video_response
         self.assertIsNotNone(video_response)
         self.assertGreater(len(video_response), 0)
```

```diff
@@ -366,6 +378,30 @@ class TestQWen2VLServer(TestOpenAIVisionServer):
         cls.base_url += "/v1"


+class TestQWen2_5_VLServer(TestOpenAIVisionServer):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "Qwen/Qwen2.5-VL-7B-Instruct"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            api_key=cls.api_key,
+            other_args=[
+                "--chat-template",
+                "qwen2-vl",
+                # FIXME: workaround to chunked prefill within image embeds
+                "--chunked-prefill-size",
+                "10000",
+                "--mem-fraction-static",
+                "0.4",
+            ],
+        )
+        cls.base_url += "/v1"
+
+
 class TestQWen2VLServerContextLengthIssue(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
```
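Mirroring the test configuration above, the new model can be exercised end to end. A sketch (server flags copied from the test; the image URL is a placeholder, and port 30000 is SGLang's default):

```python
# Launch the server first, e.g.:
#   python -m sglang.launch_server --model-path Qwen/Qwen2.5-VL-7B-Instruct \
#       --chat-template qwen2-vl --chunked-prefill-size 10000 \
#       --mem-fraction-static 0.4 --api-key sk-123456
import openai

client = openai.Client(api_key="sk-123456", base_url="http://127.0.0.1:30000/v1")
response = client.chat.completions.create(
    model="Qwen/Qwen2.5-VL-7B-Instruct",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": "https://example.com/cat.png"},
                },
                {"type": "text", "text": "What is in this image?"},
            ],
        }
    ],
)
print(response.choices[0].message.content)
```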