sglang · commit b5e3d603 (unverified)

vlm: support video as an input modality (#5888)

Authored Jul 10, 2025 by Mick; committed by GitHub Jul 09, 2025.
Parent: 4ed57807

Showing 2 changed files with 160 additions and 122 deletions:
- test/srt/test_vision_openai_server_common.py: +55 −10
- test/srt/test_vlm_accuracy.py: +105 −112
test/srt/test_vision_openai_server_common.py

```diff
@@ -198,7 +198,7 @@ class TestOpenAIVisionServer(CustomTestCase):
         assert response.usage.completion_tokens > 0
         assert response.usage.total_tokens > 0
 
-    def prepare_video_messages(self, video_path):
+    def prepare_video_images_messages(self, video_path):
         # the memory consumed by the Vision Attention varies a lot, e.g. blocked qkv vs full-sequence sdpa
         # the size of the video embeds differs from the `modality` argument when preprocessed
```
```diff
@@ -208,7 +208,7 @@ class TestOpenAIVisionServer(CustomTestCase):
         # from transformers import AutoTokenizer
         from decord import VideoReader, cpu
 
-        max_frames_num = 20
+        max_frames_num = 10
         vr = VideoReader(video_path, ctx=cpu(0))
         total_frame_num = len(vr)
         uniform_sampled_frames = np.linspace(
```
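For readers unfamiliar with the frames-as-images path, the sketch below shows the kind of sampling this helper performs: pick `max_frames_num` evenly spaced frame indices with `np.linspace`, decode them with decord, and base64-encode each frame as JPEG. The helper name and the JPEG/base64 step are illustrative assumptions; only the `VideoReader`/`np.linspace` usage mirrors the test above.

```python
import base64
import io

import numpy as np
from decord import VideoReader, cpu
from PIL import Image


def sample_video_frames_as_base64(video_path: str, max_frames_num: int = 10):
    """Hypothetical helper: uniformly sample up to `max_frames_num` frames
    from a video and return them as base64-encoded JPEG strings."""
    vr = VideoReader(video_path, ctx=cpu(0))
    total_frame_num = len(vr)
    # Evenly spaced frame indices across the whole clip, as in the test above.
    uniform_sampled_frames = np.linspace(
        0, total_frame_num - 1, max_frames_num, dtype=int
    )
    frames = vr.get_batch(uniform_sampled_frames).asnumpy()

    base64_frames = []
    for frame in frames:
        buf = io.BytesIO()
        Image.fromarray(frame).save(buf, format="JPEG")
        base64_frames.append(base64.b64encode(buf.getvalue()).decode("utf-8"))
    return base64_frames
```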
```diff
@@ -229,7 +229,7 @@ class TestOpenAIVisionServer(CustomTestCase):
         frame_format = {
             "type": "image_url",
             "image_url": {"url": "data:image/jpeg;base64,{}"},
-            "modalities": "video",
+            "modalities": "image",
         }
 
         for base64_frame in base64_frames:
```
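Each sampled frame is then wrapped as its own `image_url` content item, now tagged `"modalities": "image"` (the `"video"` tag is reserved for the direct-video path below). A hedged sketch of how the resulting message list might be assembled; the function name is an assumption, while the content-item shape and the question text come from the tests in this file.

```python
def build_frame_messages(base64_frames, question="Please describe the video in detail."):
    """Hypothetical sketch: turn base64 JPEG frames into one user message whose
    content is a list of image_url items followed by a text question."""
    content = [
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{frame}"},
            "modalities": "image",
        }
        for frame in base64_frames
    ]
    content.append({"type": "text", "text": question})
    return [{"role": "user", "content": content}]
```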
```diff
@@ -243,15 +243,14 @@ class TestOpenAIVisionServer(CustomTestCase):
 
         return messages
 
-    def prepare_video_messages_video_direct(self, video_path):
+    def prepare_video_messages(self, video_path):
         messages = [
             {
                 "role": "user",
                 "content": [
                     {
-                        "type": "image_url",
-                        "image_url": {"url": f"video:{video_path}"},
-                        "modalities": "video",
+                        "type": "video_url",
+                        "video_url": {"url": f"{video_path}"},
                     },
                     {"type": "text", "text": "Please describe the video in detail."},
                 ],
```
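The renamed `prepare_video_messages` now exercises the new input modality directly: the video is passed once as a `video_url` content item instead of being expanded into frames client-side. Below is a minimal end-to-end sketch against a running sglang OpenAI-compatible server; the `base_url`, `api_key`, and local file path are placeholder assumptions, while the message shape and `model="default"` follow the tests in this file.

```python
import openai

# Assumed: an sglang server with a video-capable VLM is already listening here.
client = openai.Client(api_key="sk-local", base_url="http://127.0.0.1:30000/v1")

video_path = "/path/to/video.mp4"  # placeholder path
messages = [
    {
        "role": "user",
        "content": [
            {"type": "video_url", "video_url": {"url": f"{video_path}"}},
            {"type": "text", "text": "Please describe the video in detail."},
        ],
    }
]

response = client.chat.completions.create(
    model="default",
    messages=messages,
    temperature=0,
    max_tokens=1024,
)
print(response.choices[0].message.content)
```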
```diff
@@ -275,13 +274,57 @@ class TestOpenAIVisionServer(CustomTestCase):
             f.write(response.content)
         return file_path
 
-    def test_video_chat_completion(self):
+    # this test samples frames of video as input, but not video directly
+    def test_video_images_chat_completion(self):
+        url = VIDEO_JOBS_URL
+        file_path = self.get_or_download_file(url)
+
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+
+        messages = self.prepare_video_images_messages(file_path)
+
+        response = client.chat.completions.create(
+            model="default",
+            messages=messages,
+            temperature=0,
+            max_tokens=1024,
+            stream=False,
+        )
+
+        video_response = response.choices[0].message.content
+
+        print("-" * 30)
+        print(f"Video images response:\n{video_response}")
+        print("-" * 30)
+
+        # Add assertions to validate the video response
+        assert (
+            "iPod" in video_response
+            or "device" in video_response
+            or "microphone" in video_response
+        ), video_response
+        assert (
+            "man" in video_response
+            or "person" in video_response
+            or "individual" in video_response
+            or "speaker" in video_response
+        ), video_response
+        assert (
+            "present" in video_response
+            or "examine" in video_response
+            or "display" in video_response
+            or "hold" in video_response
+        )
+        assert "black" in video_response or "dark" in video_response
+        self.assertIsNotNone(video_response)
+        self.assertGreater(len(video_response), 0)
+
+    def _test_video_chat_completion(self):
         url = VIDEO_JOBS_URL
         file_path = self.get_or_download_file(url)
 
         client = openai.Client(api_key=self.api_key, base_url=self.base_url)
 
-        # messages = self.prepare_video_messages_video_direct(file_path)
         messages = self.prepare_video_messages(file_path)
 
         response = client.chat.completions.create(
```
```diff
@@ -301,7 +344,9 @@ class TestOpenAIVisionServer(CustomTestCase):
 
         # Add assertions to validate the video response
         assert (
-            "iPod" in video_response or "device" in video_response
+            "iPod" in video_response
+            or "device" in video_response
+            or "microphone" in video_response
         ), f"video_response: {video_response}, should contain 'iPod' or 'device'"
         assert (
             "man" in video_response
```
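Note that the original direct-video test is not deleted but renamed to `_test_video_chat_completion`; unittest's default loader only collects methods whose names start with `test`, so the leading underscore effectively disables it while keeping the code around. A tiny illustration (class and method names are made up):

```python
import unittest


class Demo(unittest.TestCase):
    def test_collected(self):
        # Picked up by the default test loader (name starts with "test").
        self.assertTrue(True)

    def _test_not_collected(self):
        # Ignored by discovery: the name no longer matches the "test" prefix.
        raise AssertionError("never runs under unittest.main()")


if __name__ == "__main__":
    unittest.main()  # runs only Demo.test_collected
```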
test/srt/test_vlm_accuracy.py

```diff
@@ -10,15 +10,8 @@ import requests
 import torch
 import torch.nn.functional as F
 from PIL import Image
-from transformers import (
-    AutoModel,
-    AutoProcessor,
-    AutoTokenizer,
-    Gemma3ForConditionalGeneration,
-    Qwen2_5_VLForConditionalGeneration,
-)
+from transformers import AutoModel, AutoProcessor, AutoTokenizer
 
-from sglang import Engine
 from sglang.srt.configs.model_config import ModelConfig
 from sglang.srt.conversation import generate_chat_conv
 from sglang.srt.entrypoints.openai.protocol import ChatCompletionRequest
```
@@ -169,107 +162,107 @@ class VisionLLMLogitsBase(unittest.IsolatedAsyncioTestCase):

In this hunk the entire `TestMiniCPMVLogits` class, previously present only as commented-out code (every line prefixed with `#`, with `init_embedding_cache(0)`, `image_offsets=image_offsets,` and `image_data_embedding_func=model.get_image_feature,` in place of the calls shown below), is un-commented and those three call sites are updated. The re-enabled code reads:

```python
# TODO: MiniCPMV is not compatible with transformers==4.52.3, temporarily disabled
class TestMiniCPMVLogits(VisionLLMLogitsBase):
    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        cls.model_path = "openbmb/MiniCPM-V-2_6"
        cls.tokenizer = AutoTokenizer.from_pretrained(
            cls.model_path, trust_remote_code=True
        )
        cls.processor = AutoProcessor.from_pretrained(
            cls.model_path, trust_remote_code=True
        )
        cls.chat_template = "minicpmv"

        cls.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        cls.hf_model = (
            AutoModel.from_pretrained(
                cls.model_path, torch_dtype=torch.bfloat16, trust_remote_code=True
            )
            .eval()
            .to(cls.device)
        )
        init_embedding_cache()

    async def test_vlm_embedding_output(self):
        """
        Compares the embedding output of vlm
        """
        inputs = self.get_processor_output()

        with torch.no_grad():
            # hf
            model_inputs = {
                "input_ids": inputs.input_ids,
                "image_bound": inputs.image_bound,
                "pixel_values": inputs.pixel_values,
                "tgt_sizes": inputs.tgt_sizes,
            }
            (hf_output, _) = self.hf_model.get_vllm_embedding(
                model_inputs,
            )
            hf_output = hf_output.squeeze(0)

            # sglang
            model = self.get_sglang_model()
            input_ids = inputs["input_ids"].to(self.device).flatten()

            pixel_values = inputs["pixel_values"]
            tgt_sizes = inputs["tgt_sizes"]
            pixel_values_flat: List[torch.Tensor] = []
            tgt_sizes_flat: List[torch.Tensor] = []
            for pixel_b, tgt_b in zip(pixel_values, tgt_sizes):
                # per image
                if len(pixel_b) != len(tgt_b):
                    raise ValueError(
                        "Inconsistent N lengths, found: "
                        f"{len(pixel_b)} vs {len(tgt_b)}"
                    )
                for pixel_n, tgt_n in zip(pixel_b, tgt_b):
                    pixel_values_flat += [pixel_n]
                    tgt_sizes_flat += [tgt_n]

            im_start_id, im_end_id = (
                self.tokenizer.im_start_id,
                self.tokenizer.im_end_id,
            )
            slice_start_id, slice_end_id = (
                self.tokenizer.slice_start_id,
                self.tokenizer.slice_end_id,
            )

            image_offsets = BaseMultimodalProcessor.get_mm_items_offset_by_pair(
                input_ids=input_ids, mm_start_id=im_start_id, mm_end_id=im_end_id
            )
            slice_offsets = BaseMultimodalProcessor.get_mm_items_offset_by_pair(
                input_ids=input_ids, mm_start_id=slice_start_id, mm_end_id=slice_end_id
            )
            image_offsets.extend(slice_offsets)
            image_offsets = sorted(image_offsets)

            sglang_output = embed_mm_inputs(
                mm_inputs_list=[
                    MultimodalInputs(
                        mm_items=[
                            MultimodalDataItem(
                                pixel_values=pixel_values_flat,
                                offsets=image_offsets,
                                tgt_size=tgt_sizes_flat,
                                modality=Modality.IMAGE,
                                pad_value=self.processor.tokenizer.unk_token_id,
                            )
                        ]
                    ),
                ],
                extend_prefix_lens=[0],
                extend_seq_lens=[input_ids.shape[0]],
                input_ids=input_ids,
                input_embedding=model.get_input_embeddings(),
                multimodal_model=model,
                placeholder_tokens={
                    Modality.IMAGE: self.processor.tokenizer.unk_token_id,
                },
            )

            self.compare_outputs(sglang_output, hf_output)
```
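`compare_outputs` is defined elsewhere in `VisionLLMLogitsBase` and is untouched by this commit; as a rough mental model, it checks that the sglang multimodal embedding matches the HuggingFace reference. A hypothetical stand-in with illustrative tolerances might look like this:

```python
import torch
import torch.nn.functional as F


def compare_embeddings(sglang_output: torch.Tensor, hf_output: torch.Tensor) -> None:
    """Hypothetical stand-in for VisionLLMLogitsBase.compare_outputs: check that
    two [seq_len, hidden] embedding tensors agree closely."""
    assert sglang_output.shape == hf_output.shape, (
        sglang_output.shape,
        hf_output.shape,
    )
    a = sglang_output.float()
    b = hf_output.float()
    # Per-token cosine similarity should be ~1.0 everywhere.
    cos = F.cosine_similarity(a, b, dim=-1)
    assert cos.min().item() > 0.99, f"min cosine similarity {cos.min().item():.4f}"
    # Absolute difference should also be small (tolerances are illustrative).
    assert torch.allclose(a, b, rtol=1e-2, atol=1e-2), (a - b).abs().max().item()
```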