sglang commit b5e3d603 (unverified)
Authored Jul 10, 2025 by Mick; committed via GitHub on Jul 09, 2025
Parent commit: 4ed57807

vlm: support video as an input modality (#5888)
Showing 2 changed files with 160 additions and 122 deletions (the commit changes 42 files in total; only the two below appear on this page):

  test/srt/test_vision_openai_server_common.py   +55  -10
  test/srt/test_vlm_accuracy.py                  +105 -112
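For orientation, the request shape this commit enables (and which the updated tests below exercise) looks roughly like the following. This is a minimal client-side sketch, not code from the commit; the base URL, API key, model name ("default"), and video path are placeholders for whatever the local sglang OpenAI-compatible server is actually serving. The frames-as-images fallback kept by the renamed prepare_video_images_messages helper is sketched further below.

# Hedged sketch: send a video directly to an OpenAI-compatible sglang server
# using the message shape introduced in this commit ("video_url" content type).
# Base URL, API key, model name, and video path are placeholder assumptions.
import openai

client = openai.Client(api_key="sk-local", base_url="http://127.0.0.1:30000/v1")

messages = [
    {
        "role": "user",
        "content": [
            # direct video input, as exercised by prepare_video_messages below
            {"type": "video_url", "video_url": {"url": "/path/to/video.mp4"}},
            {"type": "text", "text": "Please describe the video in detail."},
        ],
    }
]

response = client.chat.completions.create(
    model="default",
    messages=messages,
    temperature=0,
    max_tokens=1024,
    stream=False,
)
print(response.choices[0].message.content)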
test/srt/test_vision_openai_server_common.py
@@ -198,7 +198,7 @@ class TestOpenAIVisionServer(CustomTestCase):
         assert response.usage.completion_tokens > 0
         assert response.usage.total_tokens > 0

-    def prepare_video_messages(self, video_path):
+    def prepare_video_images_messages(self, video_path):
         # the memory consumed by the Vision Attention varies a lot, e.g. blocked qkv vs full-sequence sdpa
         # the size of the video embeds differs from the `modality` argument when preprocessed
@@ -208,7 +208,7 @@ class TestOpenAIVisionServer(CustomTestCase):
         # from transformers import AutoTokenizer
         from decord import VideoReader, cpu

-        max_frames_num = 20
+        max_frames_num = 10
         vr = VideoReader(video_path, ctx=cpu(0))
         total_frame_num = len(vr)
         uniform_sampled_frames = np.linspace(
@@ -229,7 +229,7 @@ class TestOpenAIVisionServer(CustomTestCase):
         frame_format = {
             "type": "image_url",
             "image_url": {"url": "data:image/jpeg;base64,{}"},
-            "modalities": "video",
+            "modalities": "image",
         }

         for base64_frame in base64_frames:
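For comparison with the direct video_url request sketched above, here is a condensed sketch of what the frames-as-images helper (renamed to prepare_video_images_messages in this commit) does: sample a few frames uniformly with decord, JPEG-encode them as base64 data URLs, and tag each entry with "modalities": "image". The helper name and exact encoding details below are illustrative, not copied verbatim from the test.

# Hedged sketch of the frames-as-images path (decord sampling + base64
# image_url entries tagged with "modalities": "image"). Illustrative only.
import base64
import io

import numpy as np
from decord import VideoReader, cpu
from PIL import Image


def sample_video_as_image_messages(video_path, max_frames_num=10):
    vr = VideoReader(video_path, ctx=cpu(0))
    total_frame_num = len(vr)
    # uniformly sample up to max_frames_num frame indices across the video
    num_frames = min(max_frames_num, total_frame_num)
    indices = np.linspace(0, total_frame_num - 1, num_frames).astype(int)
    frames = vr.get_batch(indices).asnumpy()

    content = []
    for frame in frames:
        # JPEG-encode each sampled frame and wrap it as a base64 data URL
        buf = io.BytesIO()
        Image.fromarray(frame).save(buf, format="JPEG")
        b64 = base64.b64encode(buf.getvalue()).decode("utf-8")
        content.append(
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{b64}"},
                "modalities": "image",
            }
        )
    content.append({"type": "text", "text": "Please describe the video in detail."})
    return [{"role": "user", "content": content}]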
@@ -243,15 +243,14 @@ class TestOpenAIVisionServer(CustomTestCase):

         return messages

-    def prepare_video_messages_video_direct(self, video_path):
+    def prepare_video_messages(self, video_path):
         messages = [
             {
                 "role": "user",
                 "content": [
                     {
-                        "type": "image_url",
-                        "image_url": {"url": f"video:{video_path}"},
-                        "modalities": "video",
+                        "type": "video_url",
+                        "video_url": {"url": f"{video_path}"},
                     },
                     {"type": "text", "text": "Please describe the video in detail."},
                 ],
@@ -275,13 +274,57 @@ class TestOpenAIVisionServer(CustomTestCase):
                 f.write(response.content)
         return file_path

-    def test_video_chat_completion(self):
+    # this test samples frames of video as input, but not video directly
+    def test_video_images_chat_completion(self):
         url = VIDEO_JOBS_URL
         file_path = self.get_or_download_file(url)
         client = openai.Client(api_key=self.api_key, base_url=self.base_url)

+        messages = self.prepare_video_images_messages(file_path)
+
+        response = client.chat.completions.create(
+            model="default",
+            messages=messages,
+            temperature=0,
+            max_tokens=1024,
+            stream=False,
+        )
+
+        video_response = response.choices[0].message.content
+
+        print("-" * 30)
+        print(f"Video images response:\n{video_response}")
+        print("-" * 30)
+
+        # Add assertions to validate the video response
+        assert (
+            "iPod" in video_response
+            or "device" in video_response
+            or "microphone" in video_response
+        ), video_response
+        assert (
+            "man" in video_response
+            or "person" in video_response
+            or "individual" in video_response
+            or "speaker" in video_response
+        ), video_response
+        assert (
+            "present" in video_response
+            or "examine" in video_response
+            or "display" in video_response
+            or "hold" in video_response
+        )
+        assert "black" in video_response or "dark" in video_response
+        self.assertIsNotNone(video_response)
+        self.assertGreater(len(video_response), 0)
+
+    def _test_video_chat_completion(self):
+        url = VIDEO_JOBS_URL
+        file_path = self.get_or_download_file(url)
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+
-        # messages = self.prepare_video_messages_video_direct(file_path)
         messages = self.prepare_video_messages(file_path)

         response = client.chat.completions.create(
@@ -301,7 +344,9 @@ class TestOpenAIVisionServer(CustomTestCase):

         # Add assertions to validate the video response
         assert (
-            "iPod" in video_response or "device" in video_response
+            "iPod" in video_response
+            or "device" in video_response
+            or "microphone" in video_response
         ), f"video_response: {video_response}, should contain 'iPod' or 'device'"
         assert (
             "man" in video_response
test/srt/test_vlm_accuracy.py
@@ -10,15 +10,8 @@ import requests
 import torch
 import torch.nn.functional as F
 from PIL import Image
-from transformers import (
-    AutoModel,
-    AutoProcessor,
-    AutoTokenizer,
-    Gemma3ForConditionalGeneration,
-    Qwen2_5_VLForConditionalGeneration,
-)
-
+from transformers import AutoModel, AutoProcessor, AutoTokenizer
 from sglang import Engine
 from sglang.srt.configs.model_config import ModelConfig
 from sglang.srt.conversation import generate_chat_conv
 from sglang.srt.entrypoints.openai.protocol import ChatCompletionRequest
@@ -169,107 +162,107 @@ class VisionLLMLogitsBase(unittest.IsolatedAsyncioTestCase):
-# TODO: MiniCPMV is not compatible with transformers==4.52.3, temporarily disabled
-# class TestMiniCPMVLogits(VisionLLMLogitsBase):
-#     @classmethod
-#     def setUpClass(cls):
-#         super().setUpClass()
-#         cls.model_path = "openbmb/MiniCPM-V-2_6"
-#         cls.tokenizer = AutoTokenizer.from_pretrained(
-#             cls.model_path, trust_remote_code=True
-#         )
-#         cls.processor = AutoProcessor.from_pretrained(
-#             cls.model_path, trust_remote_code=True
-#         )
-#         cls.chat_template = "minicpmv"
-#
-#         cls.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-#         cls.hf_model = (
-#             AutoModel.from_pretrained(
-#                 cls.model_path, torch_dtype=torch.bfloat16, trust_remote_code=True
-#             )
-#             .eval()
-#             .to(cls.device)
-#         )
-#         init_embedding_cache(0)
-#
-#     async def test_vlm_embedding_output(self):
-#         """
-#         Compares the embedding output of vlm
-#         """
-#         inputs = self.get_processor_output()
-#
-#         with torch.no_grad():
-#             # hf
-#             model_inputs = {
-#                 "input_ids": inputs.input_ids,
-#                 "image_bound": inputs.image_bound,
-#                 "pixel_values": inputs.pixel_values,
-#                 "tgt_sizes": inputs.tgt_sizes,
-#             }
-#             (hf_output, _) = self.hf_model.get_vllm_embedding(
-#                 model_inputs,
-#             )
-#             hf_output = hf_output.squeeze(0)
-#
-#             # sglang
-#             model = self.get_sglang_model()
-#             input_ids = inputs["input_ids"].to(self.device).flatten()
-#
-#             pixel_values = inputs["pixel_values"]
-#             tgt_sizes = inputs["tgt_sizes"]
-#             pixel_values_flat: List[torch.Tensor] = []
-#             tgt_sizes_flat: List[torch.Tensor] = []
-#             for pixel_b, tgt_b in zip(pixel_values, tgt_sizes):
-#                 # per image
-#                 if len(pixel_b) != len(tgt_b):
-#                     raise ValueError(
-#                         "Inconsistent N lengths, found: "
-#                         f"{len(pixel_b)} vs {len(tgt_b)}"
-#                     )
-#                 for pixel_n, tgt_n in zip(pixel_b, tgt_b):
-#                     pixel_values_flat += [pixel_n]
-#                     tgt_sizes_flat += [tgt_n]
-#
-#             im_start_id, im_end_id = (
-#                 self.tokenizer.im_start_id,
-#                 self.tokenizer.im_end_id,
-#             )
-#             slice_start_id, slice_end_id = (
-#                 self.tokenizer.slice_start_id,
-#                 self.tokenizer.slice_end_id,
-#             )
-#
-#             image_offsets = BaseMultimodalProcessor.get_mm_items_offset_by_pair(
-#                 input_ids=input_ids, mm_start_id=im_start_id, mm_end_id=im_end_id
-#             )
-#             slice_offsets = BaseMultimodalProcessor.get_mm_items_offset_by_pair(
-#                 input_ids=input_ids, mm_start_id=slice_start_id, mm_end_id=slice_end_id
-#             )
-#             image_offsets.extend(slice_offsets)
-#             image_offsets = sorted(image_offsets)
-#
-#             sglang_output = embed_mm_inputs(
-#                 mm_inputs_list=[
-#                     MultimodalInputs(
-#                         mm_items=[
-#                             MultimodalDataItem(
-#                                 pixel_values=pixel_values_flat,
-#                                 image_offsets=image_offsets,
-#                                 tgt_size=tgt_sizes_flat,
-#                                 modality=Modality.IMAGE,
-#                                 pad_value=self.processor.tokenizer.unk_token_id,
-#                             )
-#                         ]
-#                     ),
-#                 ],
-#                 extend_prefix_lens=[0],
-#                 extend_seq_lens=[input_ids.shape[0]],
-#                 input_ids=input_ids,
-#                 input_embedding=model.get_input_embeddings(),
-#                 image_data_embedding_func=model.get_image_feature,
-#                 placeholder_tokens={
-#                     Modality.IMAGE: self.processor.tokenizer.unk_token_id,
-#                 },
-#             )
-#
-#             self.compare_outputs(sglang_output, hf_output)
+class TestMiniCPMVLogits(VisionLLMLogitsBase):
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
+        cls.model_path = "openbmb/MiniCPM-V-2_6"
+        cls.tokenizer = AutoTokenizer.from_pretrained(
+            cls.model_path, trust_remote_code=True
+        )
+        cls.processor = AutoProcessor.from_pretrained(
+            cls.model_path, trust_remote_code=True
+        )
+        cls.chat_template = "minicpmv"
+
+        cls.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        cls.hf_model = (
+            AutoModel.from_pretrained(
+                cls.model_path, torch_dtype=torch.bfloat16, trust_remote_code=True
+            )
+            .eval()
+            .to(cls.device)
+        )
+        init_embedding_cache()
+
+    async def test_vlm_embedding_output(self):
+        """
+        Compares the embedding output of vlm
+        """
+        inputs = self.get_processor_output()
+
+        with torch.no_grad():
+            # hf
+            model_inputs = {
+                "input_ids": inputs.input_ids,
+                "image_bound": inputs.image_bound,
+                "pixel_values": inputs.pixel_values,
+                "tgt_sizes": inputs.tgt_sizes,
+            }
+            (hf_output, _) = self.hf_model.get_vllm_embedding(
+                model_inputs,
+            )
+            hf_output = hf_output.squeeze(0)
+
+            # sglang
+            model = self.get_sglang_model()
+            input_ids = inputs["input_ids"].to(self.device).flatten()
+
+            pixel_values = inputs["pixel_values"]
+            tgt_sizes = inputs["tgt_sizes"]
+            pixel_values_flat: List[torch.Tensor] = []
+            tgt_sizes_flat: List[torch.Tensor] = []
+            for pixel_b, tgt_b in zip(pixel_values, tgt_sizes):
+                # per image
+                if len(pixel_b) != len(tgt_b):
+                    raise ValueError(
+                        "Inconsistent N lengths, found: "
+                        f"{len(pixel_b)} vs {len(tgt_b)}"
+                    )
+                for pixel_n, tgt_n in zip(pixel_b, tgt_b):
+                    pixel_values_flat += [pixel_n]
+                    tgt_sizes_flat += [tgt_n]
+
+            im_start_id, im_end_id = (
+                self.tokenizer.im_start_id,
+                self.tokenizer.im_end_id,
+            )
+            slice_start_id, slice_end_id = (
+                self.tokenizer.slice_start_id,
+                self.tokenizer.slice_end_id,
+            )
+
+            image_offsets = BaseMultimodalProcessor.get_mm_items_offset_by_pair(
+                input_ids=input_ids, mm_start_id=im_start_id, mm_end_id=im_end_id
+            )
+            slice_offsets = BaseMultimodalProcessor.get_mm_items_offset_by_pair(
+                input_ids=input_ids, mm_start_id=slice_start_id, mm_end_id=slice_end_id
+            )
+            image_offsets.extend(slice_offsets)
+            image_offsets = sorted(image_offsets)
+
+            sglang_output = embed_mm_inputs(
+                mm_inputs_list=[
+                    MultimodalInputs(
+                        mm_items=[
+                            MultimodalDataItem(
+                                pixel_values=pixel_values_flat,
+                                offsets=image_offsets,
+                                tgt_size=tgt_sizes_flat,
+                                modality=Modality.IMAGE,
+                                pad_value=self.processor.tokenizer.unk_token_id,
+                            )
+                        ]
+                    ),
+                ],
+                extend_prefix_lens=[0],
+                extend_seq_lens=[input_ids.shape[0]],
+                input_ids=input_ids,
+                input_embedding=model.get_input_embeddings(),
+                multimodal_model=model,
+                placeholder_tokens={
+                    Modality.IMAGE: self.processor.tokenizer.unk_token_id,
+                },
+            )
+
+            self.compare_outputs(sglang_output, hf_output)
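compare_outputs comes from the shared VisionLLMLogitsBase class and is not part of this diff. As a purely illustrative sketch of the kind of check such an embedding comparison typically performs (hypothetical, assuming the two tensors share a [seq_len, hidden] shape; not the repository's implementation):

# Hypothetical sketch of an embedding comparison, NOT the repository's
# compare_outputs implementation (which is outside this diff).
import torch
import torch.nn.functional as F


def compare_embeddings(a: torch.Tensor, b: torch.Tensor, min_cos: float = 0.99) -> None:
    assert a.shape == b.shape, f"shape mismatch: {a.shape} vs {b.shape}"
    # per-token cosine similarity between the two [seq_len, hidden] tensors
    cos = F.cosine_similarity(a.float(), b.float(), dim=-1)
    assert cos.min().item() >= min_cos, f"min cosine similarity {cos.min().item():.4f}"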