change / sglang / Commits

Commit 66d6be08 (Unverified)
Bug fix: use correct mm_items in embed_mm_inputs (#8893)

Authored Aug 16, 2025 by Binyao Jiang; committed by GitHub on Aug 16, 2025
Parent: 1c1f8a11

Showing 3 changed files with 62 additions and 1 deletion (+62 / -1)
python/sglang/srt/managers/mm_utils.py            +1   -1
test/srt/test_vision_openai_server_b.py           +3   -0
test/srt/test_vision_openai_server_common.py     +58   -0
python/sglang/srt/managers/mm_utils.py

@@ -560,7 +560,7 @@ def embed_mm_inputs(
         ]
         items_size[i + 1] = len(mm_items)
         items_offsets.append(
-            flatten_nested_list([item.offsets for item in mm_inputs.mm_items])
+            flatten_nested_list([item.offsets for item in mm_items])
         )
     items_size = torch.cumsum(items_size, dim=0).tolist()
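The one-line fix above only changes which list the offsets are gathered from: each modality's offsets must come from the filtered per-modality `mm_items`, not from the full `mm_inputs.mm_items`, otherwise every bucket in `items_offsets` repeats the offsets of all modalities and no longer lines up with `items_size`. Below is a self-contained sketch of that grouping pattern; the `MMItem` class, the `flatten_nested_list` helper, and the toy offsets are illustrative stand-ins, not sglang's actual definitions.

    from dataclasses import dataclass

    import torch


    @dataclass
    class MMItem:
        modality: str
        offsets: list  # token offsets of this item's placeholder(s) in the prompt


    def flatten_nested_list(nested):
        return [x for sub in nested for x in sub]


    # Toy inputs: one image item and one audio item in the same request.
    all_items = [
        MMItem("image", [[3, 10]]),
        MMItem("audio", [[12, 40]]),
    ]
    modalities = ["image", "audio"]

    items_size = torch.zeros(len(modalities) + 1, dtype=torch.int64)
    items_offsets = []
    for i, modality in enumerate(modalities):
        mm_items = [item for item in all_items if item.modality == modality]
        items_size[i + 1] = len(mm_items)
        # The fix in this commit: gather offsets from the filtered mm_items,
        # not from the full list (mm_inputs.mm_items), so each per-modality
        # bucket stays aligned with items_size.
        items_offsets.append(flatten_nested_list([item.offsets for item in mm_items]))
    items_size = torch.cumsum(items_size, dim=0).tolist()

    print(items_size)     # [0, 1, 2]: one item per modality
    print(items_offsets)  # [[[3, 10]], [[12, 40]]]: offsets grouped per modality

With the pre-fix iterator, both buckets would contain the offsets of every item, so downstream consumers indexing by modality would read the wrong offsets.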
test/srt/test_vision_openai_server_b.py

@@ -189,6 +189,9 @@ class TestGemma3nServer(TestOpenAIVisionServer):
         # This _test_audio_ambient_completion test is way too complicated to pass for a small LLM
         # self._test_audio_ambient_completion()
 
+    def test_mixed_image_audio_chat_completion(self):
+        self._test_mixed_image_audio_chat_completion()
+
 
 class TestQwen2AudioServer(TestOpenAIVisionServer):
     @classmethod
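To exercise just the newly wired-up Gemma3n test, a runner along these lines should work when executed from `test/srt` in a provisioned sglang test environment (GPU plus model weights); this is a hedged example, not part of the commit.

    import unittest

    # Assumes the current working directory is test/srt so the module imports directly.
    from test_vision_openai_server_b import TestGemma3nServer

    # Build a suite containing only the new mixed image+audio test and run it.
    suite = unittest.TestSuite()
    suite.addTest(TestGemma3nServer("test_mixed_image_audio_chat_completion"))
    unittest.TextTestRunner(verbosity=2).run(suite)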
test/srt/test_vision_openai_server_common.py

@@ -213,6 +213,64 @@ class TestOpenAIVisionServer(CustomTestCase):
         assert response.usage.completion_tokens > 0
         assert response.usage.total_tokens > 0
 
+    def _test_mixed_image_audio_chat_completion(self):
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+        response = client.chat.completions.create(
+            model="default",
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": IMAGE_MAN_IRONING_URL},
+                        },
+                        {
+                            "type": "audio_url",
+                            "audio_url": {"url": AUDIO_TRUMP_SPEECH_URL},
+                        },
+                        {
+                            "type": "text",
+                            "text": "Please describe the image in one sentence, and then write down the audio transcription in English.",
+                        },
+                    ],
+                },
+            ],
+            temperature=0,
+            **(self.get_vision_request_kwargs()),
+        )
+
+        assert response.choices[0].message.role == "assistant"
+        text = response.choices[0].message.content
+        assert isinstance(text, str)
+        print("-" * 30)
+        print(f"Mixed image & audio response:\n{text}")
+        print("-" * 30)
+        assert (
+            "man" in text
+            or "cab" in text
+            or "SUV" in text
+            or "taxi" in text
+            or "car" in text
+        ), f"text: {text}, should contain man, cab, SUV, taxi or car"
+        check_list = [
+            "thank you",
+            "it's a privilege to be here",
+            "leader",
+            "science",
+            "art",
+        ]
+        for check_word in check_list:
+            assert (
+                check_word in text
+            ), f"text: |{text}| should contain |{check_word}|"
+
+        assert response.id
+        assert response.created
+        assert response.usage.prompt_tokens > 0
+        assert response.usage.completion_tokens > 0
+        assert response.usage.total_tokens > 0
+
     def prepare_video_images_messages(self, video_path):
         # the memory consumed by the Vision Attention varies a lot, e.g. blocked qkv vs full-sequence sdpa
         # the size of the video embeds differs from the `modality` argument when preprocessed
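Outside the test harness, the same mixed image-plus-audio request can be sent to a running sglang OpenAI-compatible server with the plain `openai` client. The sketch below is hypothetical: the server address, API key, and media URLs are placeholders rather than the constants (`IMAGE_MAN_IRONING_URL`, `AUDIO_TRUMP_SPEECH_URL`) used by the test, and it assumes an audio-capable model is already being served.

    import openai

    # Placeholder endpoint, key, and media URLs; adjust for a real deployment.
    client = openai.Client(api_key="sk-placeholder", base_url="http://127.0.0.1:30000/v1")

    response = client.chat.completions.create(
        model="default",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": "https://example.com/scene.jpg"}},
                    {"type": "audio_url", "audio_url": {"url": "https://example.com/speech.wav"}},
                    {
                        "type": "text",
                        "text": "Please describe the image in one sentence, and then write down the audio transcription in English.",
                    },
                ],
            },
        ],
        temperature=0,
    )
    print(response.choices[0].message.content)

The `audio_url` content part is an sglang extension of the OpenAI chat format; the client forwards it as-is, exactly as the test does.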