Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
c998d04b
Unverified
Commit
c998d04b
authored
Apr 24, 2025
by
Mick
Committed by
GitHub
Apr 23, 2025
Browse files
vlm: enable radix cache for qwen-vl models (#5349)
Co-authored-by:
Xinyuan Tong
<
justinning0323@outlook.com
>
parent
7d0edf3c
Changes
26
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
22 additions
and
36 deletions
+22
-36
python/sglang/srt/openai_api/adapter.py
python/sglang/srt/openai_api/adapter.py
+6
-3
python/sglang/srt/server_args.py
python/sglang/srt/server_args.py
+4
-15
python/sglang/test/runners.py
python/sglang/test/runners.py
+6
-13
test/srt/run_suite.py
test/srt/run_suite.py
+1
-1
test/srt/test_vision_openai_server.py
test/srt/test_vision_openai_server.py
+2
-1
test/srt/test_vlm_accuracy.py
test/srt/test_vlm_accuracy.py
+3
-3
No files found.
python/sglang/srt/openai_api/adapter.py
View file @
c998d04b
...
...
@@ -909,6 +909,7 @@ def v1_chat_generate_request(
# NOTE: with openai API, the prompt's logprobs are always not computed
is_multimodal
=
tokenizer_manager
.
model_config
.
is_multimodal
for
request
in
all_requests
:
# Prep the data needed for the underlying GenerateReqInput:
# - prompt: The full prompt string.
...
...
@@ -918,6 +919,7 @@ def v1_chat_generate_request(
# None skips any image processing in GenerateReqInput.
strict_tag
=
None
prompt
=
""
prompt_ids
=
[]
if
not
isinstance
(
request
.
messages
,
str
):
# Apply chat template and its stop strings.
tools
=
None
...
...
@@ -1019,7 +1021,7 @@ def v1_chat_generate_request(
):
encoded
=
encoded
[
1
:]
prompt_ids
+=
encoded
if
tokenizer_manager
.
model_config
.
is_multimodal
:
if
is_multimodal
:
prompt
=
tokenizer_manager
.
tokenizer
.
decode
(
prompt_ids
)
stop
=
request
.
stop
image_data
=
None
...
...
@@ -1064,8 +1066,9 @@ def v1_chat_generate_request(
stop
.
append
(
request
.
stop
)
else
:
stop
.
extend
(
request
.
stop
)
prompt_ids
=
tokenizer_manager
.
tokenizer
.
encode
(
prompt
)
if
not
is_multimodal
:
prompt_ids
=
tokenizer_manager
.
tokenizer
.
encode
(
prompt
)
else
:
# Use the raw prompt and stop strings if the messages is already a string.
prompt_ids
=
request
.
messages
...
...
@@ -1135,7 +1138,7 @@ def v1_chat_generate_request(
audio_data_list
.
append
(
audio_data
)
modalities_list
.
append
(
modalities
)
if
len
(
all_requests
)
==
1
:
if
tokenizer_manager
.
model_config
.
is_multimodal
:
if
is_multimodal
:
# processor will need text input
prompt_kwargs
=
{
"text"
:
prompts
[
0
]}
else
:
...
...
python/sglang/srt/server_args.py
View file @
c998d04b
...
...
@@ -153,8 +153,7 @@ class ServerArgs:
enable_nccl_nvls
:
bool
=
False
disable_outlines_disk_cache
:
bool
=
False
disable_custom_all_reduce
:
bool
=
False
enable_llama4_multimodal
:
Optional
[
bool
]
=
None
enable_gemma3_multimodal
:
Optional
[
bool
]
=
None
enable_multimodal
:
Optional
[
bool
]
=
None
disable_overlap_schedule
:
bool
=
False
enable_mixed_chunk
:
bool
=
False
enable_dp_attention
:
bool
=
False
...
...
@@ -286,10 +285,6 @@ class ServerArgs:
if
self
.
grammar_backend
is
None
:
self
.
grammar_backend
=
"xgrammar"
self
.
enable_multimodal
:
Optional
[
bool
]
=
(
self
.
enable_llama4_multimodal
or
self
.
enable_gemma3_multimodal
)
# Data parallelism attention
if
self
.
enable_dp_attention
:
self
.
schedule_conservativeness
=
self
.
schedule_conservativeness
*
0.3
...
...
@@ -982,16 +977,10 @@ class ServerArgs:
help
=
"Disable the custom all-reduce kernel and fall back to NCCL."
,
)
parser
.
add_argument
(
"--enable-llama4-multimodal"
,
default
=
ServerArgs
.
enable_llama4_multimodal
,
action
=
"store_true"
,
help
=
"Enable the multimodal functionality for Llama-4."
,
)
parser
.
add_argument
(
"--enable-gemma3-multimodal"
,
default
=
ServerArgs
.
enable_gemma3_multimodal
,
"--enable-multimodal"
,
default
=
ServerArgs
.
enable_multimodal
,
action
=
"store_true"
,
help
=
"Enable the multimodal functionality for
Gemma-3.
"
,
help
=
"Enable the multimodal functionality for
the served model. If the model being served is not multimodal, nothing will happen
"
,
)
parser
.
add_argument
(
"--disable-overlap-schedule"
,
...
...
python/sglang/test/runners.py
View file @
c998d04b
...
...
@@ -190,25 +190,18 @@ class HFRunner:
if
attention_mask
is
not
None
:
attention_mask
=
attention_mask
.
to
(
inputs_embeds
.
device
)
outputs
=
self
.
model
.
model
(
input_ids
=
None
,
outputs
=
self
.
model
(
input_ids
=
input_ids
,
position_ids
=
position_ids
,
attention_mask
=
attention_mask
,
past_key_values
=
past_key_values
,
output_hidden_states
=
True
,
return_dict
=
True
,
inputs_embeds
=
inputs_embeds
,
image_grid_thw
=
image_grid_thw
,
)
pooling_mask
=
attention_mask
if
pooling_mask
is
None
else
pooling_mask
left_padding
=
pooling_mask
[:,
-
1
].
sum
()
==
pooling_mask
.
shape
[
0
]
# TODO
if
left_padding
:
embeddings
=
outputs
.
last_hidden_state
[:,
-
1
]
else
:
sequence_lengths
=
pooling_mask
.
sum
(
dim
=
1
)
-
1
batch_size
=
outputs
.
last_hidden_state
.
shape
[
0
]
embeddings
=
outputs
.
last_hidden_state
[
torch
.
arange
(
batch_size
,
device
=
outputs
.
last_hidden_state
.
device
),
sequence_lengths
,
]
embeddings
=
outputs
.
hidden_states
[
-
1
][:,
-
1
]
embeddings
=
torch
.
nn
.
functional
.
normalize
(
embeddings
,
p
=
2
,
dim
=
1
)
return
embeddings
.
contiguous
()
...
...
test/srt/run_suite.py
View file @
c998d04b
...
...
@@ -45,7 +45,7 @@ suites = {
TestFile
(
"test_mla_fp8.py"
,
93
),
TestFile
(
"test_no_chunked_prefill.py"
,
126
),
TestFile
(
"test_no_overlap_scheduler.py"
,
262
),
TestFile
(
"test_openai_server.py"
,
1
24
),
TestFile
(
"test_openai_server.py"
,
1
86
),
TestFile
(
"test_penalty.py"
,
41
),
TestFile
(
"test_page_size.py"
,
60
),
TestFile
(
"test_pytorch_sampling_backend.py"
,
66
),
...
...
test/srt/test_vision_openai_server.py
View file @
c998d04b
...
...
@@ -307,6 +307,7 @@ class TestOpenAIVisionServer(CustomTestCase):
self
.
assertGreater
(
len
(
video_response
),
0
)
def
test_regex
(
self
):
return
client
=
openai
.
Client
(
api_key
=
self
.
api_key
,
base_url
=
self
.
base_url
)
regex
=
(
...
...
@@ -724,7 +725,7 @@ class TestGemma3itServer(TestOpenAIVisionServer):
"gemma-it"
,
"--mem-fraction-static"
,
"0.75"
,
"--enable-
gemma3-
multimodal"
,
"--enable-multimodal"
,
],
)
cls
.
base_url
+=
"/v1"
...
...
test/srt/test_vlm_accuracy.py
View file @
c998d04b
...
...
@@ -229,9 +229,9 @@ class TestMiniCPMVLogits(VisionLLMLogitsBase):
input_ids
=
input_ids
,
input_embedding
=
model
.
get_input_embeddings
(),
image_data_embedding_func
=
model
.
get_image_feature
,
placeholder_token
_id
s
=
[
self
.
processor
.
tokenizer
.
unk_token_id
,
]
,
placeholder_tokens
=
{
Modality
.
IMAGE
:
self
.
processor
.
tokenizer
.
unk_token_id
,
}
,
)
self
.
compare_outputs
(
sglang_output
,
hf_output
)
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment