Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
3964b352
Unverified
Commit
3964b352
authored
Jul 19, 2025
by
Mick
Committed by
GitHub
Jul 18, 2025
Browse files
chore: tune mem fraction static for vlm (#6881)
parent
9c7a4618
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
57 additions
and
13 deletions
+57
-13
python/sglang/srt/model_executor/model_runner.py
python/sglang/srt/model_executor/model_runner.py
+2
-2
python/sglang/srt/server_args.py
python/sglang/srt/server_args.py
+46
-2
test/srt/test_vision_openai_server_a.py
test/srt/test_vision_openai_server_a.py
+5
-5
test/srt/test_vision_openai_server_b.py
test/srt/test_vision_openai_server_b.py
+4
-4
No files found.
python/sglang/srt/model_executor/model_runner.py
View file @
3964b352
...
...
@@ -411,7 +411,7 @@ class ModelRunner:
else
:
server_args
.
attention_backend
=
"triton"
logger
.
info
(
f
"Attention backend not
set
. Use
{
server_args
.
attention_backend
}
backend by default."
f
"Attention backend not
explicitly specified
. Use
{
server_args
.
attention_backend
}
backend by default."
)
elif
self
.
use_mla_backend
:
if
server_args
.
device
!=
"cpu"
:
...
...
@@ -463,7 +463,7 @@ class ModelRunner:
if
not
self
.
is_multimodal_chunked_prefill_supported
:
server_args
.
chunked_prefill_size
=
-
1
logger
.
info
(
f
"Automatically turn of --chunked-prefill-size as it is not supported for "
f
"Automatically turn of
f
--chunked-prefill-size as it is not supported for "
f
"
{
self
.
model_config
.
hf_config
.
model_type
}
"
)
...
...
python/sglang/srt/server_args.py
View file @
3964b352
...
...
@@ -337,8 +337,52 @@ class ServerArgs:
# Multimodal models need more memory for the image processor
model_config
=
ModelConfig
.
from_server_args
(
self
)
if
model_config
.
is_multimodal
:
self
.
mem_fraction_static
*=
0.90
vision_config
=
getattr
(
model_config
.
hf_config
,
"vision_config"
,
None
)
if
model_config
.
is_multimodal
and
vision_config
:
# roughly reduce the mem_fraction_static based on the params of the ViT
original_server_arg_mem_fraction
=
self
.
mem_fraction_static
# a base mem_fraction_static reduction factor for a regular ViT
base_mem_fraction_reduction_ratio
=
0.95
vit_num_layers
=
getattr
(
vision_config
,
"num_hidden_layers"
,
24
)
vit_hidden_size
=
getattr
(
vision_config
,
"hidden_size"
,
1024
)
# baseline ViT params (ViT-L/14)
baseline_vit_layers
=
24
baseline_vit_hidden_size
=
1024
# rough proxy for the weight params count: num_layers * hidden_size^2
current_complexity_score
=
vit_num_layers
*
(
vit_hidden_size
**
2
)
baseline_complexity_score
=
baseline_vit_layers
*
(
baseline_vit_hidden_size
**
2
)
complexity_ratio
=
(
current_complexity_score
/
baseline_complexity_score
if
baseline_complexity_score
>
0
else
1.0
)
# for every 100% the complexity grows over the baseline, lower the adjustment factor by 0.1 (10%)
sensitivity_scale
=
0.1
dynamic_adjustment_factor
=
1.0
-
sensitivity_scale
*
(
complexity_ratio
-
1.0
)
dynamic_adjustment_factor
=
max
(
0.8
,
min
(
1.05
,
dynamic_adjustment_factor
)
)
final_overall_factor
=
(
base_mem_fraction_reduction_ratio
*
dynamic_adjustment_factor
)
self
.
mem_fraction_static
=
(
original_server_arg_mem_fraction
*
final_overall_factor
)
logger
.
warning
(
f
"Multimodal model: Dynamically adjusted --mem-fraction-static "
f
"from:
{
original_server_arg_mem_fraction
:.
3
f
}
to:
{
self
.
mem_fraction_static
:.
3
f
}
."
)
# Set chunked prefill size, which depends on the gpu memory capacity
if
self
.
chunked_prefill_size
is
None
:
...
...
test/srt/test_vision_openai_server_a.py
View file @
3964b352
...
...
@@ -30,7 +30,7 @@ class TestQwen2VLServer(TestOpenAIVisionServer):
api_key
=
cls
.
api_key
,
other_args
=
[
"--mem-fraction-static"
,
"0.
4
"
,
"0.
35
"
,
],
)
cls
.
base_url
+=
"/v1"
...
...
@@ -52,7 +52,7 @@ class TestQwen2_5_VLServer(TestOpenAIVisionServer):
api_key
=
cls
.
api_key
,
other_args
=
[
"--mem-fraction-static"
,
"0.
4
"
,
"0.
35
"
,
],
)
cls
.
base_url
+=
"/v1"
...
...
@@ -75,7 +75,7 @@ class TestVLMContextLengthIssue(CustomTestCase):
other_args
=
[
"--context-length"
,
"300"
,
"--mem-fraction-static=0.
80
"
,
"--mem-fraction-static=0.
75
"
,
],
)
cls
.
base_url
+=
"/v1"
...
...
@@ -147,7 +147,7 @@ class TestMinicpmvServer(TestOpenAIVisionServer):
other_args
=
[
"--trust-remote-code"
,
"--mem-fraction-static"
,
"0.
4
"
,
"0.
35
"
,
],
)
cls
.
base_url
+=
"/v1"
...
...
@@ -181,7 +181,7 @@ class TestMinicpmoServer(TestOpenAIVisionServer):
other_args
=
[
"--trust-remote-code"
,
"--mem-fraction-static"
,
"0.
7
"
,
"0.
65
"
,
],
)
cls
.
base_url
+=
"/v1"
...
...
test/srt/test_vision_openai_server_b.py
View file @
3964b352
...
...
@@ -22,7 +22,7 @@ class TestPixtralServer(TestOpenAIVisionServer):
other_args
=
[
"--trust-remote-code"
,
"--mem-fraction-static"
,
"0.7
3
"
,
"0.7
0
"
,
],
)
cls
.
base_url
+=
"/v1"
...
...
@@ -44,7 +44,7 @@ class TestMistral3_1Server(TestOpenAIVisionServer):
other_args
=
[
"--trust-remote-code"
,
"--mem-fraction-static"
,
"0.
8
"
,
"0.
75
"
,
],
)
cls
.
base_url
+=
"/v1"
...
...
@@ -88,7 +88,7 @@ class TestJanusProServer(TestOpenAIVisionServer):
other_args
=
[
"--trust-remote-code"
,
"--mem-fraction-static"
,
"0.
4
"
,
"0.
35
"
,
],
)
cls
.
base_url
+=
"/v1"
...
...
@@ -197,7 +197,7 @@ class TestPhi4MMServer(TestOpenAIVisionServer):
other_args
=
[
"--trust-remote-code"
,
"--mem-fraction-static"
,
"0.7
5
"
,
"0.7
0
"
,
"--disable-radix-cache"
,
"--max-loras-per-batch"
,
"1"
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment