Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
e4b6133b
Unverified
Commit
e4b6133b
authored
Apr 30, 2025
by
JieXin Liang
Committed by
GitHub
Apr 29, 2025
Browse files
[fix] relax mem_fraction_static for h200 (#5893)
Co-authored-by:
alcanerian
<
alcanerian@gmail.com
>
parent
dd408ee4
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
17 additions
and
14 deletions
+17
-14
python/sglang/srt/model_executor/cuda_graph_runner.py
python/sglang/srt/model_executor/cuda_graph_runner.py
+1
-1
python/sglang/srt/server_args.py
python/sglang/srt/server_args.py
+16
-13
No files found.
python/sglang/srt/model_executor/cuda_graph_runner.py
View file @
e4b6133b
...
...
@@ -135,7 +135,7 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner):
gpu_mem
=
get_device_memory_capacity
()
# Batch size of each rank will not become so large when DP is on
if
gpu_mem
is
not
None
and
gpu_mem
>
81920
and
server_args
.
dp_size
==
1
:
if
gpu_mem
is
not
None
and
gpu_mem
>
96
*
1024
:
capture_bs
+=
list
(
range
(
160
,
257
,
8
))
if
max
(
capture_bs
)
>
model_runner
.
req_to_token_pool
.
size
:
...
...
python/sglang/srt/server_args.py
View file @
e4b6133b
...
...
@@ -222,20 +222,23 @@ class ServerArgs:
# Set mem fraction static, which depends on the tensor parallelism size
if
self
.
mem_fraction_static
is
None
:
if
gpu_mem
<=
81920
:
if
self
.
tp_size
>=
16
:
self
.
mem_fraction_static
=
0.79
elif
self
.
tp_size
>=
8
:
self
.
mem_fraction_static
=
0.81
elif
self
.
tp_size
>=
4
:
self
.
mem_fraction_static
=
0.85
elif
self
.
tp_size
>=
2
:
self
.
mem_fraction_static
=
0.87
else
:
self
.
mem_fraction_static
=
0.88
if
self
.
tp_size
>=
16
:
self
.
mem_fraction_static
=
0.79
elif
self
.
tp_size
>=
8
:
self
.
mem_fraction_static
=
0.81
elif
self
.
tp_size
>=
4
:
self
.
mem_fraction_static
=
0.85
elif
self
.
tp_size
>=
2
:
self
.
mem_fraction_static
=
0.87
else
:
# FIXME: more fine-grained auto-selection policies
self
.
mem_fraction_static
=
(
gpu_mem
-
1024
*
13
)
/
gpu_mem
self
.
mem_fraction_static
=
0.88
if
gpu_mem
>
96
*
1024
:
mem_fraction
=
self
.
mem_fraction_static
self
.
mem_fraction_static
=
min
(
mem_fraction
+
48
*
1024
*
(
1
-
mem_fraction
)
/
gpu_mem
,
(
gpu_mem
-
1024
*
18
)
/
gpu_mem
,
# 15 GB + additional 3GB for cuda graph
)
# Set chunked prefill size, which depends on the gpu memory capacity
if
self
.
chunked_prefill_size
is
None
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment