Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
f2bd3515
"examples/vscode:/vscode.git/clone" did not exist on "62c01d267a74f1bddfcdad33eabdf316a50fb613"
Unverified
Commit
f2bd3515
authored
May 29, 2025
by
Baizhou Zhang
Committed by
GitHub
May 29, 2025
Browse files
Tune memory arguments on B200 (#6718)
parent
c459536b
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
8 additions
and
2 deletions
+8
-2
python/sglang/srt/model_executor/cuda_graph_runner.py
python/sglang/srt/model_executor/cuda_graph_runner.py
+2
-0
python/sglang/srt/server_args.py
python/sglang/srt/server_args.py
+6
-2
No files found.
python/sglang/srt/model_executor/cuda_graph_runner.py
View file @
f2bd3515
...
...
@@ -149,6 +149,8 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner):
gpu_mem
=
get_device_memory_capacity
()
if
gpu_mem
is
not
None
and
gpu_mem
>
96
*
1024
:
capture_bs
+=
list
(
range
(
160
,
257
,
8
))
if
gpu_mem
is
not
None
and
gpu_mem
>
180
*
1000
:
capture_bs
+=
list
(
range
(
256
,
528
,
16
))
if
max
(
capture_bs
)
>
model_runner
.
req_to_token_pool
.
size
:
# In some cases (e.g., with a small GPU or --max-running-requests), the #max-running-requests
...
...
python/sglang/srt/server_args.py
View file @
f2bd3515
...
...
@@ -260,7 +260,9 @@ class ServerArgs:
self
.
mem_fraction_static
=
0.88
else
:
self
.
mem_fraction_static
=
0.88
if
gpu_mem
is
not
None
and
gpu_mem
>
96
*
1024
:
if
gpu_mem
is
not
None
and
gpu_mem
>
180
*
1000
:
self
.
mem_fraction_static
=
0.79
elif
gpu_mem
is
not
None
and
gpu_mem
>
96
*
1024
:
mem_fraction
=
self
.
mem_fraction_static
# 15 GB + additional 3 GB for CUDA graph
reserve_mem
=
1024
*
18
...
...
@@ -277,7 +279,9 @@ class ServerArgs:
# Set chunked prefill size, which depends on the gpu memory capacity
if
self
.
chunked_prefill_size
is
None
:
if
gpu_mem
is
not
None
and
gpu_mem
<
25_000
:
if
gpu_mem
is
not
None
and
gpu_mem
>
180_000
:
self
.
chunked_prefill_size
=
16384
elif
gpu_mem
is
not
None
and
gpu_mem
<
25_000
:
self
.
chunked_prefill_size
=
2048
elif
self
.
disaggregation_mode
!=
"null"
:
self
.
chunked_prefill_size
=
16384
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment