Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
13432002
Unverified
Commit
13432002
authored
Apr 21, 2025
by
Lianmin Zheng
Committed by
GitHub
Apr 21, 2025
Browse files
Clean up mem settings (#5610)
parent
c2942907
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
6 additions
and
12 deletions
+6
-12
python/sglang/srt/model_executor/cuda_graph_runner.py
python/sglang/srt/model_executor/cuda_graph_runner.py
+3
-9
python/sglang/srt/server_args.py
python/sglang/srt/server_args.py
+2
-2
python/sglang/srt/utils.py
python/sglang/srt/utils.py
+1
-1
No files found.
python/sglang/srt/model_executor/cuda_graph_runner.py
View file @
13432002
...
@@ -37,7 +37,7 @@ from sglang.srt.model_executor.forward_batch_info import (
...
@@ -37,7 +37,7 @@ from sglang.srt.model_executor.forward_batch_info import (
from
sglang.srt.patch_torch
import
monkey_patch_torch_compile
from
sglang.srt.patch_torch
import
monkey_patch_torch_compile
from
sglang.srt.utils
import
(
from
sglang.srt.utils
import
(
get_available_gpu_memory
,
get_available_gpu_memory
,
get_whatever_gpu_memory_capacity,
get_device_memory_capacity,
is_hip
,
is_hip
,
)
)
...
@@ -133,14 +133,10 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner):
...
@@ -133,14 +133,10 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner):
list
(
range
(
1
,
9
))
+
list
(
range
(
10
,
33
,
2
))
+
list
(
range
(
40
,
161
,
16
))
list
(
range
(
1
,
9
))
+
list
(
range
(
10
,
33
,
2
))
+
list
(
range
(
40
,
161
,
16
))
)
)
if
_is_hip
:
gpu_mem
=
get_device_memory_capacity
()
if
gpu_mem
is
not
None
and
gpu_mem
>
81920
:
capture_bs
+=
list
(
range
(
160
,
257
,
8
))
capture_bs
+=
list
(
range
(
160
,
257
,
8
))
gpu_mem
=
get_whatever_gpu_memory_capacity
()
/
1024
if
gpu_mem
is
not
None
and
gpu_mem
>
120
:
capture_bs
+=
list
(
range
(
160
,
256
,
8
))
if
max
(
capture_bs
)
>
model_runner
.
req_to_token_pool
.
size
:
if
max
(
capture_bs
)
>
model_runner
.
req_to_token_pool
.
size
:
# In some case (e.g., with a small GPU or --max-running-requests), the #max-running-requests
# In some case (e.g., with a small GPU or --max-running-requests), the #max-running-requests
# is very small. We add more values here to make sure we capture the maximum bs.
# is very small. We add more values here to make sure we capture the maximum bs.
...
@@ -152,10 +148,8 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner):
...
@@ -152,10 +148,8 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner):
assert
len
(
capture_bs
)
>
0
and
capture_bs
[
0
]
>
0
assert
len
(
capture_bs
)
>
0
and
capture_bs
[
0
]
>
0
capture_bs
=
[
bs
for
bs
in
capture_bs
if
bs
<=
model_runner
.
req_to_token_pool
.
size
]
capture_bs
=
[
bs
for
bs
in
capture_bs
if
bs
<=
model_runner
.
req_to_token_pool
.
size
]
if
server_args
.
cuda_graph_max_bs
:
if
server_args
.
cuda_graph_max_bs
:
capture_bs
=
[
bs
for
bs
in
capture_bs
if
bs
<=
server_args
.
cuda_graph_max_bs
]
capture_bs
=
[
bs
for
bs
in
capture_bs
if
bs
<=
server_args
.
cuda_graph_max_bs
]
compile_bs
=
(
compile_bs
=
(
[
bs
for
bs
in
capture_bs
if
bs
<=
server_args
.
torch_compile_max_bs
]
[
bs
for
bs
in
capture_bs
if
bs
<=
server_args
.
torch_compile_max_bs
]
if
server_args
.
enable_torch_compile
if
server_args
.
enable_torch_compile
...
...
python/sglang/srt/server_args.py
View file @
13432002
...
@@ -27,7 +27,7 @@ from sglang.srt.reasoning_parser import ReasoningParser
...
@@ -27,7 +27,7 @@ from sglang.srt.reasoning_parser import ReasoningParser
from
sglang.srt.utils
import
(
from
sglang.srt.utils
import
(
configure_ipv6
,
configure_ipv6
,
get_device
,
get_device
,
get_whatever_gpu_memory_capacity,
get_device_memory_capacity,
is_flashinfer_available
,
is_flashinfer_available
,
is_hip
,
is_hip
,
is_port_available
,
is_port_available
,
...
@@ -218,7 +218,7 @@ class ServerArgs:
...
@@ -218,7 +218,7 @@ class ServerArgs:
if
self
.
random_seed
is
None
:
if
self
.
random_seed
is
None
:
self
.
random_seed
=
random
.
randint
(
0
,
1
<<
30
)
self
.
random_seed
=
random
.
randint
(
0
,
1
<<
30
)
gpu_mem = get_whatever_gpu_memory_capacity(self.device)
gpu_mem = get_device_memory_capacity(self.device)
# Set mem fraction static, which depends on the tensor parallelism size
# Set mem fraction static, which depends on the tensor parallelism size
if
self
.
mem_fraction_static
is
None
:
if
self
.
mem_fraction_static
is
None
:
...
...
python/sglang/srt/utils.py
View file @
13432002
...
@@ -1170,7 +1170,7 @@ def get_hpu_memory_capacity():
...
@@ -1170,7 +1170,7 @@ def get_hpu_memory_capacity():
)
)
def get_whatever_gpu_memory_capacity(device: str = None):
def get_device_memory_capacity(device: str = None):
if
is_cuda
():
if
is_cuda
():
gpu_mem
=
get_nvgpu_memory_capacity
()
gpu_mem
=
get_nvgpu_memory_capacity
()
elif
is_hip
():
elif
is_hip
():
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment