sglang / Commits
"...text-generation-inference.git" did not exist on "84bc3d7b7d65586f7f249b0e9065588b93e7cab3"
Commit 13432002 (unverified), authored Apr 21, 2025 by Lianmin Zheng; committed via GitHub on Apr 21, 2025.

Clean up mem settings (#5610)

Parent: c2942907
Showing 3 changed files with 6 additions and 12 deletions.
python/sglang/srt/model_executor/cuda_graph_runner.py   +3 -9
python/sglang/srt/server_args.py                        +2 -2
python/sglang/srt/utils.py                              +1 -1
python/sglang/srt/model_executor/cuda_graph_runner.py
```diff
@@ -37,7 +37,7 @@ from sglang.srt.model_executor.forward_batch_info import (
 from sglang.srt.patch_torch import monkey_patch_torch_compile
 from sglang.srt.utils import (
     get_available_gpu_memory,
-    get_whatever_gpu_memory_capacity,
+    get_device_memory_capacity,
     is_hip,
 )
```
```diff
@@ -133,14 +133,10 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner):
             list(range(1, 9)) + list(range(10, 33, 2)) + list(range(40, 161, 16))
         )
 
-        if _is_hip:
+        gpu_mem = get_device_memory_capacity()
+        if gpu_mem is not None and gpu_mem > 81920:
             capture_bs += list(range(160, 257, 8))
-
-        gpu_mem = get_whatever_gpu_memory_capacity() / 1024
-        if gpu_mem is not None and gpu_mem > 120:
-            capture_bs += list(range(160, 256, 8))
 
     if max(capture_bs) > model_runner.req_to_token_pool.size:
         # In some case (e.g., with a small GPU or --max-running-requests), the #max-running-requests
         # is very small. We add more values here to make sure we capture the maximum bs.
```
```diff
@@ -152,10 +148,8 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner):
     assert len(capture_bs) > 0 and capture_bs[0] > 0
     capture_bs = [bs for bs in capture_bs if bs <= model_runner.req_to_token_pool.size]
     if server_args.cuda_graph_max_bs:
         capture_bs = [bs for bs in capture_bs if bs <= server_args.cuda_graph_max_bs]
     compile_bs = (
         [bs for bs in capture_bs if bs <= server_args.torch_compile_max_bs]
         if server_args.enable_torch_compile
 ...
```
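Read together, the hunks above collapse the old two-step logic (a HIP-specific extension plus a capacity check that divided MB by 1024 and compared against 120) into a single capacity check in MB. A minimal, self-contained sketch of the post-commit behavior, with get_device_memory_capacity() stubbed out (the MB unit is an inference from the 81920 = 80 * 1024 threshold in the diff, not a documented fact):

```python
# Sketch of the post-commit batch-size selection. The stub below stands in
# for sglang's get_device_memory_capacity().

def get_device_memory_capacity():
    return 98304  # stub: pretend we run on a 96 GB device, reported in MB


def batch_sizes_to_capture():
    # Dense coverage at small batch sizes, coarser steps as batches grow.
    capture_bs = (
        list(range(1, 9)) + list(range(10, 33, 2)) + list(range(40, 161, 16))
    )
    # On devices with more than 80 GiB (81920 MB), extend coverage to 256.
    gpu_mem = get_device_memory_capacity()
    if gpu_mem is not None and gpu_mem > 81920:
        capture_bs += list(range(160, 257, 8))
    return capture_bs


print(batch_sizes_to_capture()[-5:])  # [224, 232, 240, 248, 256]
```

One side effect worth noting: the old large-memory branch used range(160, 256, 8) and so stopped at batch size 248, while the new range(160, 257, 8) includes 256 itself.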
python/sglang/srt/server_args.py
```diff
@@ -27,7 +27,7 @@ from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
     configure_ipv6,
     get_device,
-    get_whatever_gpu_memory_capacity,
+    get_device_memory_capacity,
     is_flashinfer_available,
     is_hip,
     is_port_available,
 ...
```
```diff
@@ -218,7 +218,7 @@ class ServerArgs:
         if self.random_seed is None:
             self.random_seed = random.randint(0, 1 << 30)
 
-        gpu_mem = get_whatever_gpu_memory_capacity(self.device)
+        gpu_mem = get_device_memory_capacity(self.device)
 
         # Set mem fraction static, which depends on the tensor parallelism size
         if self.mem_fraction_static is None:
```
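This hunk is just the rename at the call site, but it shows where the capacity value feeds in: gpu_mem is computed right before the default mem_fraction_static is chosen. The derivation itself is outside the hunk, so the sketch below is hypothetical; the Args class, the 0.88/0.85 values, and the threshold are illustrative stand-ins, not sglang's actual formula.

```python
# Hypothetical illustration of a post-init that picks a default static
# memory fraction from device capacity. Everything below except the call
# shape of get_device_memory_capacity(self.device) is invented for the demo.
import random
from dataclasses import dataclass
from typing import Optional


def get_device_memory_capacity(device: Optional[str] = None) -> Optional[int]:
    return 81920  # stub: capacity in MB, or None if undetectable


@dataclass
class Args:
    device: str = "cuda"
    random_seed: Optional[int] = None
    mem_fraction_static: Optional[float] = None

    def __post_init__(self):
        if self.random_seed is None:
            self.random_seed = random.randint(0, 1 << 30)

        gpu_mem = get_device_memory_capacity(self.device)

        # Illustrative default only; sglang's real rule depends on the
        # tensor parallelism size (see the comment in the hunk above).
        if self.mem_fraction_static is None:
            self.mem_fraction_static = (
                0.88 if gpu_mem is not None and gpu_mem > 81920 else 0.85
            )


print(Args().mem_fraction_static)  # 0.85 with the 81920 MB stub above
```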
python/sglang/srt/utils.py
```diff
@@ -1170,7 +1170,7 @@ def get_hpu_memory_capacity():
 )
 
 
-def get_whatever_gpu_memory_capacity(device: str = None):
+def get_device_memory_capacity(device: str = None):
     if is_cuda():
         gpu_mem = get_nvgpu_memory_capacity()
     elif is_hip():
```
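The renamed helper is a per-platform dispatcher: the visible context routes CUDA through get_nvgpu_memory_capacity(), with an AMD branch behind is_hip() that the hunk truncates. A sketch of that shape, with the platform probes stubbed and the truncated branch left unfilled (the stubs and the None fallback are assumptions, not sglang's code):

```python
# Shape of get_device_memory_capacity() as reconstructed from the visible
# context. is_cuda/is_hip/get_nvgpu_memory_capacity are stubs for the demo.
def is_cuda() -> bool:
    return True  # stub


def is_hip() -> bool:
    return False  # stub


def get_nvgpu_memory_capacity() -> int:
    return 81920  # stub: capacity in MB (80 GiB)


def get_device_memory_capacity(device: str = None):
    if is_cuda():
        gpu_mem = get_nvgpu_memory_capacity()
    elif is_hip():
        raise NotImplementedError  # AMD branch is truncated in the diff above
    else:
        gpu_mem = None  # assumption: unknown platforms report no capacity
    return gpu_mem


print(get_device_memory_capacity())  # 81920 with the stubs above
```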