sglang · Commit 94e167ea (Unverified)
Authored Nov 29, 2024 by Lianmin Zheng; committed by GitHub on Nov 29, 2024
Fix the default chunked prefill size (#2268)
Parent: 262e370f
Showing 3 changed files with 29 additions and 20 deletions (+29, -20).
python/sglang/srt/managers/scheduler.py (+2, -0)
python/sglang/srt/model_executor/model_runner.py (+6, -3)
python/sglang/srt/server_args.py (+21, -17)
python/sglang/srt/managers/scheduler.py

```diff
@@ -253,6 +253,8 @@ class Scheduler:
         # Init chunked prefill
         self.chunked_prefill_size = server_args.chunked_prefill_size
+        if self.chunked_prefill_size <= 0:  # -1 means disable
+            self.chunked_prefill_size = None
         self.being_chunked_req = None
         self.is_mixed_chunk = (
             self.chunked_prefill_size is not None and server_args.enable_mixed_chunk
```
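The scheduler now treats any non-positive `chunked_prefill_size` as "disabled" and normalizes it to `None` before computing the mixed-chunk flag. Below is a minimal standalone sketch of that normalization; `ServerArgsStub` is a hypothetical stand-in, not sglang's real `ServerArgs`.

```python
# Minimal sketch of the normalization added above; ServerArgsStub is a
# hypothetical stand-in, not sglang's actual ServerArgs dataclass.
from dataclasses import dataclass
from typing import Optional


@dataclass
class ServerArgsStub:
    chunked_prefill_size: int = -1  # any value <= 0 means "disable"
    enable_mixed_chunk: bool = True


def resolve_chunked_prefill(args: ServerArgsStub) -> Optional[int]:
    """Map the non-positive sentinel to None, mirroring the scheduler's check."""
    return None if args.chunked_prefill_size <= 0 else args.chunked_prefill_size


args = ServerArgsStub(chunked_prefill_size=-1)
size = resolve_chunked_prefill(args)
is_mixed_chunk = size is not None and args.enable_mixed_chunk
print(size, is_mixed_chunk)  # None False
```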
python/sglang/srt/model_executor/model_runner.py

```diff
@@ -118,7 +118,7 @@ class ModelRunner:
             logger.info(
                 "Automatically turn off --chunked-prefill-size and adjust --mem-fraction-static for multimodal models."
             )
-            server_args.chunked_prefill_size = None
+            server_args.chunked_prefill_size = -1
             self.mem_fraction_static *= 0.95
             # TODO: qwen2-vl does not support radix cache now, set disable_radix_cache=True automatically
             if self.model_config.hf_config.architectures == [
@@ -148,12 +148,14 @@ class ModelRunner:
         set_cpu_offload_max_bytes(int(server_args.cpu_offload_gb * 1024**3))

-        # Init components
+        # Get memory before model loading
         min_per_gpu_memory = self.init_torch_distributed()
+
+        # Load the model
         self.sampler = Sampler()
         self.load_model()

-        # Apply torch TP if model supports it
+        # Apply torch TP if the model supports it
         supports_torch_tp = getattr(self.model, "supports_torch_tp", False)
         if self.tp_size > 1 and supports_torch_tp:
             self.apply_torch_tp()
@@ -161,6 +163,7 @@ class ModelRunner:
         else:
             self.torch_tp_applied = False

+        # Init memory pool and attention backends
         if server_args.lora_paths is not None:
             self.init_lora_manager()
         self.init_memory_pool(
```
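Note the flip from `None` to `-1` when chunked prefill is turned off for multimodal models. After this commit `None` means "unset" in `ServerArgs`, while a non-positive integer is the explicit "disabled" sentinel that the scheduler check above converts back to `None`. One plausible reason for using an integer here is that the scheduler's `<= 0` comparison would raise a `TypeError` if it ever received `None`; a small illustrative sketch of that distinction:

```python
# Sketch of the sentinel convention after this commit (illustrative only):
# None -> "unset, let the defaults be resolved later"
# -1   -> "explicitly disabled"; the scheduler maps it to None itself.
from typing import Optional


def scheduler_normalize(chunked_prefill_size: int) -> Optional[int]:
    # Mirrors the scheduler's new check; it expects an int, never None.
    return None if chunked_prefill_size <= 0 else chunked_prefill_size


print(scheduler_normalize(-1))    # None -> chunked prefill disabled
print(scheduler_normalize(8192))  # 8192

try:
    None <= 0  # what would happen if the runner passed None through
except TypeError as exc:
    print("TypeError:", exc)  # '<=' not supported between 'NoneType' and 'int'
```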
python/sglang/srt/server_args.py

```diff
@@ -58,7 +58,7 @@ class ServerArgs:
     mem_fraction_static: Optional[float] = None
     max_running_requests: Optional[int] = None
     max_total_tokens: Optional[int] = None
-    chunked_prefill_size: int = 8192
+    chunked_prefill_size: Optional[int] = None
     max_prefill_tokens: int = 16384
     schedule_policy: str = "lpm"
     schedule_conservativeness: float = 1.0
@@ -128,7 +128,7 @@ class ServerArgs:
     enable_dp_attention: bool = False
     enable_torch_compile: bool = False
     torch_compile_max_bs: int = 32
-    cuda_graph_max_bs: int = 160
+    cuda_graph_max_bs: Optional[int] = None
     torchao_config: str = ""
     enable_nan_detection: bool = False
     enable_p2p_check: bool = False
@@ -144,14 +144,15 @@ class ServerArgs:
         if self.served_model_name is None:
             self.served_model_name = self.model_path

+        if self.chunked_prefill_size is not None and self.chunked_prefill_size <= 0:
+            # Disable chunked prefill
+            self.chunked_prefill_size = None
+
         if self.random_seed is None:
             self.random_seed = random.randint(0, 1 << 30)

-        # Mem fraction depends on the tensor parallelism size
+        if is_hip():
+            gpu_mem = get_amdgpu_memory_capacity()
+        else:
+            gpu_mem = get_nvgpu_memory_capacity()
+
+        # Set mem fraction static, which depends on the tensor parallelism size
         if self.mem_fraction_static is None:
             if self.tp_size >= 16:
                 self.mem_fraction_static = 0.79
@@ -164,18 +165,21 @@ class ServerArgs:
         else:
             self.mem_fraction_static = 0.88

-        # Adjust for GPUs with small memory capacities
-        if is_hip():
-            gpu_mem = get_amdgpu_memory_capacity()
-        else:
-            gpu_mem = get_nvgpu_memory_capacity()
-        if gpu_mem < 25000:
-            logger.warning(
-                "Your GPU has less than 25GB memory. You may want to set a smaller --chunked-prefill-size (e.g., 512) to improve performance."
-            )
+        # Set chunked prefill size, which depends on the gpu memory capacity
+        if self.chunked_prefill_size is None:
+            if gpu_mem < 25_000:
+                self.chunked_prefill_size = 2048
+            else:
+                self.chunked_prefill_size = 8192
+
+        # Set cuda graph max batch size
+        if self.cuda_graph_max_bs is None:
+            if gpu_mem < 25_000:
+                self.cuda_graph_max_bs = 8
+            else:
+                self.cuda_graph_max_bs = 160

-        # Choose kernel backends
+        # Set kernel backends
         if not is_flashinfer_available():
             self.attention_backend = "triton"
             self.sampling_backend = "pytorch"
```
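Both `chunked_prefill_size` and `cuda_graph_max_bs` now default to `None` and are filled in by `__post_init__` from the detected GPU memory: 2048 and 8 on GPUs reporting under 25,000 MB, 8192 and 160 otherwise. Below is a self-contained sketch of that resolution; `detect_gpu_mem_mb` is a fake probe standing in for sglang's `get_amdgpu_memory_capacity` / `get_nvgpu_memory_capacity` helpers, and `ArgsSketch` is not the real `ServerArgs`.

```python
# Self-contained sketch of the new default resolution in __post_init__.
# detect_gpu_mem_mb is a fake probe, not part of sglang.
from dataclasses import dataclass
from typing import Optional


def detect_gpu_mem_mb() -> float:
    return 24_000.0  # pretend we are on a 24 GB card


@dataclass
class ArgsSketch:
    chunked_prefill_size: Optional[int] = None
    cuda_graph_max_bs: Optional[int] = None

    def __post_init__(self):
        # Normalize the explicit-disable sentinel, as in the diff above.
        if self.chunked_prefill_size is not None and self.chunked_prefill_size <= 0:
            self.chunked_prefill_size = None

        gpu_mem = detect_gpu_mem_mb()
        if self.chunked_prefill_size is None:
            self.chunked_prefill_size = 2048 if gpu_mem < 25_000 else 8192
        if self.cuda_graph_max_bs is None:
            self.cuda_graph_max_bs = 8 if gpu_mem < 25_000 else 160


print(ArgsSketch())                           # memory-based defaults: 2048 / 8
print(ArgsSketch(chunked_prefill_size=4096))  # explicit user value is kept
```

A positive value supplied on the command line (for example `--chunked-prefill-size 4096`) bypasses the memory-based default, since the guards only fire while the field is still `None`.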