sglang / Commits / 94e167ea
Commit 94e167ea (unverified), authored Nov 29, 2024 by Lianmin Zheng, committed by GitHub on Nov 29, 2024
parent 262e370f

Fix the default chunked prefill size (#2268)
Showing 3 changed files with 29 additions and 20 deletions (+29 -20):
python/sglang/srt/managers/scheduler.py  +2 -0
python/sglang/srt/model_executor/model_runner.py  +6 -3
python/sglang/srt/server_args.py  +21 -17
python/sglang/srt/managers/scheduler.py
@@ -253,6 +253,8 @@ class Scheduler:
         # Init chunked prefill
         self.chunked_prefill_size = server_args.chunked_prefill_size
+        if self.chunked_prefill_size <= 0:  # -1 means disable
+            self.chunked_prefill_size = None
         self.being_chunked_req = None
         self.is_mixed_chunk = (
             self.chunked_prefill_size is not None and server_args.enable_mixed_chunk
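Read together with the server_args changes below, this makes the scheduler the place where a non-positive configured value is normalized to None, meaning "disabled". A minimal sketch of that normalization, using a hypothetical resolve_chunked_prefill_size helper that is not part of the commit:

    from typing import Optional

    def resolve_chunked_prefill_size(configured: int) -> Optional[int]:
        # Mirrors the scheduler's rule: any value <= 0 (e.g., -1) means
        # "chunked prefill disabled", represented downstream as None.
        if configured <= 0:
            return None
        return configured

    assert resolve_chunked_prefill_size(-1) is None      # disabled
    assert resolve_chunked_prefill_size(8192) == 8192    # 8K-token chunks kept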
python/sglang/srt/model_executor/model_runner.py
@@ -118,7 +118,7 @@ class ModelRunner:
             logger.info(
                 "Automatically turn off --chunked-prefill-size and adjust --mem-fraction-static for multimodal models."
             )
-            server_args.chunked_prefill_size = None
+            server_args.chunked_prefill_size = -1
             self.mem_fraction_static *= 0.95
             # TODO: qwen2-vl does not support radix cache now, set disable_radix_cache=True automatically
             if self.model_config.hf_config.architectures == [
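A plausible reason the multimodal override now assigns -1 rather than None: the scheduler's new check self.chunked_prefill_size <= 0 runs unconditionally, and in Python 3 ordering None against an int raises TypeError, so -1 is the sentinel that survives that comparison. A tiny illustration of the pitfall:

    # None cannot be ordered against an int in Python 3.
    try:
        None <= 0
    except TypeError as exc:
        print(f"TypeError: {exc}")

    # -1 works as an explicit "disabled" sentinel for the <= 0 check;
    # the scheduler then converts it to None itself.
    print(-1 <= 0)  # True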
@@ -148,12 +148,14 @@ class ModelRunner:
         set_cpu_offload_max_bytes(int(server_args.cpu_offload_gb * 1024**3))

-        # Init components
+        # Get memory before model loading
         min_per_gpu_memory = self.init_torch_distributed()
+
+        # Load the model
         self.sampler = Sampler()
         self.load_model()

-        # Apply torch TP if model supports it
+        # Apply torch TP if the model supports it
         supports_torch_tp = getattr(self.model, "supports_torch_tp", False)
         if self.tp_size > 1 and supports_torch_tp:
             self.apply_torch_tp()
@@ -161,6 +163,7 @@ class ModelRunner:
         else:
             self.torch_tp_applied = False

+        # Init memory pool and attention backends
         if server_args.lora_paths is not None:
             self.init_lora_manager()
         self.init_memory_pool(
python/sglang/srt/server_args.py
@@ -58,7 +58,7 @@ class ServerArgs:
     mem_fraction_static: Optional[float] = None
     max_running_requests: Optional[int] = None
     max_total_tokens: Optional[int] = None
-    chunked_prefill_size: int = 8192
+    chunked_prefill_size: Optional[int] = None
     max_prefill_tokens: int = 16384
     schedule_policy: str = "lpm"
     schedule_conservativeness: float = 1.0
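The switch from a hard-coded 8192 to Optional[int] = None reuses the None-as-unset sentinel pattern the dataclass already applies to mem_fraction_static: leave the field unset at declaration, then resolve it in __post_init__ once the hardware is known. A self-contained sketch of the pattern, with illustrative names rather than the commit's code:

    import random
    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class ArgsSketch:
        # None means "not set by the user"; an explicit value always wins.
        chunked_prefill_size: Optional[int] = None
        random_seed: Optional[int] = None

        def __post_init__(self):
            # Fill in only the unset fields.
            if self.chunked_prefill_size is None:
                self.chunked_prefill_size = 8192
            if self.random_seed is None:
                self.random_seed = random.randint(0, 1 << 30)

    print(ArgsSketch().chunked_prefill_size)                           # 8192, defaulted
    print(ArgsSketch(chunked_prefill_size=2048).chunked_prefill_size)  # 2048, preserved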
@@ -128,7 +128,7 @@ class ServerArgs:
     enable_dp_attention: bool = False
     enable_torch_compile: bool = False
     torch_compile_max_bs: int = 32
-    cuda_graph_max_bs: int = 160
+    cuda_graph_max_bs: Optional[int] = None
     torchao_config: str = ""
     enable_nan_detection: bool = False
     enable_p2p_check: bool = False
@@ -144,14 +144,15 @@
         if self.served_model_name is None:
             self.served_model_name = self.model_path

         if self.chunked_prefill_size is not None and self.chunked_prefill_size <= 0:
             # Disable chunked prefill
             self.chunked_prefill_size = None

         if self.random_seed is None:
             self.random_seed = random.randint(0, 1 << 30)

-        # Mem fraction depends on the tensor parallelism size
+        if is_hip():
+            gpu_mem = get_amdgpu_memory_capacity()
+        else:
+            gpu_mem = get_nvgpu_memory_capacity()
+
+        # Set mem fraction static, which depends on the tensor parallelism size
         if self.mem_fraction_static is None:
             if self.tp_size >= 16:
                 self.mem_fraction_static = 0.79
@@ -164,18 +165,21 @@
         else:
             self.mem_fraction_static = 0.88

-        # Adjust for GPUs with small memory capacities
-        if is_hip():
-            gpu_mem = get_amdgpu_memory_capacity()
-        else:
-            gpu_mem = get_nvgpu_memory_capacity()
-        if gpu_mem < 25000:
-            logger.warning(
-                "Your GPU has less than 25GB memory. You may want to set a smaller --chunked-prefill-size (e.g., 512) to improve performance."
-            )
+        # Set chunked prefill size, which depends on the gpu memory capacity
+        if self.chunked_prefill_size is None:
+            if gpu_mem < 25_000:
+                self.chunked_prefill_size = 2048
+            else:
+                self.chunked_prefill_size = 8192
+
+        # Set cuda graph max batch size
+        if self.cuda_graph_max_bs is None:
+            if gpu_mem < 25_000:
+                self.cuda_graph_max_bs = 8
+            else:
+                self.cuda_graph_max_bs = 160

-        # Choose kernel backends
+        # Set kernel backends
         if not is_flashinfer_available():
             self.attention_backend = "triton"
             self.sampling_backend = "pytorch"
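Net effect of the server_args hunks: instead of warning small-GPU users to lower --chunked-prefill-size by hand, both defaults are now derived from the detected memory capacity. A sketch of the resulting policy as a pure function, assuming (as the 25_000 threshold suggests) that capacity is reported in MB; the function name is illustrative, not the commit's code:

    from typing import Optional, Tuple

    def derive_defaults(
        gpu_mem_mb: float,
        chunked_prefill_size: Optional[int] = None,
        cuda_graph_max_bs: Optional[int] = None,
    ) -> Tuple[int, int]:
        # Fields left as None are filled from GPU memory capacity;
        # explicit values pass through untouched.
        if chunked_prefill_size is None:
            chunked_prefill_size = 2048 if gpu_mem_mb < 25_000 else 8192
        if cuda_graph_max_bs is None:
            cuda_graph_max_bs = 8 if gpu_mem_mb < 25_000 else 160
        return chunked_prefill_size, cuda_graph_max_bs

    print(derive_defaults(24_000))  # (2048, 8): e.g. a 24 GB consumer GPU
    print(derive_defaults(81_000))  # (8192, 160): e.g. an 80 GB A100/H100

An explicit -1 passes through here untouched and is only converted to None by the scheduler, matching the scheduler.py hunk above.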