Commit 86e0dde5 (unverified)
Authored Oct 26, 2024 by Lianmin Zheng; committed by GitHub on Oct 26, 2024
Parent: 2b809788

Improve the user control of new_token_ratio (#1811)
Showing 4 changed files with 32 additions and 17 deletions:

- python/sglang/global_config.py (+9, -3)
- python/sglang/srt/managers/scheduler.py (+15, -6)
- python/sglang/srt/model_executor/cuda_graph_runner.py (+2, -2)
- python/sglang/srt/server_args.py (+6, -6)
python/sglang/global_config.py

```diff
@@ -14,9 +14,15 @@ class GlobalConfig:
         self.default_backend = None
 
         # Runtime constants: New generation token ratio estimation
-        self.init_new_token_ratio = 0.7
-        self.base_min_new_token_ratio = 0.1
-        self.new_token_ratio_decay = 0.001
+        self.default_init_new_token_ratio = float(
+            os.environ.get("SGLANG_INIT_NEW_TOKEN_RATIO", 0.7)
+        )
+        self.default_min_new_token_ratio_factor = float(
+            os.environ.get("SGLANG_MIN_NEW_TOKEN_RATIO_FACTOR", 0.14)
+        )
+        self.default_new_token_ratio_decay_steps = float(
+            os.environ.get("SGLANG_NEW_TOKEN_RATIO_DECAY_STEPS", 600)
+        )
 
         # Runtime constants: others
         self.retract_decode_steps = 20
```
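The previously hard-coded estimation constants are now defaults that can be overridden through environment variables. A minimal sketch of an override, assuming `global_config` is the module-level `GlobalConfig` instance created at import time (the override values below are illustrative, not tuning advice):

```python
import os

# Set the overrides before sglang reads them at import time
# (illustrative values, not recommendations).
os.environ["SGLANG_INIT_NEW_TOKEN_RATIO"] = "0.8"         # start higher
os.environ["SGLANG_MIN_NEW_TOKEN_RATIO_FACTOR"] = "0.2"   # floor = 0.2 * init
os.environ["SGLANG_NEW_TOKEN_RATIO_DECAY_STEPS"] = "300"  # reach the floor faster

from sglang.global_config import global_config

print(global_config.default_init_new_token_ratio)         # 0.8
print(global_config.default_min_new_token_ratio_factor)   # 0.2
print(global_config.default_new_token_ratio_decay_steps)  # 300.0
```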
python/sglang/srt/managers/scheduler.py

```diff
@@ -254,13 +254,22 @@ class Scheduler:
         assert (
             server_args.schedule_conservativeness >= 0
         ), "Invalid schedule_conservativeness"
-        self.min_new_token_ratio = min(
-            global_config.base_min_new_token_ratio
+
+        self.init_new_token_ratio = min(
+            global_config.default_init_new_token_ratio
             * server_args.schedule_conservativeness,
             1.0,
         )
-        self.new_token_ratio = self.min_new_token_ratio
-        self.new_token_ratio_decay = global_config.new_token_ratio_decay
+        self.min_new_token_ratio = min(
+            self.init_new_token_ratio
+            * global_config.default_min_new_token_ratio_factor,
+            1.0,
+        )
+        self.new_token_ratio_decay = (
+            self.init_new_token_ratio - self.min_new_token_ratio
+        ) / global_config.default_new_token_ratio_decay_steps
+        self.new_token_ratio = self.init_new_token_ratio
+
         self.batch_is_full = False
 
         # Init profiler
@@ -307,7 +316,7 @@
                 self.process_batch_result(batch, result)
             else:
                 self.check_memory()
-                self.new_token_ratio = global_config.init_new_token_ratio
+                self.new_token_ratio = self.init_new_token_ratio
 
             self.last_batch = batch
@@ -334,7 +343,7 @@
                 self.process_batch_result(tmp_batch, tmp_result)
             elif batch is None:
                 self.check_memory()
-                self.new_token_ratio = global_config.init_new_token_ratio
+                self.new_token_ratio = self.init_new_token_ratio
 
             self.last_batch = batch
```
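With the default settings (`schedule_conservativeness = 1.0`), the new formulas land very close to the constants they replace: the initial ratio stays 0.7, the floor becomes 0.7 × 0.14 = 0.098 (previously 0.1), and the per-step decay becomes (0.7 − 0.098) / 600 ≈ 0.001 (previously exactly 0.001). A quick sanity check of that arithmetic, in plain Python rather than sglang code:

```python
# Recompute the scheduler's decay schedule from the new defaults.
init_default = 0.7        # SGLANG_INIT_NEW_TOKEN_RATIO default
min_factor = 0.14         # SGLANG_MIN_NEW_TOKEN_RATIO_FACTOR default
decay_steps = 600         # SGLANG_NEW_TOKEN_RATIO_DECAY_STEPS default
conservativeness = 1.0    # server_args.schedule_conservativeness default

init_new_token_ratio = min(init_default * conservativeness, 1.0)
min_new_token_ratio = min(init_new_token_ratio * min_factor, 1.0)
new_token_ratio_decay = (init_new_token_ratio - min_new_token_ratio) / decay_steps

print(init_new_token_ratio)   # 0.7
print(min_new_token_ratio)    # ~0.098
print(new_token_ratio_decay)  # ~0.001003
```

So default behavior is essentially unchanged; the point of the patch is that all three knobs are now user-controllable.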
python/sglang/srt/model_executor/cuda_graph_runner.py

```diff
@@ -121,13 +121,13 @@ class CudaGraphRunner:
             bs
             for bs in self.capture_bs
             if bs <= model_runner.req_to_token_pool.size
-            and bs <= model_runner.server_args.max_cuda_graph_bs
+            and bs <= model_runner.server_args.cuda_graph_max_bs
         ]
         self.compile_bs = (
             [
                 bs
                 for bs in self.capture_bs
-                if bs <= self.model_runner.server_args.max_torch_compile_bs
+                if bs <= self.model_runner.server_args.torch_compile_max_bs
             ]
             if self.use_torch_compile
             else []
```
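The `cuda_graph_runner.py` change is purely mechanical: `max_cuda_graph_bs` becomes `cuda_graph_max_bs` and `max_torch_compile_bs` becomes `torch_compile_max_bs`; the filtering logic is untouched. For readers skimming the diff, a standalone toy illustration of what that filter does (all numbers invented):

```python
# Keep only batch sizes that fit both the request pool and the CUDA-graph cap.
capture_bs = [1, 2, 4, 8, 16, 32, 64, 128, 160, 256]
req_to_token_pool_size = 512   # invented
cuda_graph_max_bs = 160        # the renamed server arg (default 160)

capture_bs = [
    bs
    for bs in capture_bs
    if bs <= req_to_token_pool_size and bs <= cuda_graph_max_bs
]
print(capture_bs)  # [1, 2, 4, 8, 16, 32, 64, 128, 160] -- 256 is dropped
```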
python/sglang/srt/server_args.py

```diff
@@ -119,8 +119,8 @@ class ServerArgs:
     enable_overlap_schedule: bool = False
     enable_mixed_chunk: bool = False
     enable_torch_compile: bool = False
-    max_torch_compile_bs: int = 32
-    max_cuda_graph_bs: int = 160
+    torch_compile_max_bs: int = 32
+    cuda_graph_max_bs: int = 160
     torchao_config: str = ""
     enable_p2p_check: bool = False
     triton_attention_reduce_in_fp32: bool = False
@@ -620,15 +620,15 @@
             help="Optimize the model with torch.compile. Experimental feature.",
         )
         parser.add_argument(
-            "--max-torch-compile-bs",
+            "--torch-compile-max-bs",
             type=int,
-            default=ServerArgs.max_torch_compile_bs,
+            default=ServerArgs.torch_compile_max_bs,
             help="Set the maximum batch size when using torch compile.",
         )
         parser.add_argument(
-            "--max-cuda-graph-bs",
+            "--cuda-graph-max-bs",
             type=int,
-            default=ServerArgs.max_cuda_graph_bs,
+            default=ServerArgs.cuda_graph_max_bs,
             help="Set the maximum batch size for cuda graph.",
         )
         parser.add_argument(
```
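After the rename, the flags are `--torch-compile-max-bs` and `--cuda-graph-max-bs`, matching the `torch_compile_max_bs` / `cuda_graph_max_bs` dataclass fields. A minimal sketch of programmatic use, assuming `ServerArgs` can be constructed directly as a plain dataclass (the model path is illustrative):

```python
from sglang.srt.server_args import ServerArgs

# Equivalent to launching with:
#   python -m sglang.launch_server --model-path <model> \
#       --enable-torch-compile --torch-compile-max-bs 16 --cuda-graph-max-bs 64
args = ServerArgs(
    model_path="meta-llama/Llama-3.1-8B-Instruct",  # illustrative
    enable_torch_compile=True,
    torch_compile_max_bs=16,
    cuda_graph_max_bs=64,
)
print(args.torch_compile_max_bs, args.cuda_graph_max_bs)  # 16 64
```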