Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
f9afa7dc
Unverified
Commit
f9afa7dc
authored
Aug 11, 2025
by
Liangsheng Yin
Committed by
GitHub
Aug 11, 2025
Browse files
Fix docs for clip max new tokens (#9082)
parent
0d9e89ec
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
10 additions
and
13 deletions
+10
-13
docs/advanced_features/pd_disaggregation.md
docs/advanced_features/pd_disaggregation.md
+0
-1
docs/references/environment_variables.md
docs/references/environment_variables.md
+1
-1
python/sglang/srt/disaggregation/decode.py
python/sglang/srt/disaggregation/decode.py
+3
-5
python/sglang/srt/managers/schedule_policy.py
python/sglang/srt/managers/schedule_policy.py
+6
-6
No files found.
docs/advanced_features/pd_disaggregation.md
View file @
f9afa7dc
...
@@ -67,7 +67,6 @@ Please be aware that this setting will cause prefill instances to take a longer
...
@@ -67,7 +67,6 @@ Please be aware that this setting will cause prefill instances to take a longer
|
**`SGLANG_DISAGGREGATION_HEARTBEAT_INTERVAL`**
| Interval (seconds) between health checks to prefill bootstrap servers |
`5.0`
|
|
**`SGLANG_DISAGGREGATION_HEARTBEAT_INTERVAL`**
| Interval (seconds) between health checks to prefill bootstrap servers |
`5.0`
|
|
**`SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE`**
| Consecutive heartbeat failures before marking prefill server offline |
`2`
|
|
**`SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE`**
| Consecutive heartbeat failures before marking prefill server offline |
`2`
|
|
**`SGLANG_DISAGGREGATION_WAITING_TIMEOUT`**
| Timeout (seconds) for receiving KV Cache after request initialization |
`300`
|
|
**`SGLANG_DISAGGREGATION_WAITING_TIMEOUT`**
| Timeout (seconds) for receiving KV Cache after request initialization |
`300`
|
|
**`SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION`**
| Clip request param "max_tokens" to pre_allocate |
`4096`
|
If a greater mean TTFT is acceptable, you can
`export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=600`
(10 minutes) to relax the timeout condition.
If a greater mean TTFT is acceptable, you can
`export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=600`
(10 minutes) to relax the timeout condition.
...
...
docs/references/environment_variables.md
View file @
f9afa7dc
...
@@ -45,7 +45,7 @@ SGLang supports various environment variables that can be used to configure its
...
@@ -45,7 +45,7 @@ SGLang supports various environment variables that can be used to configure its
| Environment Variable | Description | Default Value |
| Environment Variable | Description | Default Value |
| --- | --- | --- |
| --- | --- | --- |
|
`SGLANG_DEBUG_MEMORY_POOL`
| Enable memory pool debugging |
`false`
|
|
`SGLANG_DEBUG_MEMORY_POOL`
| Enable memory pool debugging |
`false`
|
|
`SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION`
| Clip max new tokens estimation for memory planning |
Not set
|
|
`SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION`
| Clip max new tokens estimation for memory planning |
`4096`
|
|
`SGLANG_DETOKENIZER_MAX_STATES`
| Maximum states for detokenizer | Default value based on system |
|
`SGLANG_DETOKENIZER_MAX_STATES`
| Maximum states for detokenizer | Default value based on system |
|
`SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK`
| Disable checks for memory imbalance across Tensor Parallel ranks | Not set (defaults to enabled check) |
|
`SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK`
| Disable checks for memory imbalance across Tensor Parallel ranks | Not set (defaults to enabled check) |
...
...
python/sglang/srt/disaggregation/decode.py
View file @
f9afa7dc
...
@@ -59,9 +59,7 @@ if TYPE_CHECKING:
...
@@ -59,9 +59,7 @@ if TYPE_CHECKING:
from
sglang.srt.managers.schedule_batch
import
Req
from
sglang.srt.managers.schedule_batch
import
Req
from
sglang.srt.managers.scheduler
import
Scheduler
from
sglang.srt.managers.scheduler
import
Scheduler
DECODE_CLIP_MAX_NEW_TOKEN
=
get_int_env_var
(
CLIP_MAX_NEW_TOKEN
=
get_int_env_var
(
"SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION"
,
4096
)
"SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION"
,
4096
)
class
DecodeReqToTokenPool
:
class
DecodeReqToTokenPool
:
...
@@ -390,7 +388,7 @@ class DecodePreallocQueue:
...
@@ -390,7 +388,7 @@ class DecodePreallocQueue:
origin_input_len
origin_input_len
+
min
(
+
min
(
decode_req
.
req
.
sampling_params
.
max_new_tokens
,
decode_req
.
req
.
sampling_params
.
max_new_tokens
,
DECODE_
CLIP_MAX_NEW_TOKEN
,
CLIP_MAX_NEW_TOKEN
,
)
)
-
retractable_tokens
,
-
retractable_tokens
,
)
)
...
@@ -440,7 +438,7 @@ class DecodePreallocQueue:
...
@@ -440,7 +438,7 @@ class DecodePreallocQueue:
need_space_for_single_req
=
(
need_space_for_single_req
=
(
max
(
max
(
[
[
min
(
x
.
sampling_params
.
max_new_tokens
,
DECODE_
CLIP_MAX_NEW_TOKEN
)
min
(
x
.
sampling_params
.
max_new_tokens
,
CLIP_MAX_NEW_TOKEN
)
+
len
(
x
.
origin_input_ids
)
+
len
(
x
.
origin_input_ids
)
-
retractable_tokens
-
retractable_tokens
for
x
in
self
.
scheduler
.
running_batch
.
reqs
for
x
in
self
.
scheduler
.
running_batch
.
reqs
...
...
python/sglang/srt/managers/schedule_policy.py
View file @
f9afa7dc
...
@@ -36,7 +36,7 @@ if TYPE_CHECKING:
...
@@ -36,7 +36,7 @@ if TYPE_CHECKING:
# This can prevent the server from being too conservative.
# This can prevent the server from being too conservative.
# Note that this only clips the estimation in the scheduler but does not change the stop
# Note that this only clips the estimation in the scheduler but does not change the stop
# condition. The request can still generate tokens until it hits the unclipped max_new_tokens.
# condition. The request can still generate tokens until it hits the unclipped max_new_tokens.
CLIP_MAX_NEW_TOKENS
_ESTIMATION
=
int
(
CLIP_MAX_NEW_TOKENS
=
int
(
os
.
environ
.
get
(
"SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION"
,
"4096"
)
os
.
environ
.
get
(
"SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION"
,
"4096"
)
)
)
...
@@ -305,7 +305,7 @@ class PrefillAdder:
...
@@ -305,7 +305,7 @@ class PrefillAdder:
[
[
min
(
min
(
(
r
.
sampling_params
.
max_new_tokens
-
len
(
r
.
output_ids
)),
(
r
.
sampling_params
.
max_new_tokens
-
len
(
r
.
output_ids
)),
CLIP_MAX_NEW_TOKENS
_ESTIMATION
,
CLIP_MAX_NEW_TOKENS
,
)
)
*
self
.
new_token_ratio
*
self
.
new_token_ratio
for
r
in
running_batch
.
reqs
for
r
in
running_batch
.
reqs
...
@@ -388,7 +388,7 @@ class PrefillAdder:
...
@@ -388,7 +388,7 @@ class PrefillAdder:
0
,
0
,
req
.
extend_input_len
,
req
.
extend_input_len
,
(
(
min
(
req
.
sampling_params
.
max_new_tokens
,
CLIP_MAX_NEW_TOKENS
_ESTIMATION
)
min
(
req
.
sampling_params
.
max_new_tokens
,
CLIP_MAX_NEW_TOKENS
)
if
not
truncated
if
not
truncated
else
0
else
0
),
),
...
@@ -477,7 +477,7 @@ class PrefillAdder:
...
@@ -477,7 +477,7 @@ class PrefillAdder:
self
.
_update_prefill_budget
(
self
.
_update_prefill_budget
(
0
,
0
,
req
.
extend_input_len
,
req
.
extend_input_len
,
min
(
req
.
sampling_params
.
max_new_tokens
,
CLIP_MAX_NEW_TOKENS
_ESTIMATION
),
min
(
req
.
sampling_params
.
max_new_tokens
,
CLIP_MAX_NEW_TOKENS
),
)
)
else
:
else
:
if
self
.
rem_chunk_tokens
==
0
:
if
self
.
rem_chunk_tokens
==
0
:
...
@@ -499,7 +499,7 @@ class PrefillAdder:
...
@@ -499,7 +499,7 @@ class PrefillAdder:
return
self
.
add_one_req_ignore_eos
(
req
,
has_chunked_req
)
return
self
.
add_one_req_ignore_eos
(
req
,
has_chunked_req
)
total_tokens
=
req
.
extend_input_len
+
min
(
total_tokens
=
req
.
extend_input_len
+
min
(
req
.
sampling_params
.
max_new_tokens
,
CLIP_MAX_NEW_TOKENS
_ESTIMATION
req
.
sampling_params
.
max_new_tokens
,
CLIP_MAX_NEW_TOKENS
)
)
# adjusting the input_tokens based on host_hit_length and page_size
# adjusting the input_tokens based on host_hit_length and page_size
...
@@ -544,7 +544,7 @@ class PrefillAdder:
...
@@ -544,7 +544,7 @@ class PrefillAdder:
input_tokens
,
input_tokens
,
min
(
min
(
req
.
sampling_params
.
max_new_tokens
,
req
.
sampling_params
.
max_new_tokens
,
CLIP_MAX_NEW_TOKENS
_ESTIMATION
,
CLIP_MAX_NEW_TOKENS
,
),
),
)
)
else
:
else
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment