Unverified commit f9afa7dc, authored Aug 11, 2025 by Liangsheng Yin, committed by GitHub on Aug 11, 2025.

Fix docs for clip max new tokens (#9082)

Parent: 0d9e89ec
Showing 4 changed files with 10 additions and 13 deletions:
docs/advanced_features/pd_disaggregation.md     +0, -1
docs/references/environment_variables.md        +1, -1
python/sglang/srt/disaggregation/decode.py      +3, -5
python/sglang/srt/managers/schedule_policy.py   +6, -6
docs/advanced_features/pd_disaggregation.md

@@ -67,7 +67,6 @@ Please be aware that this setting will cause prefill instances to take a longer
 | **`SGLANG_DISAGGREGATION_HEARTBEAT_INTERVAL`** | Interval (seconds) between health checks to prefill bootstrap servers | `5.0` |
 | **`SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE`** | Consecutive heartbeat failures before marking prefill server offline | `2` |
 | **`SGLANG_DISAGGREGATION_WAITING_TIMEOUT`** | Timeout (seconds) for receiving KV Cache after request initialization | `300` |
-| **`SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION`** | Clip request param "max_tokens" to pre_allocate | `4096` |
 
 If a greater mean TTFT is acceptable, you can `export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=600` (10 minutes) to relax the timeout condition.
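For illustration, the waiting timeout described above amounts to comparing the time since a request was initialized against the configured limit. Below is a minimal sketch of that check; the helper `kv_wait_deadline_exceeded` is hypothetical and not part of sglang's API, only the environment variable name and its default come from the docs above.

```python
import os
import time

# Timeout (seconds) for receiving the KV cache after request initialization.
# 300 is the documented default; `export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=600`
# relaxes it, as suggested above.
WAITING_TIMEOUT = float(os.environ.get("SGLANG_DISAGGREGATION_WAITING_TIMEOUT", "300"))


def kv_wait_deadline_exceeded(request_start_time: float) -> bool:
    """Return True once a request has waited longer than the configured timeout."""
    return time.monotonic() - request_start_time > WAITING_TIMEOUT


if __name__ == "__main__":
    start = time.monotonic()
    print(kv_wait_deadline_exceeded(start))  # False right after initialization
```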
docs/references/environment_variables.md

@@ -45,7 +45,7 @@ SGLang supports various environment variables that can be used to configure its
 | Environment Variable | Description | Default Value |
 | --- | --- | --- |
 | `SGLANG_DEBUG_MEMORY_POOL` | Enable memory pool debugging | `false` |
-| `SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION` | Clip max new tokens estimation for memory planning | Not set |
+| `SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION` | Clip max new tokens estimation for memory planning | `4096` |
 | `SGLANG_DETOKENIZER_MAX_STATES` | Maximum states for detokenizer | Default value based on system |
 | `SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK` | Disable checks for memory imbalance across Tensor Parallel ranks | Not set (defaults to enabled check) |
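The corrected default (`4096`) matches the value hard-coded in the scheduler code below. As the code comments in this commit note, the variable only clips the scheduler's memory-planning estimate; a request can still generate up to its own unclipped max_new_tokens. A minimal sketch of that semantics, where `estimate_remaining_tokens` is an illustrative helper rather than an sglang function:

```python
import os

# Mirrors SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION with its documented default of 4096.
CLIP = int(os.environ.get("SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION", "4096"))


def estimate_remaining_tokens(max_new_tokens: int, generated: int) -> int:
    """Clipped estimate of how many more tokens a request may produce.

    Only the estimate is clipped; the stop condition still uses the request's
    own max_new_tokens, so generation is not cut short at 4096.
    """
    return min(max_new_tokens - generated, CLIP)


print(estimate_remaining_tokens(max_new_tokens=128_000, generated=0))  # 4096
print(estimate_remaining_tokens(max_new_tokens=256, generated=100))    # 156
```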
python/sglang/srt/disaggregation/decode.py

@@ -59,9 +59,7 @@ if TYPE_CHECKING:
     from sglang.srt.managers.schedule_batch import Req
     from sglang.srt.managers.scheduler import Scheduler
 
-DECODE_CLIP_MAX_NEW_TOKEN = get_int_env_var(
-    "SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION", 4096
-)
+CLIP_MAX_NEW_TOKEN = get_int_env_var("SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION", 4096)
 
 
 class DecodeReqToTokenPool:

@@ -390,7 +388,7 @@ class DecodePreallocQueue:
                 origin_input_len
                 + min(
                     decode_req.req.sampling_params.max_new_tokens,
-                    DECODE_CLIP_MAX_NEW_TOKEN,
+                    CLIP_MAX_NEW_TOKEN,
                 )
                 - retractable_tokens,
             )

@@ -440,7 +438,7 @@ class DecodePreallocQueue:
         need_space_for_single_req = (
             max(
                 [
-                    min(x.sampling_params.max_new_tokens, DECODE_CLIP_MAX_NEW_TOKEN)
+                    min(x.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKEN)
                     + len(x.origin_input_ids)
                     - retractable_tokens
                     for x in self.scheduler.running_batch.reqs
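The decode-side change is a rename of the module-level constant; the pre-allocation arithmetic it feeds (prompt length plus the clipped generation estimate, minus retractable tokens) is unchanged. A self-contained sketch of that estimate, using stand-in dataclasses in place of sglang's `Req` and sampling params:

```python
from dataclasses import dataclass


@dataclass
class SamplingParams:
    max_new_tokens: int


@dataclass
class Req:
    origin_input_ids: list[int]
    sampling_params: SamplingParams


# Stand-in for the constant read from SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION (default 4096).
CLIP_MAX_NEW_TOKEN = 4096


def required_tokens(req: Req, retractable_tokens: int) -> int:
    """Tokens to pre-allocate for one decode request: its prompt length plus the
    clipped generation estimate, minus tokens that retraction could reclaim."""
    return (
        len(req.origin_input_ids)
        + min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKEN)
        - retractable_tokens
    )


req = Req(
    origin_input_ids=list(range(1024)),
    sampling_params=SamplingParams(max_new_tokens=32768),
)
print(required_tokens(req, retractable_tokens=0))  # 1024 + 4096 = 5120
```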
python/sglang/srt/managers/schedule_policy.py

@@ -36,7 +36,7 @@ if TYPE_CHECKING:
 
 # This can prevent the server from being too conservative.
 # Note that this only clips the estimation in the scheduler but does not change the stop
 # condition. The request can still generate tokens until it hits the unclipped max_new_tokens.
-CLIP_MAX_NEW_TOKENS_ESTIMATION = int(
+CLIP_MAX_NEW_TOKENS = int(
     os.environ.get("SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION", "4096")
 )

@@ -305,7 +305,7 @@ class PrefillAdder:
                 [
                     min(
                         (r.sampling_params.max_new_tokens - len(r.output_ids)),
-                        CLIP_MAX_NEW_TOKENS_ESTIMATION,
+                        CLIP_MAX_NEW_TOKENS,
                     )
                     * self.new_token_ratio
                     for r in running_batch.reqs

@@ -388,7 +388,7 @@ class PrefillAdder:
             0,
             req.extend_input_len,
             (
-                min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS_ESTIMATION)
+                min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS)
                 if not truncated
                 else 0
             ),

@@ -477,7 +477,7 @@ class PrefillAdder:
             self._update_prefill_budget(
                 0,
                 req.extend_input_len,
-                min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS_ESTIMATION),
+                min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS),
             )
         else:
             if self.rem_chunk_tokens == 0:

@@ -499,7 +499,7 @@ class PrefillAdder:
             return self.add_one_req_ignore_eos(req, has_chunked_req)
 
         total_tokens = req.extend_input_len + min(
-            req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS_ESTIMATION
+            req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS
         )
 
         # adjusting the input_tokens based on host_hit_length and page_size

@@ -544,7 +544,7 @@ class PrefillAdder:
                 input_tokens,
                 min(
                     req.sampling_params.max_new_tokens,
-                    CLIP_MAX_NEW_TOKENS_ESTIMATION,
+                    CLIP_MAX_NEW_TOKENS,
                 ),
             )
         else:
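For context, the renamed constant caps the per-request generation estimate that the prefill scheduler sums (weighted by `new_token_ratio`) when budgeting how many new requests to admit. A simplified, stand-alone sketch of that calculation following the expressions visible in the diff above; it is not sglang's actual `PrefillAdder` class:

```python
import os

# Same environment variable and default as in the diff above.
CLIP_MAX_NEW_TOKENS = int(os.environ.get("SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION", "4096"))


def estimated_decode_budget(
    max_new_tokens_per_req: list[int],
    output_lens: list[int],
    new_token_ratio: float,
) -> float:
    """Sum over running requests of min(remaining max_new_tokens, clip) * ratio.

    Clipping keeps a request with a huge max_new_tokens from reserving an
    unrealistically large share of the token budget.
    """
    return sum(
        min(limit - produced, CLIP_MAX_NEW_TOKENS) * new_token_ratio
        for limit, produced in zip(max_new_tokens_per_req, output_lens)
    )


# Two running requests: one with a huge limit (clipped to 4096), one small.
print(estimated_decode_budget([100_000, 512], [10, 100], new_token_ratio=0.5))
# -> 4096 * 0.5 + 412 * 0.5 = 2254.0
```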