Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
119149f2
"vscode:/vscode.git/clone" did not exist on "2a61e29e27178f87e5e7ff42fa9f07ef1e5b00e1"
Unverified
Commit
119149f2
authored
Apr 15, 2026
by
ishandhanani
Committed by
GitHub
Apr 15, 2026
Browse files
fix(sglang): always populate max_num_batched_tokens in MDC (#8220)
parent
134d484d
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
28 additions
and
22 deletions
+28
-22
components/src/dynamo/sglang/llm_engine.py
components/src/dynamo/sglang/llm_engine.py
+7
-3
components/src/dynamo/sglang/register.py
components/src/dynamo/sglang/register.py
+21
-19
No files found.
components/src/dynamo/sglang/llm_engine.py
View file @
119149f2
...
@@ -79,6 +79,12 @@ class SglangLLMEngine(LLMEngine):
...
@@ -79,6 +79,12 @@ class SglangLLMEngine(LLMEngine):
if
max_total_tokens
and
page_size
:
if
max_total_tokens
and
page_size
:
total_kv_blocks
=
(
max_total_tokens
+
page_size
-
1
)
//
page_size
total_kv_blocks
=
(
max_total_tokens
+
page_size
-
1
)
//
page_size
# Prefer explicit max_prefill_tokens; fall back to max_total_num_tokens
# from the scheduler so the planner always has a prefill load signal.
max_num_batched_tokens
=
(
getattr
(
self
.
server_args
,
"max_prefill_tokens"
,
None
)
or
max_total_tokens
)
return
EngineConfig
(
return
EngineConfig
(
model
=
self
.
server_args
.
model_path
,
model
=
self
.
server_args
.
model_path
,
served_model_name
=
self
.
server_args
.
served_model_name
,
served_model_name
=
self
.
server_args
.
served_model_name
,
...
@@ -86,9 +92,7 @@ class SglangLLMEngine(LLMEngine):
...
@@ -86,9 +92,7 @@ class SglangLLMEngine(LLMEngine):
kv_cache_block_size
=
page_size
,
kv_cache_block_size
=
page_size
,
total_kv_blocks
=
total_kv_blocks
,
total_kv_blocks
=
total_kv_blocks
,
max_num_seqs
=
getattr
(
self
.
server_args
,
"max_running_requests"
,
None
),
max_num_seqs
=
getattr
(
self
.
server_args
,
"max_running_requests"
,
None
),
max_num_batched_tokens
=
getattr
(
max_num_batched_tokens
=
max_num_batched_tokens
,
self
.
server_args
,
"max_prefill_tokens"
,
None
),
)
)
async
def
generate
(
async
def
generate
(
...
...
components/src/dynamo/sglang/register.py
View file @
119149f2
...
@@ -173,11 +173,8 @@ async def _get_runtime_config(
...
@@ -173,11 +173,8 @@ async def _get_runtime_config(
# Try to check if the engine has a scheduler attribute with the computed values
# Try to check if the engine has a scheduler attribute with the computed values
if
hasattr
(
engine
,
"scheduler_info"
)
and
engine
.
scheduler_info
is
not
None
:
if
hasattr
(
engine
,
"scheduler_info"
)
and
engine
.
scheduler_info
is
not
None
:
# Get max_total_num_tokens from scheduler_info
# Get max_total_num_tokens from scheduler_info
if
"max_total_num_tokens"
in
engine
.
scheduler_info
:
max_total_tokens
=
engine
.
scheduler_info
.
get
(
"max_total_num_tokens"
)
max_total_tokens
=
engine
.
scheduler_info
[
"max_total_num_tokens"
]
if
max_total_tokens
and
hasattr
(
engine
.
tokenizer_manager
,
"server_args"
):
if
max_total_tokens
and
hasattr
(
engine
.
tokenizer_manager
,
"server_args"
):
page_size
=
engine
.
tokenizer_manager
.
server_args
.
page_size
page_size
=
engine
.
tokenizer_manager
.
server_args
.
page_size
if
page_size
:
if
page_size
:
runtime_config
.
total_kv_blocks
=
(
runtime_config
.
total_kv_blocks
=
(
...
@@ -188,10 +185,15 @@ async def _get_runtime_config(
...
@@ -188,10 +185,15 @@ async def _get_runtime_config(
f
"(max_total_tokens=
{
max_total_tokens
}
, page_size=
{
page_size
}
)"
f
"(max_total_tokens=
{
max_total_tokens
}
, page_size=
{
page_size
}
)"
)
)
# Note: max_running_requests and max_prefill_tokens are NOT available in scheduler_info.
# SGLang separates configuration (server_args) from runtime stats (scheduler_info).
# In contrast, vLLM exposes both config and runtime values through engine config.
# These are config parameters, so they must be retrieved from server_args only.
# When max_prefill_tokens is not explicitly set by the user, fall back
# to max_total_num_tokens from the scheduler. This ensures the planner
# always has a prefill load signal for aggregated scaling decisions.
if
not
max_prefill_tokens
and
max_total_tokens
:
runtime_config
.
max_num_batched_tokens
=
max_total_tokens
logging
.
info
(
f
"max_prefill_tokens not set, using max_total_num_tokens "
f
"from scheduler as max_num_batched_tokens:
{
max_total_tokens
}
"
)
return
runtime_config
return
runtime_config
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment