Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
119149f2
Unverified
Commit
119149f2
authored
Apr 15, 2026
by
ishandhanani
Committed by
GitHub
Apr 15, 2026
Browse files
fix(sglang): always populate max_num_batched_tokens in MDC (#8220)
parent
134d484d
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
28 additions
and
22 deletions
+28
-22
components/src/dynamo/sglang/llm_engine.py
components/src/dynamo/sglang/llm_engine.py
+7
-3
components/src/dynamo/sglang/register.py
components/src/dynamo/sglang/register.py
+21
-19
No files found.
components/src/dynamo/sglang/llm_engine.py
View file @
119149f2
...
@@ -79,6 +79,12 @@ class SglangLLMEngine(LLMEngine):
...
@@ -79,6 +79,12 @@ class SglangLLMEngine(LLMEngine):
if
max_total_tokens
and
page_size
:
if
max_total_tokens
and
page_size
:
total_kv_blocks
=
(
max_total_tokens
+
page_size
-
1
)
//
page_size
total_kv_blocks
=
(
max_total_tokens
+
page_size
-
1
)
//
page_size
# Prefer explicit max_prefill_tokens; fall back to max_total_num_tokens
# from the scheduler so the planner always has a prefill load signal.
max_num_batched_tokens
=
(
getattr
(
self
.
server_args
,
"max_prefill_tokens"
,
None
)
or
max_total_tokens
)
return
EngineConfig
(
return
EngineConfig
(
model
=
self
.
server_args
.
model_path
,
model
=
self
.
server_args
.
model_path
,
served_model_name
=
self
.
server_args
.
served_model_name
,
served_model_name
=
self
.
server_args
.
served_model_name
,
...
@@ -86,9 +92,7 @@ class SglangLLMEngine(LLMEngine):
...
@@ -86,9 +92,7 @@ class SglangLLMEngine(LLMEngine):
kv_cache_block_size
=
page_size
,
kv_cache_block_size
=
page_size
,
total_kv_blocks
=
total_kv_blocks
,
total_kv_blocks
=
total_kv_blocks
,
max_num_seqs
=
getattr
(
self
.
server_args
,
"max_running_requests"
,
None
),
max_num_seqs
=
getattr
(
self
.
server_args
,
"max_running_requests"
,
None
),
max_num_batched_tokens
=
getattr
(
max_num_batched_tokens
=
max_num_batched_tokens
,
self
.
server_args
,
"max_prefill_tokens"
,
None
),
)
)
async
def
generate
(
async
def
generate
(
...
...
components/src/dynamo/sglang/register.py
View file @
119149f2
...
@@ -173,11 +173,8 @@ async def _get_runtime_config(
...
@@ -173,11 +173,8 @@ async def _get_runtime_config(
# Try to check if the engine has a scheduler attribute with the computed values
# Try to check if the engine has a scheduler attribute with the computed values
if
hasattr
(
engine
,
"scheduler_info"
)
and
engine
.
scheduler_info
is
not
None
:
if
hasattr
(
engine
,
"scheduler_info"
)
and
engine
.
scheduler_info
is
not
None
:
# Get max_total_num_tokens from scheduler_info
# Get max_total_num_tokens from scheduler_info
if
"max_total_num_tokens"
in
engine
.
scheduler_info
:
max_total_tokens
=
engine
.
scheduler_info
.
get
(
"max_total_num_tokens"
)
max_total_tokens
=
engine
.
scheduler_info
[
"max_total_num_tokens"
]
if
max_total_tokens
and
hasattr
(
engine
.
tokenizer_manager
,
"server_args"
):
if
max_total_tokens
and
hasattr
(
engine
.
tokenizer_manager
,
"server_args"
):
page_size
=
engine
.
tokenizer_manager
.
server_args
.
page_size
page_size
=
engine
.
tokenizer_manager
.
server_args
.
page_size
if
page_size
:
if
page_size
:
runtime_config
.
total_kv_blocks
=
(
runtime_config
.
total_kv_blocks
=
(
...
@@ -188,10 +185,15 @@ async def _get_runtime_config(
...
@@ -188,10 +185,15 @@ async def _get_runtime_config(
f
"(max_total_tokens=
{
max_total_tokens
}
, page_size=
{
page_size
}
)"
f
"(max_total_tokens=
{
max_total_tokens
}
, page_size=
{
page_size
}
)"
)
)
# Note: max_running_requests and max_prefill_tokens are NOT available in scheduler_info.
# When max_prefill_tokens is not explicitly set by the user, fall back
# SGLang separates configuration (server_args) from runtime stats (scheduler_info).
# to max_total_num_tokens from the scheduler. This ensures the planner
# In contrast, vLLM exposes both config and runtime values through engine config.
# always has a prefill load signal for aggregated scaling decisions.
# These are config parameters, so they must be retrieved from server_args only.
if
not
max_prefill_tokens
and
max_total_tokens
:
runtime_config
.
max_num_batched_tokens
=
max_total_tokens
logging
.
info
(
f
"max_prefill_tokens not set, using max_total_num_tokens "
f
"from scheduler as max_num_batched_tokens:
{
max_total_tokens
}
"
)
return
runtime_config
return
runtime_config
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment