Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
2da403e3
Unverified
Commit
2da403e3
authored
Jan 23, 2026
by
Alec
Committed by
GitHub
Jan 23, 2026
Browse files
refactor: add explicit non-leader node handling in vLLM (#5597)
Signed-off-by:
alec-flowers
<
aflowers@nvidia.com
>
parent
50f1e0e1
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
52 additions
and
37 deletions
+52
-37
.github/workflows/pr.yaml
.github/workflows/pr.yaml
+1
-1
components/src/dynamo/vllm/main.py
components/src/dynamo/vllm/main.py
+51
-36
No files found.
.github/workflows/pr.yaml
View file @
2da403e3
...
@@ -148,7 +148,7 @@ jobs:
...
@@ -148,7 +148,7 @@ jobs:
-
{
major_minor
:
'
12.9'
,
major
:
'
12'
}
-
{
major_minor
:
'
12.9'
,
major
:
'
12'
}
name
:
vllm-build-test (cuda${{ matrix.cuda_version.major_minor}}, ${{ matrix.platform.arch }})
name
:
vllm-build-test (cuda${{ matrix.cuda_version.major_minor}}, ${{ matrix.platform.arch }})
runs-on
:
${{ matrix.platform.runner }}
runs-on
:
${{ matrix.platform.runner }}
timeout-minutes
:
9
0
timeout-minutes
:
24
0
env
:
env
:
FRAMEWORK
:
vllm
FRAMEWORK
:
vllm
steps
:
&runtime-container-build-push-test
steps
:
&runtime-container-build-push-test
...
...
components/src/dynamo/vllm/main.py
View file @
2da403e3
...
@@ -48,6 +48,19 @@ configure_dynamo_logging()
...
@@ -48,6 +48,19 @@ configure_dynamo_logging()
logger
=
logging
.
getLogger
(
__name__
)
logger
=
logging
.
getLogger
(
__name__
)
async def _handle_non_leader_node(dp_rank: int) -> None:
    """
    Park a non-leader node (data_parallel_rank >= 1) of a multi-node deployment.

    Non-leader nodes run vLLM workers but don't serve Dynamo endpoints, so
    this coroutine logs the rank and then blocks without returning.

    Args:
        dp_rank: Data-parallel rank of this node; used only in the log message.
    """
    message = (
        f"Non-leader node detected (data_parallel_rank={dp_rank}). "
        "Skipping endpoint serving."
    )
    logger.info(message)

    # Nothing ever sets this event, so the wait never completes; the process
    # is expected to be terminated via signal handlers.
    never_set = asyncio.Event()
    await never_set.wait()
async
def
graceful_shutdown
(
runtime
):
async
def
graceful_shutdown
(
runtime
):
"""
"""
Shutdown dynamo distributed runtime.
Shutdown dynamo distributed runtime.
...
@@ -452,11 +465,13 @@ async def init_prefill(runtime: DistributedRuntime, config: Config):
...
@@ -452,11 +465,13 @@ async def init_prefill(runtime: DistributedRuntime, config: Config):
runtime
.
register_engine_route
(
"wake"
,
handler
.
wake
)
runtime
.
register_engine_route
(
"wake"
,
handler
.
wake
)
logger
.
info
(
"Registered engine routes: /engine/sleep, /engine/wake"
)
logger
.
info
(
"Registered engine routes: /engine/sleep, /engine/wake"
)
# Handle non-leader nodes - don't serve endpoints
if
config
.
engine_args
.
data_parallel_rank
:
await
_handle_non_leader_node
(
config
.
engine_args
.
data_parallel_rank
)
return
# Register prefill model with ModelType.Prefill
# Register prefill model with ModelType.Prefill
if
not
config
.
engine_args
.
data_parallel_rank
:
# if rank is 0 or None then register
model_input
=
ModelInput
.
Text
if
config
.
use_vllm_tokenizer
else
ModelInput
.
Tokens
model_input
=
(
ModelInput
.
Text
if
config
.
use_vllm_tokenizer
else
ModelInput
.
Tokens
)
await
register_vllm_model
(
await
register_vllm_model
(
model_input
,
model_input
,
ModelType
.
Prefill
,
ModelType
.
Prefill
,
...
@@ -575,16 +590,16 @@ async def init(runtime: DistributedRuntime, config: Config):
...
@@ -575,16 +590,16 @@ async def init(runtime: DistributedRuntime, config: Config):
runtime
.
register_engine_route
(
"wake"
,
handler
.
wake
)
runtime
.
register_engine_route
(
"wake"
,
handler
.
wake
)
logger
.
info
(
"Registered engine routes: /engine/sleep, /engine/wake"
)
logger
.
info
(
"Registered engine routes: /engine/sleep, /engine/wake"
)
if
not
config
.
engine_args
.
data_parallel_rank
:
# if rank is 0 or None then register
# Handle non-leader nodes - don't serve endpoints
if
config
.
engine_args
.
data_parallel_rank
:
await
_handle_non_leader_node
(
config
.
engine_args
.
data_parallel_rank
)
return
# Parse endpoint types from --dyn-endpoint-types flag
# Parse endpoint types from --dyn-endpoint-types flag
model_type
=
parse_endpoint_types
(
config
.
dyn_endpoint_types
)
model_type
=
parse_endpoint_types
(
config
.
dyn_endpoint_types
)
logger
.
info
(
logger
.
info
(
f
"Registering model with endpoint types:
{
config
.
dyn_endpoint_types
}
"
)
f
"Registering model with endpoint types:
{
config
.
dyn_endpoint_types
}
"
)
model_input
=
(
model_input
=
ModelInput
.
Text
if
config
.
use_vllm_tokenizer
else
ModelInput
.
Tokens
ModelInput
.
Text
if
config
.
use_vllm_tokenizer
else
ModelInput
.
Tokens
)
# Warn if custom template provided but chat endpoint not enabled
# Warn if custom template provided but chat endpoint not enabled
if
config
.
custom_jinja_template
and
"chat"
not
in
config
.
dyn_endpoint_types
:
if
config
.
custom_jinja_template
and
"chat"
not
in
config
.
dyn_endpoint_types
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment