Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
b94f9dcd
Unverified
Commit
b94f9dcd
authored
Feb 11, 2026
by
jh-nv
Committed by
GitHub
Feb 11, 2026
Browse files
feat: request migration for SGLang (#5659)
parent
2ec1c3f5
Changes
12
Hide whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
332 additions
and
75 deletions
+332
-75
components/src/dynamo/common/utils/__init__.py
components/src/dynamo/common/utils/__init__.py
+16
-2
components/src/dynamo/common/utils/engine_response.py
components/src/dynamo/common/utils/engine_response.py
+21
-0
components/src/dynamo/sglang/main.py
components/src/dynamo/sglang/main.py
+193
-28
components/src/dynamo/sglang/request_handlers/embedding/embedding_handler.py
...mo/sglang/request_handlers/embedding/embedding_handler.py
+3
-1
components/src/dynamo/sglang/request_handlers/handler_base.py
...onents/src/dynamo/sglang/request_handlers/handler_base.py
+43
-8
components/src/dynamo/sglang/request_handlers/llm/decode_handler.py
.../src/dynamo/sglang/request_handlers/llm/decode_handler.py
+13
-3
components/src/dynamo/sglang/request_handlers/llm/diffusion_handler.py
...c/dynamo/sglang/request_handlers/llm/diffusion_handler.py
+7
-2
components/src/dynamo/sglang/request_handlers/llm/prefill_handler.py
...src/dynamo/sglang/request_handlers/llm/prefill_handler.py
+6
-2
components/src/dynamo/sglang/request_handlers/multimodal/encode_worker_handler.py
...lang/request_handlers/multimodal/encode_worker_handler.py
+6
-2
components/src/dynamo/sglang/request_handlers/multimodal/processor_handler.py
...o/sglang/request_handlers/multimodal/processor_handler.py
+6
-2
components/src/dynamo/sglang/request_handlers/multimodal/worker_handler.py
...namo/sglang/request_handlers/multimodal/worker_handler.py
+15
-5
components/src/dynamo/vllm/handlers.py
components/src/dynamo/vllm/handlers.py
+3
-20
No files found.
components/src/dynamo/common/utils/__init__.py
View file @
b94f9dcd
...
@@ -14,6 +14,20 @@ Submodules:
...
@@ -14,6 +14,20 @@ Submodules:
- prometheus: Prometheus metrics collection and logging utilities
- prometheus: Prometheus metrics collection and logging utilities
"""
"""
from
dynamo.common.utils
import
endpoint_types
,
otel_tracing
,
paths
,
prometheus
,
runtime
from
dynamo.common.utils
import
(
endpoint_types
,
engine_response
,
otel_tracing
,
paths
,
prometheus
,
runtime
,
)
__all__
=
[
"endpoint_types"
,
"otel_tracing"
,
"paths"
,
"prometheus"
,
"runtime"
]
__all__
=
[
"endpoint_types"
,
"engine_response"
,
"otel_tracing"
,
"paths"
,
"prometheus"
,
"runtime"
,
]
components/src/dynamo/common/utils/engine_response.py
0 → 100644
View file @
b94f9dcd
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Utilities for engine response processing."""
import
logging
def normalize_finish_reason(finish_reason: str) -> str:
    """Normalize an engine finish reason to a Dynamo-compatible value.

    The engine may return finish reasons that Dynamo's Rust layer does not
    recognize. Any reason that starts with ``"abort"`` is mapped to
    ``"cancelled"``; every other value (including an empty or falsy one) is
    returned unchanged.

    [TODO]: Remove this function and add the right code in the Rust layer.

    Args:
        finish_reason: Finish reason string reported by the engine.

    Returns:
        ``"cancelled"`` for abort-style reasons, otherwise ``finish_reason``
        as given.
    """
    # Guard against empty/falsy values before calling str.startswith.
    if finish_reason and finish_reason.startswith("abort"):
        # Lazy %-style args: the message is only formatted when DEBUG is on.
        logging.debug("Normalizing finish reason: %s to cancelled", finish_reason)
        return "cancelled"
    return finish_reason
components/src/dynamo/sglang/main.py
View file @
b94f9dcd
...
@@ -2,10 +2,14 @@
...
@@ -2,10 +2,14 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
import
asyncio
import
asyncio
import
inspect
import
logging
import
logging
import
os
import
os
import
signal
import
sys
import
sys
import
time
import
time
from
collections
import
defaultdict
from
typing
import
Any
,
Awaitable
,
Callable
,
DefaultDict
import
sglang
as
sgl
import
sglang
as
sgl
import
uvloop
import
uvloop
...
@@ -43,6 +47,8 @@ from dynamo.sglang.request_handlers import (
...
@@ -43,6 +47,8 @@ from dynamo.sglang.request_handlers import (
configure_dynamo_logging
()
configure_dynamo_logging
()
RUN_DEFERRED_HANDLERS
:
Callable
[[],
Awaitable
[
None
]]
|
None
=
None
async
def
_handle_non_leader_node
(
async
def
_handle_non_leader_node
(
engine
:
sgl
.
Engine
,
engine
:
sgl
.
Engine
,
...
@@ -80,6 +86,108 @@ async def _handle_non_leader_node(
...
@@ -80,6 +86,108 @@ async def _handle_non_leader_node(
publisher
.
cleanup
()
publisher
.
cleanup
()
SignalCallback
=
Callable
[...,
Any
]
def
install_graceful_shutdown
(
loop
:
asyncio
.
AbstractEventLoop
,
runtime
:
Any
,
*
,
signals
:
tuple
[
int
,
...]
=
(
signal
.
SIGTERM
,
signal
.
SIGINT
),
)
->
tuple
[
asyncio
.
Event
,
Callable
[[],
Awaitable
[
None
]]]:
"""
Set up graceful shutdown + callback chaining.
What it does:
- Owns OS-level SIGTERM/SIGINT via signal.signal(...)
- Captures (suppresses) loop.add_signal_handler(SIGTERM/SIGINT, ...) registrations
and runs them during shutdown (sync or async)
- Calls runtime.shutdown() during shutdown (sync or async)
- Sets and returns an asyncio.Event you can await to know shutdown was requested
Returns:
(shutdown_event, run_deferred_handlers)
"""
shutdown_event
=
asyncio
.
Event
()
# Deferred handlers registered via loop.add_signal_handler for these signals
deferred_handlers
:
DefaultDict
[
int
,
list
[
tuple
[
SignalCallback
,
tuple
[
Any
,
...]]]]
=
defaultdict
(
list
)
# type: ignore[assignment]
# Previous OS handlers (for optional chaining)
old_os_handlers
:
dict
[
int
,
Any
]
=
{}
shutdown_started
=
False
shutdown_signum
:
int
|
None
=
None
deferred_handlers_ran
=
False
async
def
run_deferred_handlers
()
->
None
:
nonlocal
deferred_handlers_ran
if
not
shutdown_started
or
deferred_handlers_ran
:
return
deferred_handlers_ran
=
True
signums
=
(
[
shutdown_signum
]
if
shutdown_signum
is
not
None
else
list
(
deferred_handlers
.
keys
())
)
for
sig
in
signums
:
for
cb
,
args
in
list
(
deferred_handlers
.
get
(
sig
,
[])):
try
:
res
=
cb
(
*
args
)
if
inspect
.
isawaitable
(
res
):
await
res
except
Exception
:
logging
.
exception
(
"Deferred signal callback failed: %r"
,
cb
)
async
def
_shutdown_sequence
(
signum
:
int
,
frame
:
Any
|
None
)
->
None
:
nonlocal
shutdown_started
,
shutdown_signum
if
shutdown_started
:
return
shutdown_signum
=
signum
shutdown_started
=
True
logging
.
info
(
"Received signal %s, starting graceful shutdown"
,
signum
)
shutdown_event
.
set
()
try
:
runtime
.
shutdown
()
except
Exception
:
logging
.
exception
(
"runtime.shutdown() failed"
)
def
_schedule_shutdown
(
signum
:
int
,
frame
:
Any
|
None
)
->
None
:
def
_kick
()
->
None
:
asyncio
.
create_task
(
_shutdown_sequence
(
signum
,
frame
))
loop
.
call_soon_threadsafe
(
_kick
)
def
_os_signal_handler
(
signum
:
int
,
frame
:
Any
)
->
None
:
# Keep the OS handler tiny; do real work in the loop thread.
_schedule_shutdown
(
signum
,
frame
)
# Install OS-level handlers
for
sig
in
signals
:
old_os_handlers
[
sig
]
=
signal
.
signal
(
sig
,
_os_signal_handler
)
# Intercept loop.add_signal_handler for SIGTERM/SIGINT and defer them
orig_add
=
loop
.
add_signal_handler
def
watching_add_signal_handler
(
sig
:
int
,
callback
:
SignalCallback
,
*
args
:
Any
):
if
sig
in
signals
:
logging
.
info
(
"Captured loop.add_signal_handler(%s, %r, ...) (deferred)."
,
sig
,
callback
,
)
deferred_handlers
[
sig
].
append
((
callback
,
args
))
return
None
return
orig_add
(
sig
,
callback
,
*
args
)
loop
.
add_signal_handler
=
watching_add_signal_handler
# type: ignore[assignment]
return
shutdown_event
,
run_deferred_handlers
async
def
worker
():
async
def
worker
():
config
=
await
parse_args
(
sys
.
argv
[
1
:])
config
=
await
parse_args
(
sys
.
argv
[
1
:])
dump_config
(
config
.
dynamo_args
.
dump_config_to
,
config
)
dump_config
(
config
.
dynamo_args
.
dump_config_to
,
config
)
...
@@ -91,36 +199,42 @@ async def worker():
...
@@ -91,36 +199,42 @@ async def worker():
config
.
server_args
.
load_format
=
setup_gms
(
config
.
server_args
)
config
.
server_args
.
load_format
=
setup_gms
(
config
.
server_args
)
dynamo_args
=
config
.
dynamo_args
dynamo_args
=
config
.
dynamo_args
runtime
,
_
=
create_runtime
(
runtime
,
loop
=
create_runtime
(
store_kv
=
dynamo_args
.
store_kv
,
store_kv
=
dynamo_args
.
store_kv
,
request_plane
=
dynamo_args
.
request_plane
,
request_plane
=
dynamo_args
.
request_plane
,
event_plane
=
dynamo_args
.
event_plane
,
event_plane
=
dynamo_args
.
event_plane
,
use_kv_events
=
dynamo_args
.
use_kv_events
,
use_kv_events
=
dynamo_args
.
use_kv_events
,
)
)
# Set up signal handlers using signal module to allow chaining
global
RUN_DEFERRED_HANDLERS
shutdown_event
,
RUN_DEFERRED_HANDLERS
=
install_graceful_shutdown
(
loop
,
runtime
)
logging
.
info
(
"Signal handlers set up for graceful shutdown (with chaining)"
)
if
config
.
dynamo_args
.
image_diffusion_worker
:
if
config
.
dynamo_args
.
image_diffusion_worker
:
await
init_image_diffusion
(
runtime
,
config
)
await
init_image_diffusion
(
runtime
,
config
)
elif
config
.
dynamo_args
.
embedding_worker
:
elif
config
.
dynamo_args
.
embedding_worker
:
await
init_embedding
(
runtime
,
config
)
await
init_embedding
(
runtime
,
config
,
shutdown_event
)
elif
config
.
dynamo_args
.
multimodal_processor
:
elif
config
.
dynamo_args
.
multimodal_processor
:
await
init_multimodal_processor
(
runtime
,
config
)
await
init_multimodal_processor
(
runtime
,
config
,
shutdown_event
)
elif
config
.
dynamo_args
.
multimodal_encode_worker
:
elif
config
.
dynamo_args
.
multimodal_encode_worker
:
await
init_multimodal_encode_worker
(
runtime
,
config
)
await
init_multimodal_encode_worker
(
runtime
,
config
,
shutdown_event
)
elif
config
.
dynamo_args
.
multimodal_worker
:
elif
config
.
dynamo_args
.
multimodal_worker
:
if
config
.
serving_mode
!=
DisaggregationMode
.
PREFILL
:
if
config
.
serving_mode
!=
DisaggregationMode
.
PREFILL
:
await
init_multimodal_worker
(
runtime
,
config
)
await
init_multimodal_worker
(
runtime
,
config
,
shutdown_event
)
else
:
else
:
await
init_multimodal_prefill_worker
(
runtime
,
config
)
await
init_multimodal_prefill_worker
(
runtime
,
config
,
shutdown_event
)
elif
config
.
dynamo_args
.
diffusion_worker
:
elif
config
.
dynamo_args
.
diffusion_worker
:
await
init_diffusion
(
runtime
,
config
)
await
init_diffusion
(
runtime
,
config
,
shutdown_event
)
elif
config
.
serving_mode
!=
DisaggregationMode
.
PREFILL
:
elif
config
.
serving_mode
!=
DisaggregationMode
.
PREFILL
:
await
init
(
runtime
,
config
)
await
init
(
runtime
,
config
,
shutdown_event
)
else
:
else
:
await
init_prefill
(
runtime
,
config
)
await
init_prefill
(
runtime
,
config
,
shutdown_event
)
async
def
init
(
runtime
:
DistributedRuntime
,
config
:
Config
):
async
def
init
(
runtime
:
DistributedRuntime
,
config
:
Config
,
shutdown_event
:
asyncio
.
Event
):
server_args
,
dynamo_args
=
config
.
server_args
,
config
.
dynamo_args
server_args
,
dynamo_args
=
config
.
server_args
,
config
.
dynamo_args
# Prevent SGLang from blocking on non-leader nodes
# Prevent SGLang from blocking on non-leader nodes
...
@@ -158,7 +272,7 @@ async def init(runtime: DistributedRuntime, config: Config):
...
@@ -158,7 +272,7 @@ async def init(runtime: DistributedRuntime, config: Config):
ready_event
=
asyncio
.
Event
()
ready_event
=
asyncio
.
Event
()
handler
=
DecodeWorkerHandler
(
handler
=
DecodeWorkerHandler
(
component
,
engine
,
config
,
publisher
,
generate_endpoint
component
,
engine
,
config
,
publisher
,
generate_endpoint
,
shutdown_event
)
)
handler
.
register_engine_routes
(
runtime
)
handler
.
register_engine_routes
(
runtime
)
...
@@ -205,12 +319,17 @@ async def init(runtime: DistributedRuntime, config: Config):
...
@@ -205,12 +319,17 @@ async def init(runtime: DistributedRuntime, config: Config):
try
:
try
:
await
metrics_task
await
metrics_task
except
asyncio
.
CancelledError
:
except
asyncio
.
CancelledError
:
logging
.
info
(
"Metrics task succesfully cancelled"
)
logging
.
info
(
"Metrics task succes
s
fully cancelled"
)
pass
pass
handler
.
cleanup
()
handler
.
cleanup
()
if
RUN_DEFERRED_HANDLERS
is
not
None
:
logging
.
info
(
"Running deferred handlers"
)
await
RUN_DEFERRED_HANDLERS
()
async
def
init_prefill
(
runtime
:
DistributedRuntime
,
config
:
Config
):
async
def
init_prefill
(
runtime
:
DistributedRuntime
,
config
:
Config
,
shutdown_event
:
asyncio
.
Event
):
server_args
,
dynamo_args
=
config
.
server_args
,
config
.
dynamo_args
server_args
,
dynamo_args
=
config
.
server_args
,
config
.
dynamo_args
# Prevent SGLang from blocking on non-leader nodes
# Prevent SGLang from blocking on non-leader nodes
...
@@ -242,7 +361,7 @@ async def init_prefill(runtime: DistributedRuntime, config: Config):
...
@@ -242,7 +361,7 @@ async def init_prefill(runtime: DistributedRuntime, config: Config):
await
_warmup_prefill_engine
(
engine
,
server_args
)
await
_warmup_prefill_engine
(
engine
,
server_args
)
handler
=
PrefillWorkerHandler
(
handler
=
PrefillWorkerHandler
(
component
,
engine
,
config
,
publisher
,
generate_endpoint
component
,
engine
,
config
,
publisher
,
generate_endpoint
,
shutdown_event
)
)
handler
.
register_engine_routes
(
runtime
)
handler
.
register_engine_routes
(
runtime
)
...
@@ -282,9 +401,14 @@ async def init_prefill(runtime: DistributedRuntime, config: Config):
...
@@ -282,9 +401,14 @@ async def init_prefill(runtime: DistributedRuntime, config: Config):
logging
.
info
(
"Metrics task successfully cancelled"
)
logging
.
info
(
"Metrics task successfully cancelled"
)
pass
pass
handler
.
cleanup
()
handler
.
cleanup
()
if
RUN_DEFERRED_HANDLERS
is
not
None
:
logging
.
info
(
"Running deferred handlers"
)
await
RUN_DEFERRED_HANDLERS
()
async
def
init_diffusion
(
runtime
:
DistributedRuntime
,
config
:
Config
):
async
def
init_diffusion
(
runtime
:
DistributedRuntime
,
config
:
Config
,
shutdown_event
:
asyncio
.
Event
):
"""Initialize diffusion language model worker component"""
"""Initialize diffusion language model worker component"""
server_args
,
dynamo_args
=
config
.
server_args
,
config
.
dynamo_args
server_args
,
dynamo_args
=
config
.
server_args
,
config
.
dynamo_args
...
@@ -324,7 +448,7 @@ async def init_diffusion(runtime: DistributedRuntime, config: Config):
...
@@ -324,7 +448,7 @@ async def init_diffusion(runtime: DistributedRuntime, config: Config):
ready_event
=
asyncio
.
Event
()
ready_event
=
asyncio
.
Event
()
handler
=
DiffusionWorkerHandler
(
handler
=
DiffusionWorkerHandler
(
component
,
engine
,
config
,
publisher
,
generate_endpoint
component
,
engine
,
config
,
publisher
,
generate_endpoint
,
shutdown_event
)
)
handler
.
register_engine_routes
(
runtime
)
handler
.
register_engine_routes
(
runtime
)
...
@@ -365,9 +489,14 @@ async def init_diffusion(runtime: DistributedRuntime, config: Config):
...
@@ -365,9 +489,14 @@ async def init_diffusion(runtime: DistributedRuntime, config: Config):
logging
.
info
(
"Metrics task successfully cancelled"
)
logging
.
info
(
"Metrics task successfully cancelled"
)
pass
pass
handler
.
cleanup
()
handler
.
cleanup
()
if
RUN_DEFERRED_HANDLERS
is
not
None
:
logging
.
info
(
"Running deferred handlers"
)
await
RUN_DEFERRED_HANDLERS
()
async
def
init_embedding
(
runtime
:
DistributedRuntime
,
config
:
Config
):
async
def
init_embedding
(
runtime
:
DistributedRuntime
,
config
:
Config
,
shutdown_event
:
asyncio
.
Event
):
"""Initialize embedding worker component"""
"""Initialize embedding worker component"""
server_args
,
dynamo_args
=
config
.
server_args
,
config
.
dynamo_args
server_args
,
dynamo_args
=
config
.
server_args
,
config
.
dynamo_args
...
@@ -387,7 +516,9 @@ async def init_embedding(runtime: DistributedRuntime, config: Config):
...
@@ -387,7 +516,9 @@ async def init_embedding(runtime: DistributedRuntime, config: Config):
# Readiness gate: requests wait until model is registered
# Readiness gate: requests wait until model is registered
ready_event
=
asyncio
.
Event
()
ready_event
=
asyncio
.
Event
()
handler
=
EmbeddingWorkerHandler
(
component
,
engine
,
config
,
publisher
)
handler
=
EmbeddingWorkerHandler
(
component
,
engine
,
config
,
publisher
,
shutdown_event
)
health_check_payload
=
SglangHealthCheckPayload
(
health_check_payload
=
SglangHealthCheckPayload
(
engine
,
use_text_input
=
dynamo_args
.
use_sglang_tokenizer
engine
,
use_text_input
=
dynamo_args
.
use_sglang_tokenizer
).
to_dict
()
).
to_dict
()
...
@@ -423,6 +554,9 @@ async def init_embedding(runtime: DistributedRuntime, config: Config):
...
@@ -423,6 +554,9 @@ async def init_embedding(runtime: DistributedRuntime, config: Config):
logging
.
info
(
"Metrics task successfully cancelled"
)
logging
.
info
(
"Metrics task successfully cancelled"
)
pass
pass
handler
.
cleanup
()
handler
.
cleanup
()
if
RUN_DEFERRED_HANDLERS
is
not
None
:
logging
.
info
(
"Running deferred handlers"
)
await
RUN_DEFERRED_HANDLERS
()
async
def
init_image_diffusion
(
runtime
:
DistributedRuntime
,
config
:
Config
):
async
def
init_image_diffusion
(
runtime
:
DistributedRuntime
,
config
:
Config
):
...
@@ -504,9 +638,14 @@ async def init_image_diffusion(runtime: DistributedRuntime, config: Config):
...
@@ -504,9 +638,14 @@ async def init_image_diffusion(runtime: DistributedRuntime, config: Config):
raise
raise
finally
:
finally
:
handler
.
cleanup
()
handler
.
cleanup
()
if
RUN_DEFERRED_HANDLERS
is
not
None
:
logging
.
info
(
"Running deferred handlers"
)
await
RUN_DEFERRED_HANDLERS
()
async
def
init_multimodal_processor
(
runtime
:
DistributedRuntime
,
config
:
Config
):
async
def
init_multimodal_processor
(
runtime
:
DistributedRuntime
,
config
:
Config
,
shutdown_event
:
asyncio
.
Event
):
"""Initialize multimodal processor component"""
"""Initialize multimodal processor component"""
server_args
,
dynamo_args
=
config
.
server_args
,
config
.
dynamo_args
server_args
,
dynamo_args
=
config
.
server_args
,
config
.
dynamo_args
component
=
runtime
.
namespace
(
dynamo_args
.
namespace
).
component
(
component
=
runtime
.
namespace
(
dynamo_args
.
namespace
).
component
(
...
@@ -525,7 +664,9 @@ async def init_multimodal_processor(runtime: DistributedRuntime, config: Config)
...
@@ -525,7 +664,9 @@ async def init_multimodal_processor(runtime: DistributedRuntime, config: Config)
ready_event
=
asyncio
.
Event
()
ready_event
=
asyncio
.
Event
()
handler
=
MultimodalProcessorHandler
(
component
,
config
,
encode_worker_client
)
handler
=
MultimodalProcessorHandler
(
component
,
config
,
encode_worker_client
,
shutdown_event
)
logging
.
info
(
"Waiting for Encoder Worker Instances ..."
)
logging
.
info
(
"Waiting for Encoder Worker Instances ..."
)
await
encode_worker_client
.
wait_for_instances
()
await
encode_worker_client
.
wait_for_instances
()
...
@@ -554,9 +695,14 @@ async def init_multimodal_processor(runtime: DistributedRuntime, config: Config)
...
@@ -554,9 +695,14 @@ async def init_multimodal_processor(runtime: DistributedRuntime, config: Config)
raise
raise
finally
:
finally
:
handler
.
cleanup
()
handler
.
cleanup
()
if
RUN_DEFERRED_HANDLERS
is
not
None
:
logging
.
info
(
"Running deferred handlers"
)
await
RUN_DEFERRED_HANDLERS
()
async
def
init_multimodal_encode_worker
(
runtime
:
DistributedRuntime
,
config
:
Config
):
async
def
init_multimodal_encode_worker
(
runtime
:
DistributedRuntime
,
config
:
Config
,
shutdown_event
:
asyncio
.
Event
):
"""Initialize multimodal encode worker component"""
"""Initialize multimodal encode worker component"""
server_args
,
dynamo_args
=
config
.
server_args
,
config
.
dynamo_args
server_args
,
dynamo_args
=
config
.
server_args
,
config
.
dynamo_args
...
@@ -574,7 +720,9 @@ async def init_multimodal_encode_worker(runtime: DistributedRuntime, config: Con
...
@@ -574,7 +720,9 @@ async def init_multimodal_encode_worker(runtime: DistributedRuntime, config: Con
.
client
()
.
client
()
)
)
handler
=
MultimodalEncodeWorkerHandler
(
component
,
config
,
pd_worker_client
)
handler
=
MultimodalEncodeWorkerHandler
(
component
,
config
,
pd_worker_client
,
shutdown_event
)
await
handler
.
async_init
(
runtime
)
await
handler
.
async_init
(
runtime
)
await
pd_worker_client
.
wait_for_instances
()
await
pd_worker_client
.
wait_for_instances
()
...
@@ -595,9 +743,14 @@ async def init_multimodal_encode_worker(runtime: DistributedRuntime, config: Con
...
@@ -595,9 +743,14 @@ async def init_multimodal_encode_worker(runtime: DistributedRuntime, config: Con
raise
raise
finally
:
finally
:
handler
.
cleanup
()
handler
.
cleanup
()
if
RUN_DEFERRED_HANDLERS
is
not
None
:
logging
.
info
(
"Running deferred handlers"
)
await
RUN_DEFERRED_HANDLERS
()
async
def
init_multimodal_worker
(
runtime
:
DistributedRuntime
,
config
:
Config
):
async
def
init_multimodal_worker
(
runtime
:
DistributedRuntime
,
config
:
Config
,
shutdown_event
:
asyncio
.
Event
):
"""Initialize multimodal worker component.
"""Initialize multimodal worker component.
This worker is always an internal component that should not register with
This worker is always an internal component that should not register with
...
@@ -622,9 +775,13 @@ async def init_multimodal_worker(runtime: DistributedRuntime, config: Config):
...
@@ -622,9 +775,13 @@ async def init_multimodal_worker(runtime: DistributedRuntime, config: Config):
.
endpoint
(
"generate"
)
.
endpoint
(
"generate"
)
.
client
()
.
client
()
)
)
handler
=
MultimodalWorkerHandler
(
component
,
engine
,
config
,
prefill_client
)
handler
=
MultimodalWorkerHandler
(
component
,
engine
,
config
,
prefill_client
,
shutdown_event
)
else
:
else
:
handler
=
MultimodalWorkerHandler
(
component
,
engine
,
config
)
handler
=
MultimodalWorkerHandler
(
component
,
engine
,
config
,
None
,
shutdown_event
)
await
handler
.
async_init
()
await
handler
.
async_init
()
...
@@ -644,9 +801,14 @@ async def init_multimodal_worker(runtime: DistributedRuntime, config: Config):
...
@@ -644,9 +801,14 @@ async def init_multimodal_worker(runtime: DistributedRuntime, config: Config):
raise
raise
finally
:
finally
:
handler
.
cleanup
()
handler
.
cleanup
()
if
RUN_DEFERRED_HANDLERS
is
not
None
:
logging
.
info
(
"Running deferred handlers"
)
await
RUN_DEFERRED_HANDLERS
()
async
def
init_multimodal_prefill_worker
(
runtime
:
DistributedRuntime
,
config
:
Config
):
async
def
init_multimodal_prefill_worker
(
runtime
:
DistributedRuntime
,
config
:
Config
,
shutdown_event
:
asyncio
.
Event
):
"""Initialize multimodal prefill worker component"""
"""Initialize multimodal prefill worker component"""
server_args
,
dynamo_args
=
config
.
server_args
,
config
.
dynamo_args
server_args
,
dynamo_args
=
config
.
server_args
,
config
.
dynamo_args
...
@@ -658,7 +820,7 @@ async def init_multimodal_prefill_worker(runtime: DistributedRuntime, config: Co
...
@@ -658,7 +820,7 @@ async def init_multimodal_prefill_worker(runtime: DistributedRuntime, config: Co
generate_endpoint
=
component
.
endpoint
(
dynamo_args
.
endpoint
)
generate_endpoint
=
component
.
endpoint
(
dynamo_args
.
endpoint
)
handler
=
MultimodalPrefillWorkerHandler
(
component
,
engine
,
config
)
handler
=
MultimodalPrefillWorkerHandler
(
component
,
engine
,
config
,
shutdown_event
)
await
handler
.
async_init
()
await
handler
.
async_init
()
health_check_payload
=
SglangPrefillHealthCheckPayload
(
engine
).
to_dict
()
health_check_payload
=
SglangPrefillHealthCheckPayload
(
engine
).
to_dict
()
...
@@ -677,6 +839,9 @@ async def init_multimodal_prefill_worker(runtime: DistributedRuntime, config: Co
...
@@ -677,6 +839,9 @@ async def init_multimodal_prefill_worker(runtime: DistributedRuntime, config: Co
raise
raise
finally
:
finally
:
handler
.
cleanup
()
handler
.
cleanup
()
if
RUN_DEFERRED_HANDLERS
is
not
None
:
logging
.
info
(
"Running deferred handlers"
)
await
RUN_DEFERRED_HANDLERS
()
async
def
_warmup_prefill_engine
(
engine
:
sgl
.
Engine
,
server_args
)
->
None
:
async
def
_warmup_prefill_engine
(
engine
:
sgl
.
Engine
,
server_args
)
->
None
:
...
...
components/src/dynamo/sglang/request_handlers/embedding/embedding_handler.py
View file @
b94f9dcd
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
import
asyncio
import
logging
import
logging
from
typing
import
Optional
from
typing
import
Optional
...
@@ -20,8 +21,9 @@ class EmbeddingWorkerHandler(BaseWorkerHandler):
...
@@ -20,8 +21,9 @@ class EmbeddingWorkerHandler(BaseWorkerHandler):
engine
:
sgl
.
Engine
,
engine
:
sgl
.
Engine
,
config
:
Config
,
config
:
Config
,
publisher
:
Optional
[
DynamoSglangPublisher
]
=
None
,
publisher
:
Optional
[
DynamoSglangPublisher
]
=
None
,
shutdown_event
:
Optional
[
asyncio
.
Event
]
=
None
,
):
):
super
().
__init__
(
component
,
engine
,
config
,
publisher
)
super
().
__init__
(
component
,
engine
,
config
,
publisher
,
None
,
shutdown_event
)
logging
.
info
(
"Embedding worker handler initialized"
)
logging
.
info
(
"Embedding worker handler initialized"
)
def
cleanup
(
self
):
def
cleanup
(
self
):
...
...
components/src/dynamo/sglang/request_handlers/handler_base.py
View file @
b94f9dcd
...
@@ -102,6 +102,7 @@ class BaseWorkerHandler(BaseGenerativeHandler):
...
@@ -102,6 +102,7 @@ class BaseWorkerHandler(BaseGenerativeHandler):
config
:
Config
,
config
:
Config
,
publisher
:
Optional
[
DynamoSglangPublisher
]
=
None
,
publisher
:
Optional
[
DynamoSglangPublisher
]
=
None
,
generate_endpoint
=
None
,
generate_endpoint
=
None
,
shutdown_event
:
Optional
[
asyncio
.
Event
]
=
None
,
)
->
None
:
)
->
None
:
"""Initialize base worker handler.
"""Initialize base worker handler.
...
@@ -111,6 +112,7 @@ class BaseWorkerHandler(BaseGenerativeHandler):
...
@@ -111,6 +112,7 @@ class BaseWorkerHandler(BaseGenerativeHandler):
config: SGLang and Dynamo configuration.
config: SGLang and Dynamo configuration.
publisher: Optional metrics publisher for the worker.
publisher: Optional metrics publisher for the worker.
generate_endpoint: The endpoint handle for discovery registration.
generate_endpoint: The endpoint handle for discovery registration.
shutdown_event: Optional event to signal shutdown.
"""
"""
# Call parent constructor
# Call parent constructor
super
().
__init__
(
component
,
config
,
publisher
)
super
().
__init__
(
component
,
config
,
publisher
)
...
@@ -120,6 +122,7 @@ class BaseWorkerHandler(BaseGenerativeHandler):
...
@@ -120,6 +122,7 @@ class BaseWorkerHandler(BaseGenerativeHandler):
self
.
config
=
config
self
.
config
=
config
self
.
generate_endpoint
=
generate_endpoint
self
.
generate_endpoint
=
generate_endpoint
self
.
publisher
=
publisher
self
.
publisher
=
publisher
self
.
shutdown_event
=
shutdown_event
if
publisher
is
not
None
:
if
publisher
is
not
None
:
self
.
metrics_publisher
=
publisher
.
metrics_publisher
self
.
metrics_publisher
=
publisher
.
metrics_publisher
self
.
kv_publisher
=
publisher
.
kv_publisher
self
.
kv_publisher
=
publisher
.
kv_publisher
...
@@ -436,12 +439,15 @@ class BaseWorkerHandler(BaseGenerativeHandler):
...
@@ -436,12 +439,15 @@ class BaseWorkerHandler(BaseGenerativeHandler):
async
def
_handle_cancellation
(
async
def
_handle_cancellation
(
self
,
request_id_future
:
asyncio
.
Future
,
context
:
Context
self
,
request_id_future
:
asyncio
.
Future
,
context
:
Context
):
):
"""Background task to handle cancellation by monitoring
context state
.
"""Background task to handle cancellation
and shutdown
by monitoring
both signals
.
Args:
Args:
request_id_future: Future that will be set with the SGLang request ID
request_id_future: Future that will be set with the SGLang request ID
when the first response arrives.
when the first response arrives.
context: Context object for cancellation handling.
context: Context object for cancellation handling.
Raises:
GeneratorExit: If shutdown event was triggered.
"""
"""
try
:
try
:
logging
.
debug
(
f
"Cancellation monitor started for Context:
{
context
.
id
()
}
"
)
logging
.
debug
(
f
"Cancellation monitor started for Context:
{
context
.
id
()
}
"
)
...
@@ -453,10 +459,34 @@ class BaseWorkerHandler(BaseGenerativeHandler):
...
@@ -453,10 +459,34 @@ class BaseWorkerHandler(BaseGenerativeHandler):
)
)
logging
.
debug
(
f
"Request ID future cancelled for Context:
{
context
.
id
()
}
"
)
logging
.
debug
(
f
"Request ID future cancelled for Context:
{
context
.
id
()
}
"
)
await
context
.
async_killed_or_stopped
()
# Get the cancellation future
cancellation_future
=
context
.
async_killed_or_stopped
()
# Build list of futures/tasks to wait for
wait_for
=
[
cancellation_future
]
shutdown_task
=
None
if
self
.
shutdown_event
:
# Create task for shutdown monitoring and add to wait list
shutdown_task
=
asyncio
.
create_task
(
self
.
shutdown_event
.
wait
())
wait_for
.
append
(
shutdown_task
)
# Wait for whichever happens first
done
,
pending
=
await
asyncio
.
wait
(
wait_for
,
return_when
=
asyncio
.
FIRST_COMPLETED
,
)
# Cancel the pending task/future
for
task
in
pending
:
task
.
cancel
()
try
:
await
task
except
asyncio
.
CancelledError
:
pass
logging
.
info
(
logging
.
info
(
f
"Cancellation signal received for SGLang Request ID
{
sglang_request_id
}
, Context:
{
context
.
id
()
}
"
f
"Cancellation
or shutdown
signal received for SGLang Request ID
{
sglang_request_id
}
, Context:
{
context
.
id
()
}
"
)
)
# Call abort_request on the tokenizer_manager through the engine
# Call abort_request on the tokenizer_manager through the engine
...
@@ -475,6 +505,11 @@ class BaseWorkerHandler(BaseGenerativeHandler):
...
@@ -475,6 +505,11 @@ class BaseWorkerHandler(BaseGenerativeHandler):
logging
.
error
(
logging
.
error
(
f
"SGLang tokenizer_manager not found for abort request:
{
context
.
id
()
}
"
f
"SGLang tokenizer_manager not found for abort request:
{
context
.
id
()
}
"
)
)
# Check which event triggered and raise GeneratorExit if shutdown
if
shutdown_task
and
shutdown_task
in
done
:
raise
GeneratorExit
(
"Engine was shut down during token generation"
)
except
asyncio
.
CancelledError
:
except
asyncio
.
CancelledError
:
# Task was cancelled, which is expected when generation completes
# Task was cancelled, which is expected when generation completes
request_id
=
"unknown"
request_id
=
"unknown"
...
@@ -493,9 +528,11 @@ class BaseWorkerHandler(BaseGenerativeHandler):
...
@@ -493,9 +528,11 @@ class BaseWorkerHandler(BaseGenerativeHandler):
self
,
request_id_future
:
asyncio
.
Future
,
context
:
Context
self
,
request_id_future
:
asyncio
.
Future
,
context
:
Context
)
->
AsyncGenerator
[
asyncio
.
Task
,
None
]:
)
->
AsyncGenerator
[
asyncio
.
Task
,
None
]:
"""
"""
Context manager for monitoring request cancellation.
Context manager for monitoring request cancellation
and shutdown
.
Automatically creates a background task to monitor for cancellation and
Automatically creates a background task to monitor for cancellation and
cleans it up when the context exits.
shutdown events, cleaning it up when the context exits.
If shutdown event was triggered, raises GeneratorExit on exit.
Args:
Args:
request_id_future: Future that will be set with the SGLang request ID
request_id_future: Future that will be set with the SGLang request ID
...
@@ -533,6 +570,4 @@ class BaseWorkerHandler(BaseGenerativeHandler):
...
@@ -533,6 +570,4 @@ class BaseWorkerHandler(BaseGenerativeHandler):
except
asyncio
.
CancelledError
:
except
asyncio
.
CancelledError
:
pass
pass
else
:
else
:
logging
.
debug
(
cancellation_task
.
result
()
f
"Cancellation monitor task already completed for SGLang Request ID
{
request_id
}
, Context:
{
context
.
id
()
}
"
)
components/src/dynamo/sglang/request_handlers/llm/decode_handler.py
View file @
b94f9dcd
...
@@ -4,11 +4,12 @@
...
@@ -4,11 +4,12 @@
import
asyncio
import
asyncio
import
logging
import
logging
import
time
import
time
from
typing
import
Any
,
AsyncGenerator
,
Dict
from
typing
import
Any
,
AsyncGenerator
,
Dict
,
Optional
import
sglang
as
sgl
import
sglang
as
sgl
from
dynamo._core
import
Component
,
Context
from
dynamo._core
import
Component
,
Context
from
dynamo.common.utils.engine_response
import
normalize_finish_reason
from
dynamo.sglang.args
import
Config
,
DisaggregationMode
from
dynamo.sglang.args
import
Config
,
DisaggregationMode
from
dynamo.sglang.publisher
import
DynamoSglangPublisher
from
dynamo.sglang.publisher
import
DynamoSglangPublisher
from
dynamo.sglang.request_handlers.handler_base
import
BaseWorkerHandler
from
dynamo.sglang.request_handlers.handler_base
import
BaseWorkerHandler
...
@@ -24,6 +25,7 @@ class DecodeWorkerHandler(BaseWorkerHandler):
...
@@ -24,6 +25,7 @@ class DecodeWorkerHandler(BaseWorkerHandler):
config
:
Config
,
config
:
Config
,
publisher
:
DynamoSglangPublisher
,
publisher
:
DynamoSglangPublisher
,
generate_endpoint
=
None
,
generate_endpoint
=
None
,
shutdown_event
:
Optional
[
asyncio
.
Event
]
=
None
,
)
->
None
:
)
->
None
:
"""Initialize decode worker handler.
"""Initialize decode worker handler.
...
@@ -32,6 +34,7 @@ class DecodeWorkerHandler(BaseWorkerHandler):
...
@@ -32,6 +34,7 @@ class DecodeWorkerHandler(BaseWorkerHandler):
engine: The SGLang engine instance.
engine: The SGLang engine instance.
config: SGLang and Dynamo configuration.
config: SGLang and Dynamo configuration.
publisher: Metrics publisher for the worker.
publisher: Metrics publisher for the worker.
shutdown_event: Optional event to signal shutdown.
generate_endpoint: The endpoint handle for discovery registration.
generate_endpoint: The endpoint handle for discovery registration.
"""
"""
super
().
__init__
(
super
().
__init__
(
...
@@ -40,6 +43,7 @@ class DecodeWorkerHandler(BaseWorkerHandler):
...
@@ -40,6 +43,7 @@ class DecodeWorkerHandler(BaseWorkerHandler):
config
,
config
,
publisher
,
publisher
,
generate_endpoint
,
generate_endpoint
,
shutdown_event
,
)
)
if
self
.
serving_mode
==
DisaggregationMode
.
DECODE
:
if
self
.
serving_mode
==
DisaggregationMode
.
DECODE
:
logging
.
info
(
logging
.
info
(
...
@@ -222,7 +226,9 @@ class DecodeWorkerHandler(BaseWorkerHandler):
...
@@ -222,7 +226,9 @@ class DecodeWorkerHandler(BaseWorkerHandler):
out
=
{}
out
=
{}
finish_reason
=
res
[
"meta_info"
][
"finish_reason"
]
finish_reason
=
res
[
"meta_info"
][
"finish_reason"
]
if
finish_reason
:
if
finish_reason
:
out
[
"finish_reason"
]
=
finish_reason
[
"type"
]
out
[
"finish_reason"
]
=
normalize_finish_reason
(
finish_reason
[
"type"
]
)
# With stream_output=True, output_ids contains only new tokens (disjoint)
# With stream_output=True, output_ids contains only new tokens (disjoint)
output_ids
=
res
.
get
(
"output_ids"
,
[])
output_ids
=
res
.
get
(
"output_ids"
,
[])
...
@@ -287,7 +293,11 @@ class DecodeWorkerHandler(BaseWorkerHandler):
...
@@ -287,7 +293,11 @@ class DecodeWorkerHandler(BaseWorkerHandler):
text
=
res
.
get
(
"text"
,
""
)
text
=
res
.
get
(
"text"
,
""
)
finish_reason
=
res
[
"meta_info"
][
"finish_reason"
]
finish_reason
=
res
[
"meta_info"
][
"finish_reason"
]
finish_reason_type
=
finish_reason
[
"type"
]
if
finish_reason
else
None
finish_reason_type
=
(
normalize_finish_reason
(
finish_reason
[
"type"
])
if
finish_reason
else
None
)
next_count
=
len
(
text
)
next_count
=
len
(
text
)
delta
=
text
[
count
:]
delta
=
text
[
count
:]
...
...
components/src/dynamo/sglang/request_handlers/llm/diffusion_handler.py
View file @
b94f9dcd
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
import
asyncio
import
logging
import
logging
from
typing
import
Any
,
AsyncGenerator
,
Dict
from
typing
import
Any
,
AsyncGenerator
,
Dict
,
Optional
import
sglang
as
sgl
import
sglang
as
sgl
...
@@ -23,6 +24,7 @@ class DiffusionWorkerHandler(DecodeWorkerHandler):
...
@@ -23,6 +24,7 @@ class DiffusionWorkerHandler(DecodeWorkerHandler):
config
:
Config
,
config
:
Config
,
publisher
:
DynamoSglangPublisher
=
None
,
publisher
:
DynamoSglangPublisher
=
None
,
generate_endpoint
=
None
,
generate_endpoint
=
None
,
shutdown_event
:
Optional
[
asyncio
.
Event
]
=
None
,
)
->
None
:
)
->
None
:
"""Initialize diffusion worker handler.
"""Initialize diffusion worker handler.
...
@@ -32,8 +34,11 @@ class DiffusionWorkerHandler(DecodeWorkerHandler):
...
@@ -32,8 +34,11 @@ class DiffusionWorkerHandler(DecodeWorkerHandler):
config: SGLang and Dynamo configuration.
config: SGLang and Dynamo configuration.
publisher: Optional metrics publisher.
publisher: Optional metrics publisher.
generate_endpoint: The endpoint handle for discovery.
generate_endpoint: The endpoint handle for discovery.
shutdown_event: Optional event to signal shutdown.
"""
"""
super
().
__init__
(
component
,
engine
,
config
,
publisher
,
generate_endpoint
)
super
().
__init__
(
component
,
engine
,
config
,
publisher
,
generate_endpoint
,
shutdown_event
)
# Validate that diffusion algorithm is configured
# Validate that diffusion algorithm is configured
if
(
if
(
...
...
components/src/dynamo/sglang/request_handlers/llm/prefill_handler.py
View file @
b94f9dcd
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
import
asyncio
import
asyncio
import
logging
import
logging
from
typing
import
Any
,
AsyncGenerator
,
Dict
from
typing
import
Any
,
AsyncGenerator
,
Dict
,
Optional
import
sglang
as
sgl
import
sglang
as
sgl
...
@@ -23,6 +23,7 @@ class PrefillWorkerHandler(BaseWorkerHandler):
...
@@ -23,6 +23,7 @@ class PrefillWorkerHandler(BaseWorkerHandler):
config
:
Config
,
config
:
Config
,
publisher
:
DynamoSglangPublisher
,
publisher
:
DynamoSglangPublisher
,
generate_endpoint
=
None
,
generate_endpoint
=
None
,
shutdown_event
:
Optional
[
asyncio
.
Event
]
=
None
,
)
->
None
:
)
->
None
:
"""Initialize prefill worker handler.
"""Initialize prefill worker handler.
...
@@ -32,10 +33,13 @@ class PrefillWorkerHandler(BaseWorkerHandler):
...
@@ -32,10 +33,13 @@ class PrefillWorkerHandler(BaseWorkerHandler):
config: SGLang and Dynamo configuration.
config: SGLang and Dynamo configuration.
publisher: The SGLang publisher instance.
publisher: The SGLang publisher instance.
generate_endpoint: The endpoint handle for discovery registration.
generate_endpoint: The endpoint handle for discovery registration.
shutdown_event: Optional event to signal shutdown.
"""
"""
self
.
engine
=
engine
self
.
engine
=
engine
self
.
bootstrap_host
,
self
.
bootstrap_port
=
self
.
_get_bootstrap_info
(
self
.
engine
)
self
.
bootstrap_host
,
self
.
bootstrap_port
=
self
.
_get_bootstrap_info
(
self
.
engine
)
super
().
__init__
(
component
,
engine
,
config
,
publisher
,
generate_endpoint
)
super
().
__init__
(
component
,
engine
,
config
,
publisher
,
generate_endpoint
,
shutdown_event
)
self
.
_consume_tasks
=
set
()
self
.
_consume_tasks
=
set
()
logging
.
info
(
logging
.
info
(
f
"Prefill worker handler initialized - bootstrap host:
{
self
.
bootstrap_host
}
, bootstrap port:
{
self
.
bootstrap_port
}
"
f
"Prefill worker handler initialized - bootstrap host:
{
self
.
bootstrap_host
}
, bootstrap port:
{
self
.
bootstrap_port
}
"
...
...
components/src/dynamo/sglang/request_handlers/multimodal/encode_worker_handler.py
View file @
b94f9dcd
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
import
asyncio
import
logging
import
logging
from
typing
import
AsyncIterator
from
typing
import
AsyncIterator
,
Optional
import
torch
import
torch
from
sglang.srt.parser.conversation
import
chat_templates
from
sglang.srt.parser.conversation
import
chat_templates
...
@@ -45,8 +46,11 @@ class MultimodalEncodeWorkerHandler(BaseWorkerHandler):
...
@@ -45,8 +46,11 @@ class MultimodalEncodeWorkerHandler(BaseWorkerHandler):
component
:
Component
,
component
:
Component
,
config
:
Config
,
config
:
Config
,
pd_worker_client
:
Client
,
pd_worker_client
:
Client
,
shutdown_event
:
Optional
[
asyncio
.
Event
]
=
None
,
)
->
None
:
)
->
None
:
super
().
__init__
(
component
,
engine
=
None
,
config
=
config
)
super
().
__init__
(
component
,
engine
=
None
,
config
=
config
,
shutdown_event
=
shutdown_event
)
self
.
pd_worker_client
=
pd_worker_client
self
.
pd_worker_client
=
pd_worker_client
self
.
model
=
config
.
server_args
.
model_path
self
.
model
=
config
.
server_args
.
model_path
self
.
served_model_name
=
config
.
server_args
.
served_model_name
self
.
served_model_name
=
config
.
server_args
.
served_model_name
...
...
components/src/dynamo/sglang/request_handlers/multimodal/processor_handler.py
View file @
b94f9dcd
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
import
asyncio
import
json
import
json
import
logging
import
logging
import
time
import
time
import
uuid
import
uuid
from
typing
import
Any
,
Dict
from
typing
import
Any
,
Dict
,
Optional
from
transformers
import
AutoTokenizer
from
transformers
import
AutoTokenizer
...
@@ -36,8 +37,11 @@ class MultimodalProcessorHandler(BaseWorkerHandler):
...
@@ -36,8 +37,11 @@ class MultimodalProcessorHandler(BaseWorkerHandler):
component
:
Component
,
component
:
Component
,
config
:
Config
,
config
:
Config
,
encode_worker_client
:
Client
,
encode_worker_client
:
Client
,
shutdown_event
:
Optional
[
asyncio
.
Event
]
=
None
,
):
):
super
().
__init__
(
component
,
engine
=
None
,
config
=
config
)
super
().
__init__
(
component
,
engine
=
None
,
config
=
config
,
shutdown_event
=
shutdown_event
)
self
.
encode_worker_client
=
encode_worker_client
self
.
encode_worker_client
=
encode_worker_client
self
.
chat_template
=
getattr
(
config
.
server_args
,
"chat_template"
,
"qwen2-vl"
)
self
.
chat_template
=
getattr
(
config
.
server_args
,
"chat_template"
,
"qwen2-vl"
)
self
.
model
=
config
.
server_args
.
model_path
self
.
model
=
config
.
server_args
.
model_path
...
...
components/src/dynamo/sglang/request_handlers/multimodal/worker_handler.py
View file @
b94f9dcd
...
@@ -4,13 +4,14 @@
...
@@ -4,13 +4,14 @@
import
asyncio
import
asyncio
import
json
import
json
import
logging
import
logging
from
typing
import
AsyncIterator
from
typing
import
AsyncIterator
,
Optional
import
sglang
as
sgl
import
sglang
as
sgl
import
torch
import
torch
import
dynamo.nixl_connect
as
connect
import
dynamo.nixl_connect
as
connect
from
dynamo._core
import
Client
,
Component
,
Context
from
dynamo._core
import
Client
,
Component
,
Context
from
dynamo.common.utils.engine_response
import
normalize_finish_reason
from
dynamo.sglang.args
import
Config
,
DisaggregationMode
from
dynamo.sglang.args
import
Config
,
DisaggregationMode
from
dynamo.sglang.protocol
import
(
from
dynamo.sglang.protocol
import
(
DisaggSglangMultimodalRequest
,
DisaggSglangMultimodalRequest
,
...
@@ -165,7 +166,9 @@ class StreamProcessor:
...
@@ -165,7 +166,9 @@ class StreamProcessor:
if
finish_reason
:
if
finish_reason
:
output
.
update
(
output
.
update
(
{
{
"finish_reason"
:
finish_reason
.
get
(
"type"
,
"stop"
),
"finish_reason"
:
normalize_finish_reason
(
finish_reason
.
get
(
"type"
,
"stop"
)
),
"finished"
:
True
,
"finished"
:
True
,
}
}
)
)
...
@@ -248,8 +251,9 @@ class MultimodalWorkerHandler(BaseWorkerHandler):
...
@@ -248,8 +251,9 @@ class MultimodalWorkerHandler(BaseWorkerHandler):
engine
:
sgl
.
Engine
,
engine
:
sgl
.
Engine
,
config
:
Config
,
config
:
Config
,
prefill_client
:
Client
=
None
,
prefill_client
:
Client
=
None
,
shutdown_event
:
Optional
[
asyncio
.
Event
]
=
None
,
):
):
super
().
__init__
(
component
,
engine
,
config
,
None
)
super
().
__init__
(
component
,
engine
,
config
,
None
,
None
,
shutdown_event
)
# Initialize processors
# Initialize processors
self
.
embeddings_processor
=
EmbeddingsProcessor
()
self
.
embeddings_processor
=
EmbeddingsProcessor
()
...
@@ -423,8 +427,14 @@ class MultimodalPrefillWorkerHandler(BaseWorkerHandler):
...
@@ -423,8 +427,14 @@ class MultimodalPrefillWorkerHandler(BaseWorkerHandler):
Processes multimodal inputs and coordinates with decode worker.
Processes multimodal inputs and coordinates with decode worker.
"""
"""
def
__init__
(
self
,
component
:
Component
,
engine
:
sgl
.
Engine
,
config
:
Config
):
def
__init__
(
super
().
__init__
(
component
,
engine
,
config
)
self
,
component
:
Component
,
engine
:
sgl
.
Engine
,
config
:
Config
,
shutdown_event
:
Optional
[
asyncio
.
Event
]
=
None
,
):
super
().
__init__
(
component
,
engine
,
config
,
None
,
None
,
shutdown_event
)
# Initialize processors
# Initialize processors
self
.
embeddings_processor
=
EmbeddingsProcessor
()
self
.
embeddings_processor
=
EmbeddingsProcessor
()
...
...
components/src/dynamo/vllm/handlers.py
View file @
b94f9dcd
...
@@ -22,6 +22,7 @@ from vllm.sampling_params import SamplingParams, StructuredOutputsParams
...
@@ -22,6 +22,7 @@ from vllm.sampling_params import SamplingParams, StructuredOutputsParams
from
vllm.v1.engine.exceptions
import
EngineDeadError
from
vllm.v1.engine.exceptions
import
EngineDeadError
import
dynamo.nixl_connect
as
nixl_connect
import
dynamo.nixl_connect
as
nixl_connect
from
dynamo.common.utils.engine_response
import
normalize_finish_reason
from
dynamo.common.utils.input_params
import
InputParamManager
from
dynamo.common.utils.input_params
import
InputParamManager
from
dynamo.common.utils.media_nixl
import
read_decoded_media_via_nixl
from
dynamo.common.utils.media_nixl
import
read_decoded_media_via_nixl
from
dynamo.common.utils.otel_tracing
import
build_trace_headers
from
dynamo.common.utils.otel_tracing
import
build_trace_headers
...
@@ -436,20 +437,6 @@ class BaseWorkerHandler(ABC):
...
@@ -436,20 +437,6 @@ class BaseWorkerHandler(ABC):
self
.
_lora_load_locks
[
lora_name
]
=
lock
self
.
_lora_load_locks
[
lora_name
]
=
lock
return
lock
return
lock
def
_normalize_finish_reason
(
self
,
finish_reason
:
str
)
->
str
:
"""
Normalize vLLM finish reasons to Dynamo-compatible values.
vLLM may return finish reasons that aren't recognized by Dynamo's Rust layer.
This method maps them to compatible values.
[TODO]: Remove this method and add the right code in the Rust layer.
"""
# Map vLLM's "abort" to Dynamo's "cancelled"
if
finish_reason
.
startswith
(
"abort"
):
logging
.
debug
(
f
"Normalizing finish reason:
{
finish_reason
}
to cancelled"
)
return
"cancelled"
return
finish_reason
async
def
load_lora
(
self
,
request
=
None
):
async
def
load_lora
(
self
,
request
=
None
):
"""
"""
Load a LoRA adapter dynamically into the vLLM's AsyncLLM engine.
Load a LoRA adapter dynamically into the vLLM's AsyncLLM engine.
...
@@ -1223,9 +1210,7 @@ class BaseWorkerHandler(ABC):
...
@@ -1223,9 +1210,7 @@ class BaseWorkerHandler(ABC):
out
[
"top_logprobs"
]
=
top_logprobs
out
[
"top_logprobs"
]
=
top_logprobs
if
output
.
finish_reason
:
if
output
.
finish_reason
:
out
[
"finish_reason"
]
=
self
.
_normalize_finish_reason
(
out
[
"finish_reason"
]
=
normalize_finish_reason
(
output
.
finish_reason
)
output
.
finish_reason
)
out
[
"completion_usage"
]
=
BaseWorkerHandler
.
_build_completion_usage
(
out
[
"completion_usage"
]
=
BaseWorkerHandler
.
_build_completion_usage
(
request_output
=
res
,
request_output
=
res
,
embedding_sequence_length
=
embedding_sequence_length
,
embedding_sequence_length
=
embedding_sequence_length
,
...
@@ -1438,9 +1423,7 @@ class DecodeWorkerHandler(BaseWorkerHandler):
...
@@ -1438,9 +1423,7 @@ class DecodeWorkerHandler(BaseWorkerHandler):
"role"
:
"assistant"
,
"role"
:
"assistant"
,
"content"
:
delta_text
,
"content"
:
delta_text
,
},
},
"finish_reason"
:
self
.
_normalize_finish_reason
(
"finish_reason"
:
normalize_finish_reason
(
output
.
finish_reason
),
output
.
finish_reason
),
}
}
chunk
=
{
chunk
=
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment