Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
2d0dab74
Unverified
Commit
2d0dab74
authored
Oct 10, 2025
by
Indrajit Bhosale
Committed by
GitHub
Oct 10, 2025
Browse files
feat: Sglang Request cancellation (#3465)
Signed-off-by:
Indrajit Bhosale
<
iamindrajitb@gmail.com
>
parent
6e8529fd
Changes
10
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
603 additions
and
69 deletions
+603
-69
components/src/dynamo/sglang/main.py
components/src/dynamo/sglang/main.py
+1
-1
components/src/dynamo/sglang/request_handlers/embedding/embedding_handler.py
...mo/sglang/request_handlers/embedding/embedding_handler.py
+9
-2
components/src/dynamo/sglang/request_handlers/handler_base.py
...onents/src/dynamo/sglang/request_handlers/handler_base.py
+111
-3
components/src/dynamo/sglang/request_handlers/llm/decode_handler.py
.../src/dynamo/sglang/request_handlers/llm/decode_handler.py
+93
-49
components/src/dynamo/sglang/request_handlers/llm/prefill_handler.py
...src/dynamo/sglang/request_handlers/llm/prefill_handler.py
+33
-6
components/src/dynamo/sglang/request_handlers/multimodal/encode_worker_handler.py
...lang/request_handlers/multimodal/encode_worker_handler.py
+11
-2
components/src/dynamo/sglang/request_handlers/multimodal/processor_handler.py
...o/sglang/request_handlers/multimodal/processor_handler.py
+9
-2
components/src/dynamo/sglang/request_handlers/multimodal/worker_handler.py
...namo/sglang/request_handlers/multimodal/worker_handler.py
+13
-3
tests/fault_tolerance/cancellation/test_sglang.py
tests/fault_tolerance/cancellation/test_sglang.py
+322
-0
tests/serve/test_sglang.py
tests/serve/test_sglang.py
+1
-1
No files found.
components/src/dynamo/sglang/main.py
View file @
2d0dab74
...
@@ -108,7 +108,7 @@ async def init(runtime: DistributedRuntime, config: Config):
...
@@ -108,7 +108,7 @@ async def init(runtime: DistributedRuntime, config: Config):
try
:
try
:
# Start endpoint immediately and register model concurrently
# Start endpoint immediately and register model concurrently
# Requests queue until ready_event is set
# Requests queue until ready_event is set
(TODO: Part of new PR)
await
asyncio
.
gather
(
await
asyncio
.
gather
(
generate_endpoint
.
serve_endpoint
(
generate_endpoint
.
serve_endpoint
(
handler
.
generate
,
handler
.
generate
,
...
...
components/src/dynamo/sglang/request_handlers/embedding/embedding_handler.py
View file @
2d0dab74
...
@@ -6,7 +6,7 @@ from typing import Optional
...
@@ -6,7 +6,7 @@ from typing import Optional
import
sglang
as
sgl
import
sglang
as
sgl
from
dynamo._core
import
Component
from
dynamo._core
import
Component
,
Context
from
dynamo.sglang.args
import
Config
from
dynamo.sglang.args
import
Config
from
dynamo.sglang.protocol
import
EmbeddingRequest
from
dynamo.sglang.protocol
import
EmbeddingRequest
from
dynamo.sglang.publisher
import
DynamoSglangPublisher
from
dynamo.sglang.publisher
import
DynamoSglangPublisher
...
@@ -29,7 +29,14 @@ class EmbeddingWorkerHandler(BaseWorkerHandler):
...
@@ -29,7 +29,14 @@ class EmbeddingWorkerHandler(BaseWorkerHandler):
logging
.
info
(
"Engine shutdown"
)
logging
.
info
(
"Engine shutdown"
)
super
().
cleanup
()
super
().
cleanup
()
async
def
generate
(
self
,
request
:
dict
):
async
def
generate
(
self
,
request
:
dict
,
context
:
Context
):
"""
Generate embeddings for the given input.
Args:
request: Embedding request dictionary.
context: Context object for cancellation handling.
"""
logging
.
debug
(
f
"Embedding request:
{
request
}
"
)
logging
.
debug
(
f
"Embedding request:
{
request
}
"
)
# Parse the embedding request - should only receive EmbeddingRequest format
# Parse the embedding request - should only receive EmbeddingRequest format
...
...
components/src/dynamo/sglang/request_handlers/handler_base.py
View file @
2d0dab74
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
import
asyncio
import
logging
import
random
import
random
import
socket
import
socket
from
abc
import
ABC
,
abstractmethod
from
abc
import
ABC
,
abstractmethod
from
typing
import
Any
,
Dict
,
Optional
,
Tuple
from
contextlib
import
asynccontextmanager
from
typing
import
Any
,
AsyncGenerator
,
Dict
,
Optional
,
Tuple
import
sglang
as
sgl
import
sglang
as
sgl
from
sglang.srt.utils
import
get_local_ip_auto
from
sglang.srt.utils
import
get_local_ip_auto
from
dynamo._core
import
Client
,
Component
from
dynamo._core
import
Client
,
Component
,
Context
from
dynamo.sglang.args
import
Config
from
dynamo.sglang.args
import
Config
from
dynamo.sglang.publisher
import
DynamoSglangPublisher
from
dynamo.sglang.publisher
import
DynamoSglangPublisher
...
@@ -48,11 +51,12 @@ class BaseWorkerHandler(ABC):
...
@@ -48,11 +51,12 @@ class BaseWorkerHandler(ABC):
self
.
skip_tokenizer_init
=
config
.
server_args
.
skip_tokenizer_init
self
.
skip_tokenizer_init
=
config
.
server_args
.
skip_tokenizer_init
@
abstractmethod
@
abstractmethod
async
def
generate
(
self
,
request
:
Dict
[
str
,
Any
]):
async
def
generate
(
self
,
request
:
Dict
[
str
,
Any
]
,
context
:
Context
):
"""Generate response from request.
"""Generate response from request.
Args:
Args:
request: Request dict with input and parameters.
request: Request dict with input and parameters.
context: Context object for cancellation handling.
Yields:
Yields:
Response data (format varies by handler implementation).
Response data (format varies by handler implementation).
...
@@ -112,3 +116,107 @@ class BaseWorkerHandler(ABC):
...
@@ -112,3 +116,107 @@ class BaseWorkerHandler(ABC):
bootstrap_host
=
get_local_ip_auto
()
bootstrap_host
=
get_local_ip_auto
()
return
bootstrap_host
,
bootstrap_port
return
bootstrap_host
,
bootstrap_port
async
def
_handle_cancellation
(
self
,
request_id_future
:
asyncio
.
Future
,
context
:
Context
):
"""Background task to handle cancellation by monitoring context state.
Args:
request_id_future: Future that will be set with the SGLang request ID
when the first response arrives.
context: Context object for cancellation handling.
"""
try
:
logging
.
debug
(
f
"Cancellation monitor started for Context:
{
context
.
id
()
}
"
)
# Always wait for the request ID to ensure we can abort the request
sglang_request_id
=
await
request_id_future
logging
.
debug
(
f
"Cancellation monitor received SGLang Request ID
{
sglang_request_id
}
for Context:
{
context
.
id
()
}
"
)
logging
.
debug
(
f
"Request ID future cancelled for Context:
{
context
.
id
()
}
"
)
await
context
.
async_killed_or_stopped
()
logging
.
info
(
f
"Cancellation signal received for SGLang Request ID
{
sglang_request_id
}
, Context:
{
context
.
id
()
}
"
)
# Call abort_request on the tokenizer_manager through the engine
if
(
hasattr
(
self
.
engine
,
"tokenizer_manager"
)
and
self
.
engine
.
tokenizer_manager
):
logging
.
info
(
f
"Calling SGLang abort_request for Request ID
{
sglang_request_id
}
"
)
self
.
engine
.
tokenizer_manager
.
abort_request
(
rid
=
sglang_request_id
,
abort_all
=
False
)
logging
.
info
(
f
"Aborted Request ID:
{
context
.
id
()
}
"
)
else
:
logging
.
error
(
f
"SGLang tokenizer_manager not found for abort request:
{
context
.
id
()
}
"
)
except
asyncio
.
CancelledError
:
# Task was cancelled, which is expected when generation completes
request_id
=
"unknown"
if
request_id_future
.
done
()
and
not
request_id_future
.
cancelled
():
try
:
request_id
=
request_id_future
.
result
()
except
Exception
:
pass
logging
.
debug
(
f
"Cancellation monitor task cancelled for SGLang Request ID
{
request_id
}
, Context:
{
context
.
id
()
}
"
)
raise
@
asynccontextmanager
async
def
_cancellation_monitor
(
self
,
request_id_future
:
asyncio
.
Future
,
context
:
Context
)
->
AsyncGenerator
[
asyncio
.
Task
,
None
]:
"""
Context manager for monitoring request cancellation.
Automatically creates a background task to monitor for cancellation and
cleans it up when the context exits.
Args:
request_id_future: Future that will be set with the SGLang request ID
when the first response arrives.
context: Context object for cancellation handling
Yields:
asyncio.Task: The cancellation monitoring task being managed
"""
logging
.
info
(
f
"Creating cancellation monitor task for Context:
{
context
.
id
()
}
"
)
# Start the cancellation monitoring task
cancellation_task
=
asyncio
.
create_task
(
self
.
_handle_cancellation
(
request_id_future
,
context
)
)
try
:
yield
cancellation_task
finally
:
# Clean up the background cancellation task
request_id
=
"unknown"
if
request_id_future
.
done
()
and
not
request_id_future
.
cancelled
():
try
:
request_id
=
request_id_future
.
result
()
except
Exception
:
pass
if
not
cancellation_task
.
done
():
logging
.
debug
(
f
"Cancelling cancellation monitor task for SGLang Request ID
{
request_id
}
, Context:
{
context
.
id
()
}
"
)
cancellation_task
.
cancel
()
try
:
await
cancellation_task
except
asyncio
.
CancelledError
:
pass
else
:
logging
.
debug
(
f
"Cancellation monitor task already completed for SGLang Request ID
{
request_id
}
, Context:
{
context
.
id
()
}
"
)
components/src/dynamo/sglang/request_handlers/llm/decode_handler.py
View file @
2d0dab74
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
import
asyncio
import
logging
import
logging
import
time
import
time
from
typing
import
Any
,
AsyncGenerator
,
Dict
,
Optional
from
typing
import
Any
,
AsyncGenerator
,
Dict
,
Optional
import
sglang
as
sgl
import
sglang
as
sgl
from
dynamo._core
import
Client
,
Component
from
dynamo._core
import
Client
,
Component
,
Context
from
dynamo.sglang.args
import
Config
,
DisaggregationMode
from
dynamo.sglang.args
import
Config
,
DisaggregationMode
from
dynamo.sglang.protocol
import
DisaggPreprocessedRequest
from
dynamo.sglang.protocol
import
DisaggPreprocessedRequest
from
dynamo.sglang.publisher
import
DynamoSglangPublisher
from
dynamo.sglang.publisher
import
DynamoSglangPublisher
...
@@ -96,12 +97,13 @@ class DecodeWorkerHandler(BaseWorkerHandler):
...
@@ -96,12 +97,13 @@ class DecodeWorkerHandler(BaseWorkerHandler):
return
{
k
:
v
for
k
,
v
in
param_mapping
.
items
()
if
v
is
not
None
}
return
{
k
:
v
for
k
,
v
in
param_mapping
.
items
()
if
v
is
not
None
}
async
def
generate
(
async
def
generate
(
self
,
request
:
Dict
[
str
,
Any
]
self
,
request
:
Dict
[
str
,
Any
]
,
context
:
Context
)
->
AsyncGenerator
[
Dict
[
str
,
Any
],
None
]:
)
->
AsyncGenerator
[
Dict
[
str
,
Any
],
None
]:
"""Generate response in aggregated or disaggregated mode.
"""Generate response in aggregated or disaggregated mode.
Args:
Args:
request: Request dict with input and sampling parameters.
request: Request dict with input and sampling parameters.
context: Context object for cancellation handling.
Yields:
Yields:
Response dicts with token_ids or OpenAI-formatted chunks.
Response dicts with token_ids or OpenAI-formatted chunks.
...
@@ -109,6 +111,7 @@ class DecodeWorkerHandler(BaseWorkerHandler):
...
@@ -109,6 +111,7 @@ class DecodeWorkerHandler(BaseWorkerHandler):
Raises:
Raises:
RuntimeError: If no bootstrap info received from prefill worker.
RuntimeError: If no bootstrap info received from prefill worker.
"""
"""
logging
.
debug
(
f
"New Request ID:
{
context
.
id
()
}
"
)
sampling_params
=
self
.
_build_sampling_params
(
request
)
sampling_params
=
self
.
_build_sampling_params
(
request
)
input_param
=
self
.
_get_input_param
(
request
)
input_param
=
self
.
_get_input_param
(
request
)
...
@@ -139,7 +142,8 @@ class DecodeWorkerHandler(BaseWorkerHandler):
...
@@ -139,7 +142,8 @@ class DecodeWorkerHandler(BaseWorkerHandler):
DisaggPreprocessedRequest
(
DisaggPreprocessedRequest
(
request
=
request
,
request
=
request
,
sampling_params
=
sampling_params
,
sampling_params
=
sampling_params
,
).
model_dump
()
).
model_dump
(),
context
=
context
,
)
)
bootstrap_info
=
None
bootstrap_info
=
None
...
@@ -160,10 +164,10 @@ class DecodeWorkerHandler(BaseWorkerHandler):
...
@@ -160,10 +164,10 @@ class DecodeWorkerHandler(BaseWorkerHandler):
)
)
if
self
.
skip_tokenizer_init
:
if
self
.
skip_tokenizer_init
:
async
for
out
in
self
.
_process_token_stream
(
decode
):
async
for
out
in
self
.
_process_token_stream
(
decode
,
context
):
yield
out
yield
out
else
:
else
:
async
for
out
in
self
.
_process_text_stream
(
decode
):
async
for
out
in
self
.
_process_text_stream
(
decode
,
context
):
yield
out
yield
out
else
:
else
:
agg
=
await
self
.
engine
.
async_generate
(
agg
=
await
self
.
engine
.
async_generate
(
...
@@ -172,76 +176,116 @@ class DecodeWorkerHandler(BaseWorkerHandler):
...
@@ -172,76 +176,116 @@ class DecodeWorkerHandler(BaseWorkerHandler):
stream
=
True
,
stream
=
True
,
)
)
if
self
.
skip_tokenizer_init
:
if
self
.
skip_tokenizer_init
:
async
for
out
in
self
.
_process_token_stream
(
agg
):
async
for
out
in
self
.
_process_token_stream
(
agg
,
context
):
yield
out
yield
out
else
:
else
:
async
for
out
in
self
.
_process_text_stream
(
agg
):
async
for
out
in
self
.
_process_text_stream
(
agg
,
context
):
yield
out
yield
out
async
def
_process_token_stream
(
async
def
_process_token_stream
(
self
,
stream_source
:
AsyncGenerator
[
Dict
[
str
,
Any
],
None
]
self
,
stream_source
:
AsyncGenerator
[
Dict
[
str
,
Any
],
None
],
context
:
Context
,
)
->
AsyncGenerator
[
Dict
[
str
,
Any
],
None
]:
)
->
AsyncGenerator
[
Dict
[
str
,
Any
],
None
]:
"""Process token-based stream output.
"""Process token-based stream output.
Args:
Args:
stream_source: Async generator from engine.async_generate.
stream_source: Async generator from engine.async_generate.
context: Context object for cancellation handling.
Yields:
Yields:
Dict with token_ids and optional finish_reason.
Dict with token_ids and optional finish_reason.
"""
"""
num_output_tokens_so_far
=
0
num_output_tokens_so_far
=
0
async
for
res
in
stream_source
:
# Use Future pattern for request ID - will be set when first response arrives
out
=
{}
request_id_future
=
asyncio
.
Future
()
finish_reason
=
res
[
"meta_info"
][
"finish_reason"
]
async
with
self
.
_cancellation_monitor
(
request_id_future
,
context
):
if
finish_reason
:
async
for
res
in
stream_source
:
out
[
"finish_reason"
]
=
finish_reason
[
"type"
]
# Extract SGLang request ID from the first response and set the future
if
not
request_id_future
.
done
():
output_ids
=
res
.
get
(
"output_ids"
,
[])
meta_info
=
res
.
get
(
"meta_info"
,
{})
# If request is not finished yet, but there are no outputs, return an error.
sglang_request_id
=
meta_info
.
get
(
"id"
)
if
not
output_ids
and
not
finish_reason
:
if
sglang_request_id
:
yield
{
"finish_reason"
:
"error"
,
"token_ids"
:
[]}
request_id_future
.
set_result
(
sglang_request_id
)
break
logging
.
debug
(
f
"New SGLang Request ID:
{
sglang_request_id
}
"
)
next_total_toks
=
len
(
output_ids
)
# Check cancellation before yielding to allow proper cleanup.
out
[
"token_ids"
]
=
output_ids
[
num_output_tokens_so_far
:]
# This lets SGLang proceed to the second token generation, which will
num_output_tokens_so_far
=
next_total_toks
# async context switch and allow the abort monitor to signal cancellation.
yield
out
# The loop should exit by itself when context.is_stopped() returns True.
out
=
{}
finish_reason
=
res
[
"meta_info"
][
"finish_reason"
]
if
finish_reason
:
out
[
"finish_reason"
]
=
finish_reason
[
"type"
]
output_ids
=
res
.
get
(
"output_ids"
,
[])
# If request is not finished yet, but there are no outputs, return an error.
if
not
output_ids
and
not
finish_reason
:
if
not
context
.
is_stopped
():
yield
{
"finish_reason"
:
"error"
,
"token_ids"
:
[]}
break
next_total_toks
=
len
(
output_ids
)
out
[
"token_ids"
]
=
output_ids
[
num_output_tokens_so_far
:]
num_output_tokens_so_far
=
next_total_toks
if
not
context
.
is_stopped
():
yield
out
async
def
_process_text_stream
(
async
def
_process_text_stream
(
self
,
stream_source
:
AsyncGenerator
[
Dict
[
str
,
Any
],
None
]
self
,
stream_source
:
AsyncGenerator
[
Dict
[
str
,
Any
],
None
],
context
:
Context
,
)
->
AsyncGenerator
[
Dict
[
str
,
Any
],
None
]:
)
->
AsyncGenerator
[
Dict
[
str
,
Any
],
None
]:
"""Process text-based stream output in OpenAI format.
"""Process text-based stream output in OpenAI format.
Args:
Args:
stream_source: Async generator from engine.async_generate.
stream_source: Async generator from engine.async_generate.
context: Context object for cancellation handling.
Yields:
Yields:
OpenAI-formatted chat completion chunk dicts.
OpenAI-formatted chat completion chunk dicts.
"""
"""
count
=
0
count
=
0
async
for
res
in
stream_source
:
# Use Future pattern for request ID - will be set when first response arrives
index
=
res
.
get
(
"index"
,
0
)
request_id_future
=
asyncio
.
Future
()
text
=
res
.
get
(
"text"
,
""
)
async
with
self
.
_cancellation_monitor
(
request_id_future
,
context
):
async
for
res
in
stream_source
:
finish_reason
=
res
[
"meta_info"
][
"finish_reason"
]
# Extract SGLang request ID from the first response and set the future
finish_reason_type
=
finish_reason
[
"type"
]
if
finish_reason
else
None
if
not
request_id_future
.
done
():
next_count
=
len
(
text
)
meta_info
=
res
.
get
(
"meta_info"
,
{})
delta
=
text
[
count
:]
sglang_request_id
=
meta_info
.
get
(
"id"
)
if
sglang_request_id
:
choice_data
=
{
request_id_future
.
set_result
(
sglang_request_id
)
"index"
:
index
,
logging
.
debug
(
f
"New SGLang Request ID:
{
sglang_request_id
}
"
)
"delta"
:
{
"role"
:
"assistant"
,
"content"
:
delta
},
"finish_reason"
:
finish_reason_type
,
# Check cancellation before yielding to allow proper cleanup.
}
# This lets SGLang proceed to the second token generation, which will
# async context switch and allow the abort monitor to signal cancellation.
response
=
{
# The loop should exit by itself when context.is_stopped() returns True.
"id"
:
res
[
"meta_info"
][
"id"
],
"created"
:
int
(
time
.
time
()),
index
=
res
.
get
(
"index"
,
0
)
"choices"
:
[
choice_data
],
text
=
res
.
get
(
"text"
,
""
)
"model"
:
self
.
config
.
server_args
.
served_model_name
,
"object"
:
"chat.completion.chunk"
,
finish_reason
=
res
[
"meta_info"
][
"finish_reason"
]
}
finish_reason_type
=
finish_reason
[
"type"
]
if
finish_reason
else
None
yield
response
next_count
=
len
(
text
)
count
=
next_count
delta
=
text
[
count
:]
choice_data
=
{
"index"
:
index
,
"delta"
:
{
"role"
:
"assistant"
,
"content"
:
delta
},
"finish_reason"
:
finish_reason_type
,
}
response
=
{
"id"
:
res
[
"meta_info"
][
"id"
],
"created"
:
int
(
time
.
time
()),
"choices"
:
[
choice_data
],
"model"
:
self
.
config
.
server_args
.
served_model_name
,
"object"
:
"chat.completion.chunk"
,
}
if
not
context
.
is_stopped
():
yield
response
count
=
next_count
components/src/dynamo/sglang/request_handlers/llm/prefill_handler.py
View file @
2d0dab74
...
@@ -7,7 +7,7 @@ from typing import Any, AsyncGenerator, Dict
...
@@ -7,7 +7,7 @@ from typing import Any, AsyncGenerator, Dict
import
sglang
as
sgl
import
sglang
as
sgl
from
dynamo._core
import
Component
from
dynamo._core
import
Component
,
Context
from
dynamo.sglang.args
import
Config
from
dynamo.sglang.args
import
Config
from
dynamo.sglang.publisher
import
DynamoSglangPublisher
from
dynamo.sglang.publisher
import
DynamoSglangPublisher
from
dynamo.sglang.request_handlers.handler_base
import
BaseWorkerHandler
from
dynamo.sglang.request_handlers.handler_base
import
BaseWorkerHandler
...
@@ -34,27 +34,36 @@ class PrefillWorkerHandler(BaseWorkerHandler):
...
@@ -34,27 +34,36 @@ class PrefillWorkerHandler(BaseWorkerHandler):
self
.
engine
=
engine
self
.
engine
=
engine
self
.
bootstrap_host
,
self
.
bootstrap_port
=
self
.
_get_bootstrap_info
(
self
.
engine
)
self
.
bootstrap_host
,
self
.
bootstrap_port
=
self
.
_get_bootstrap_info
(
self
.
engine
)
super
().
__init__
(
component
,
engine
,
config
,
publisher
)
super
().
__init__
(
component
,
engine
,
config
,
publisher
)
self
.
_consume_tasks
=
set
()
logging
.
info
(
logging
.
info
(
f
"Prefill worker handler initialized - bootstrap host:
{
self
.
bootstrap_host
}
, bootstrap port:
{
self
.
bootstrap_port
}
"
f
"Prefill worker handler initialized - bootstrap host:
{
self
.
bootstrap_host
}
, bootstrap port:
{
self
.
bootstrap_port
}
"
)
)
def
cleanup
(
self
)
->
None
:
def
cleanup
(
self
)
->
None
:
"""Shutdown the prefill engine and cleanup resources."""
"""Shutdown the prefill engine and cleanup resources."""
# Cancel all pending consume tasks
for
task
in
self
.
_consume_tasks
:
if
not
task
.
done
():
task
.
cancel
()
self
.
_consume_tasks
.
clear
()
self
.
engine
.
shutdown
()
self
.
engine
.
shutdown
()
logging
.
info
(
"Prefill engine shutdown"
)
logging
.
info
(
"Prefill engine shutdown"
)
super
().
cleanup
()
super
().
cleanup
()
async
def
generate
(
async
def
generate
(
self
,
request
:
Dict
[
str
,
Any
]
self
,
request
:
Dict
[
str
,
Any
]
,
context
:
Context
)
->
AsyncGenerator
[
Dict
[
str
,
Any
],
None
]:
)
->
AsyncGenerator
[
Dict
[
str
,
Any
],
None
]:
"""Generate prefill output and provide bootstrap info for decode worker.
"""Generate prefill output and provide bootstrap info for decode worker.
Args:
Args:
request: Request dict with 'request' and 'sampling_params' keys.
request: Request dict with 'request' and 'sampling_params' keys.
context: Context object for cancellation handling.
Yields:
Yields:
Bootstrap info dict with host, port, and room for decode worker connection.
Bootstrap info dict with host, port, and room for decode worker connection.
"""
"""
logging
.
debug
(
f
"New Request ID:
{
context
.
id
()
}
"
)
bootstrap_room
=
self
.
_generate_bootstrap_room
()
bootstrap_room
=
self
.
_generate_bootstrap_room
()
bootstrap_info
=
{
bootstrap_info
=
{
...
@@ -76,13 +85,31 @@ class PrefillWorkerHandler(BaseWorkerHandler):
...
@@ -76,13 +85,31 @@ class PrefillWorkerHandler(BaseWorkerHandler):
bootstrap_room
=
bootstrap_room
,
bootstrap_room
=
bootstrap_room
,
)
)
asyncio
.
create_task
(
self
.
_consume_results
(
results
))
task
=
asyncio
.
create_task
(
self
.
_consume_results
(
results
,
context
))
self
.
_consume_tasks
.
add
(
task
)
task
.
add_done_callback
(
self
.
_consume_tasks
.
discard
)
async
def
_consume_results
(
self
,
results
:
AsyncGenerator
[
Any
,
None
])
->
None
:
async
def
_consume_results
(
self
,
results
:
AsyncGenerator
[
Any
,
None
],
context
:
Context
)
->
None
:
"""Consume async generator results without processing.
"""Consume async generator results without processing.
Args:
Args:
results: Async generator from engine.async_generate.
results: Async generator from engine.async_generate.
context: Context object for cancellation handling.
"""
"""
async
for
_
in
results
:
# Use Future pattern for request ID - will be set when first response arrives
pass
request_id_future
=
asyncio
.
Future
()
async
with
self
.
_cancellation_monitor
(
request_id_future
,
context
):
async
for
res
in
results
:
# Extract SGLang request ID from the first response and set the future
if
not
request_id_future
.
done
():
meta_info
=
res
.
get
(
"meta_info"
,
{})
sglang_request_id
=
meta_info
.
get
(
"id"
)
if
sglang_request_id
:
request_id_future
.
set_result
(
sglang_request_id
)
logging
.
debug
(
f
"New Prefill Request ID:
{
sglang_request_id
}
"
)
# Note: No explicit cancellation checks needed here.
# When abort_request is called by the cancellation monitor,
# SGLang will terminate this async generator automatically.
components/src/dynamo/sglang/request_handlers/multimodal/encode_worker_handler.py
View file @
2d0dab74
...
@@ -9,7 +9,7 @@ from sglang.srt.parser.conversation import chat_templates
...
@@ -9,7 +9,7 @@ from sglang.srt.parser.conversation import chat_templates
from
transformers
import
AutoImageProcessor
,
AutoModel
,
AutoTokenizer
from
transformers
import
AutoImageProcessor
,
AutoModel
,
AutoTokenizer
import
dynamo.nixl_connect
as
connect
import
dynamo.nixl_connect
as
connect
from
dynamo._core
import
Client
,
Component
from
dynamo._core
import
Client
,
Component
,
Context
from
dynamo.runtime
import
DistributedRuntime
from
dynamo.runtime
import
DistributedRuntime
from
dynamo.sglang.args
import
Config
from
dynamo.sglang.args
import
Config
from
dynamo.sglang.multimodal_utils
import
ImageLoader
,
encode_image_embeddings
from
dynamo.sglang.multimodal_utils
import
ImageLoader
,
encode_image_embeddings
...
@@ -90,7 +90,16 @@ class MultimodalEncodeWorkerHandler(BaseWorkerHandler):
...
@@ -90,7 +90,16 @@ class MultimodalEncodeWorkerHandler(BaseWorkerHandler):
def
cleanup
(
self
):
def
cleanup
(
self
):
pass
pass
async
def
generate
(
self
,
request
:
SglangMultimodalRequest
)
->
AsyncIterator
[
str
]:
async
def
generate
(
self
,
request
:
SglangMultimodalRequest
,
context
:
Context
)
->
AsyncIterator
[
str
]:
"""
Generate precomputed embeddings for multimodal input.
Args:
request: Multimodal request with image/video data.
context: Context object for cancellation handling.
"""
if
not
isinstance
(
request
,
SglangMultimodalRequest
):
if
not
isinstance
(
request
,
SglangMultimodalRequest
):
if
isinstance
(
request
,
str
):
if
isinstance
(
request
,
str
):
request
=
SglangMultimodalRequest
.
model_validate_json
(
request
)
request
=
SglangMultimodalRequest
.
model_validate_json
(
request
)
...
...
components/src/dynamo/sglang/request_handlers/multimodal/processor_handler.py
View file @
2d0dab74
...
@@ -9,7 +9,7 @@ from typing import Any, Dict
...
@@ -9,7 +9,7 @@ from typing import Any, Dict
from
transformers
import
AutoTokenizer
from
transformers
import
AutoTokenizer
from
dynamo._core
import
Client
,
Component
from
dynamo._core
import
Client
,
Component
,
Context
from
dynamo.sglang.args
import
Config
from
dynamo.sglang.args
import
Config
from
dynamo.sglang.multimodal_utils
import
(
from
dynamo.sglang.multimodal_utils
import
(
multimodal_request_to_sglang
,
multimodal_request_to_sglang
,
...
@@ -54,7 +54,14 @@ class MultimodalProcessorHandler(BaseWorkerHandler):
...
@@ -54,7 +54,14 @@ class MultimodalProcessorHandler(BaseWorkerHandler):
def
cleanup
(
self
):
def
cleanup
(
self
):
pass
pass
async
def
generate
(
self
,
raw_request
:
MultiModalRequest
):
async
def
generate
(
self
,
raw_request
:
MultiModalRequest
,
context
:
Context
):
"""
Process multimodal request and forward to encode worker.
Args:
raw_request: Raw multimodal request to process.
context: Context object for cancellation handling.
"""
if
not
isinstance
(
raw_request
,
MultiModalRequest
):
if
not
isinstance
(
raw_request
,
MultiModalRequest
):
# If the request is not MultiModalRequest, convert it to MultiModalRequest
# If the request is not MultiModalRequest, convert it to MultiModalRequest
raw_request
=
MultiModalRequest
.
model_validate
(
raw_request
)
raw_request
=
MultiModalRequest
.
model_validate
(
raw_request
)
...
...
components/src/dynamo/sglang/request_handlers/multimodal/worker_handler.py
View file @
2d0dab74
...
@@ -10,7 +10,7 @@ import sglang as sgl
...
@@ -10,7 +10,7 @@ import sglang as sgl
import
torch
import
torch
import
dynamo.nixl_connect
as
connect
import
dynamo.nixl_connect
as
connect
from
dynamo._core
import
Client
,
Component
from
dynamo._core
import
Client
,
Component
,
Context
from
dynamo.sglang.args
import
Config
,
DisaggregationMode
from
dynamo.sglang.args
import
Config
,
DisaggregationMode
from
dynamo.sglang.protocol
import
(
from
dynamo.sglang.protocol
import
(
DisaggSglangMultimodalRequest
,
DisaggSglangMultimodalRequest
,
...
@@ -275,10 +275,16 @@ class MultimodalWorkerHandler(BaseWorkerHandler):
...
@@ -275,10 +275,16 @@ class MultimodalWorkerHandler(BaseWorkerHandler):
request
=
SglangMultimodalRequest
.
model_validate
(
request
)
request
=
SglangMultimodalRequest
.
model_validate
(
request
)
return
request
return
request
async
def
generate
(
self
,
request
:
SglangMultimodalRequest
)
->
AsyncIterator
[
str
]:
async
def
generate
(
self
,
request
:
SglangMultimodalRequest
,
context
:
Context
)
->
AsyncIterator
[
str
]:
"""
"""
Generate response using SGLang with multimodal data
Generate response using SGLang with multimodal data
Handles both aggregated and disaggregated modes (following regular SGLang DecodeWorkerHandler pattern)
Handles both aggregated and disaggregated modes (following regular SGLang DecodeWorkerHandler pattern)
Args:
request: Multimodal request with input and parameters.
context: Context object for cancellation handling.
"""
"""
try
:
try
:
request
=
self
.
_validate_and_parse_request
(
request
)
request
=
self
.
_validate_and_parse_request
(
request
)
...
@@ -429,10 +435,14 @@ class MultimodalPrefillWorkerHandler(BaseWorkerHandler):
...
@@ -429,10 +435,14 @@ class MultimodalPrefillWorkerHandler(BaseWorkerHandler):
await
self
.
embeddings_processor
.
initialize
()
await
self
.
embeddings_processor
.
initialize
()
async
def
generate
(
async
def
generate
(
self
,
disagg_request
:
DisaggSglangMultimodalRequest
self
,
disagg_request
:
DisaggSglangMultimodalRequest
,
context
:
Context
)
->
AsyncIterator
[
str
]:
)
->
AsyncIterator
[
str
]:
"""
"""
Handle prefill phase: process multimodal input and provide bootstrap info
Handle prefill phase: process multimodal input and provide bootstrap info
Args:
disagg_request: Disaggregated multimodal request.
context: Context object for cancellation handling.
"""
"""
bootstrap_room
=
None
bootstrap_room
=
None
try
:
try
:
...
...
tests/fault_tolerance/cancellation/test_sglang.py
0 → 100644
View file @
2d0dab74
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import
logging
import
os
import
shutil
import
time
import
pytest
from
tests.fault_tolerance.cancellation.utils
import
(
DynamoFrontendProcess
,
poll_for_pattern
,
read_streaming_responses
,
send_cancellable_request
,
)
from
tests.utils.constants
import
FAULT_TOLERANCE_MODEL_NAME
from
tests.utils.engine_process
import
FRONTEND_PORT
from
tests.utils.managed_process
import
ManagedProcess
from
tests.utils.payloads
import
check_health_generate
,
check_models_api
logger
=
logging
.
getLogger
(
__name__
)
class
DynamoWorkerProcess
(
ManagedProcess
):
"""Process manager for Dynamo worker with SGLang backend"""
def
__init__
(
self
,
request
,
mode
:
str
=
"agg"
):
"""
Initialize SGLang worker process.
Args:
request: pytest request object
mode: One of "agg", "prefill", "decode"
"""
command
=
[
"python3"
,
"-m"
,
"dynamo.sglang"
,
"--model-path"
,
FAULT_TOLERANCE_MODEL_NAME
,
"--served-model-name"
,
FAULT_TOLERANCE_MODEL_NAME
,
"--page-size"
,
"16"
,
"--tp"
,
"1"
,
"--trust-remote-code"
,
]
# Add mode-specific arguments
if
mode
==
"agg"
:
# Aggregated mode - add skip-tokenizer-init like the serve test
command
.
append
(
"--skip-tokenizer-init"
)
else
:
# Disaggregated mode - add disaggregation arguments like disagg.sh
command
.
extend
(
[
"--disaggregation-mode"
,
mode
,
"--disaggregation-bootstrap-port"
,
"12345"
,
"--host"
,
"0.0.0.0"
,
"--disaggregation-transfer-backend"
,
"nixl"
,
]
)
health_check_urls
=
[
(
f
"http://localhost:
{
FRONTEND_PORT
}
/v1/models"
,
check_models_api
),
(
f
"http://localhost:
{
FRONTEND_PORT
}
/health"
,
check_health_generate
),
]
# Set port based on worker type
if
mode
==
"prefill"
:
port
=
"8082"
health_check_urls
=
[(
f
"http://localhost:
{
port
}
/health"
,
self
.
is_ready
)]
elif
mode
==
"decode"
:
port
=
"8081"
health_check_urls
=
[(
f
"http://localhost:
{
port
}
/health"
,
self
.
is_ready
)]
else
:
# agg (aggregated mode)
port
=
"8081"
# Set debug logging environment
env
=
os
.
environ
.
copy
()
env
[
"DYN_LOG"
]
=
"debug"
env
[
"DYN_SYSTEM_ENABLED"
]
=
"true"
env
[
"DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS"
]
=
'["generate"]'
env
[
"DYN_SYSTEM_PORT"
]
=
port
# Set GPU assignment for disaggregated mode (like disagg.sh)
if
mode
==
"decode"
:
env
[
"CUDA_VISIBLE_DEVICES"
]
=
"1"
# Use GPU 1 for decode worker
elif
mode
==
"prefill"
:
env
[
"CUDA_VISIBLE_DEVICES"
]
=
"0"
# Use GPU 0 for prefill worker
# For agg (aggregated) mode, use default GPU assignment
# Set log directory based on worker type
log_dir
=
f
"
{
request
.
node
.
name
}
_
{
mode
}
_worker"
# Clean up any existing log directory from previous runs
try
:
shutil
.
rmtree
(
log_dir
)
logger
.
info
(
f
"Cleaned up existing log directory:
{
log_dir
}
"
)
except
FileNotFoundError
:
# Directory doesn't exist, which is fine
pass
super
().
__init__
(
command
=
command
,
env
=
env
,
health_check_urls
=
health_check_urls
,
timeout
=
300
,
display_output
=
True
,
terminate_existing
=
False
,
# Ensure any orphaned SGLang engine cores or child helpers are cleaned up
stragglers
=
[
"SGLANG:EngineCore"
,
],
straggler_commands
=
[
"-m dynamo.sglang"
,
],
log_dir
=
log_dir
,
)
self
.
mode
=
mode
def
get_pid
(
self
):
"""Get the PID of the worker process"""
return
self
.
proc
.
pid
if
self
.
proc
else
None
def
is_ready
(
self
,
response
)
->
bool
:
"""Check the health of the worker process"""
try
:
data
=
response
.
json
()
if
data
.
get
(
"status"
)
==
"ready"
:
logger
.
info
(
f
"
{
self
.
mode
.
capitalize
()
}
worker status is ready"
)
return
True
logger
.
warning
(
f
"
{
self
.
mode
.
capitalize
()
}
worker status is not ready:
{
data
.
get
(
'status'
)
}
"
)
except
ValueError
:
logger
.
warning
(
f
"
{
self
.
mode
.
capitalize
()
}
worker health response is not valid JSON"
)
return
False
@
pytest
.
mark
.
e2e
@
pytest
.
mark
.
sglang
@
pytest
.
mark
.
gpu_1
@
pytest
.
mark
.
model
(
FAULT_TOLERANCE_MODEL_NAME
)
@
pytest
.
mark
.
xfail
(
strict
=
False
)
def
test_request_cancellation_sglang_aggregated
(
request
,
runtime_services
,
predownload_models
):
"""
End-to-end test for request cancellation functionality in aggregated mode.
This test verifies that when a request is cancelled by the client,
the system properly handles the cancellation and cleans up resources
on the worker side in aggregated (agg) mode.
TODO: Test is currently flaky/failing due to SGLang limitations with prefill cancellation.
See: https://github.com/sgl-project/sglang/issues/11139
"""
logger
.
info
(
"Sanity check if latest test is getting executed"
)
# Step 1: Start the frontend
with
DynamoFrontendProcess
(
request
)
as
frontend
:
logger
.
info
(
"Frontend started successfully"
)
# Step 2: Start an aggregated worker
with
DynamoWorkerProcess
(
request
,
mode
=
"agg"
)
as
worker
:
logger
.
info
(
f
"Aggregated Worker PID:
{
worker
.
get_pid
()
}
"
)
# TODO: Why wait after worker ready fixes frontend 404 / 500 flakiness?
time
.
sleep
(
2
)
# Step 3: Test request cancellation with polling approach
frontend_log_offset
,
worker_log_offset
=
0
,
0
test_scenarios
=
[
(
"completion"
,
"Completion request cancellation"
),
(
"chat_completion"
,
"Chat completion request cancellation"
),
(
"chat_completion_stream"
,
"Chat completion stream request cancellation"
,
),
]
for
request_type
,
description
in
test_scenarios
:
logger
.
info
(
f
"Testing
{
description
.
lower
()
}
..."
)
# Send the request (non-blocking)
cancellable_req
=
send_cancellable_request
(
request_type
)
# Poll for "New Request ID" pattern (Dynamo context ID)
request_id
,
worker_log_offset
=
poll_for_pattern
(
process
=
worker
,
pattern
=
"New Request ID: "
,
log_offset
=
worker_log_offset
,
match_type
=
"contains"
,
)
# For streaming, read one response first to trigger SGLang ID logging
if
request_type
==
"chat_completion_stream"
:
read_streaming_responses
(
cancellable_req
,
expected_count
=
1
)
# Wait for SGLang to actually start processing (get SGLang request ID)
_
,
worker_log_offset
=
poll_for_pattern
(
process
=
worker
,
pattern
=
"New SGLang Request ID: "
,
log_offset
=
worker_log_offset
,
match_type
=
"contains"
,
)
# Now we know SGLang has the request, cancel it
cancellable_req
.
cancel
()
logger
.
info
(
f
"Cancelled request ID:
{
request_id
}
"
)
# Poll for "Aborted Request ID" with matching ID
_
,
worker_log_offset
=
poll_for_pattern
(
process
=
worker
,
pattern
=
f
"Aborted Request ID:
{
request_id
}
"
,
log_offset
=
worker_log_offset
,
max_wait_ms
=
2000
,
)
# Verify frontend log has kill message
_
,
frontend_log_offset
=
poll_for_pattern
(
process
=
frontend
,
pattern
=
"issued control message Kill to sender"
,
log_offset
=
frontend_log_offset
,
)
logger
.
info
(
f
"
{
description
}
detected successfully"
)
@
pytest
.
mark
.
e2e
@
pytest
.
mark
.
sglang
@
pytest
.
mark
.
gpu_2
@
pytest
.
mark
.
model
(
FAULT_TOLERANCE_MODEL_NAME
)
def
test_request_cancellation_sglang_decode_cancel
(
request
,
runtime_services
,
predownload_models
):
"""
End-to-end test for request cancellation during remote decode phase.
This test verifies that when a request is cancelled by the client during the remote decode phase,
the system properly handles the cancellation and cleans up resources
on both the prefill and decode workers in a disaggregated setup.
Note: This test requires 2 GPUs to run decode and prefill workers on separate GPUs.
"""
# Step 1: Start the frontend
with
DynamoFrontendProcess
(
request
)
as
frontend
:
logger
.
info
(
"Frontend started successfully"
)
# Step 2: Start the decode worker
with
DynamoWorkerProcess
(
request
,
mode
=
"decode"
)
as
decode_worker
:
logger
.
info
(
f
"Decode Worker PID:
{
decode_worker
.
get_pid
()
}
"
)
# Step 3: Start the prefill worker
with
DynamoWorkerProcess
(
request
,
mode
=
"prefill"
)
as
prefill_worker
:
logger
.
info
(
f
"Prefill Worker PID:
{
prefill_worker
.
get_pid
()
}
"
)
# TODO: Why wait after worker ready fixes frontend 404 / 500 flakiness?
time
.
sleep
(
2
)
# Step 4: Test request cancellation during remote decode phase
logger
.
info
(
"Testing chat completion stream request cancellation during remote decode phase..."
)
# Send streaming request (non-blocking)
cancellable_req
=
send_cancellable_request
(
"chat_completion_stream"
)
# Poll for "New Request ID" pattern in decode worker (Dynamo context ID)
request_id
,
decode_log_offset
=
poll_for_pattern
(
process
=
decode_worker
,
pattern
=
"New Request ID: "
,
match_type
=
"contains"
,
)
# Verify same request ID reached prefill worker
_
,
prefill_log_offset
=
poll_for_pattern
(
process
=
prefill_worker
,
pattern
=
f
"New Request ID:
{
request_id
}
"
,
)
# Read one response first to trigger SGLang ID logging in decode worker
read_streaming_responses
(
cancellable_req
,
expected_count
=
1
)
# Wait for SGLang to start processing in decode worker
_
,
decode_log_offset
=
poll_for_pattern
(
process
=
decode_worker
,
pattern
=
"New SGLang Request ID: "
,
log_offset
=
decode_log_offset
,
match_type
=
"contains"
,
)
# Now we know SGLang has the request in decode worker, cancel it
cancellable_req
.
cancel
()
logger
.
info
(
f
"Cancelled request ID:
{
request_id
}
"
)
# Poll for "Aborted Request ID" in decode worker
_
,
decode_log_offset
=
poll_for_pattern
(
process
=
decode_worker
,
pattern
=
f
"Aborted Request ID:
{
request_id
}
"
,
log_offset
=
decode_log_offset
,
)
# Verify frontend log has kill message
_
,
frontend_log_offset
=
poll_for_pattern
(
process
=
frontend
,
pattern
=
"issued control message Kill to sender"
,
)
logger
.
info
(
"Chat completion stream cancellation in decode phase detected successfully"
)
tests/serve/test_sglang.py
View file @
2d0dab74
...
@@ -116,7 +116,7 @@ sglang_configs = {
...
@@ -116,7 +116,7 @@ sglang_configs = {
# NOTE: The response text may mention 'bus', 'train', 'streetcar', etc.
# NOTE: The response text may mention 'bus', 'train', 'streetcar', etc.
# so we need something consistently found in the response, or a different
# so we need something consistently found in the response, or a different
# approach to validation for this test to be stable.
# approach to validation for this test to be stable.
expected_response
=
[
"
OUT OF SERVICE
"
],
expected_response
=
[
"
image
"
],
temperature
=
0.0
,
temperature
=
0.0
,
)
)
],
],
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment