feat: Sglang Request cancellation (#3465)

Signed-off-by: Indrajit Bhosale <iamindrajitb@gmail.com>

feat: Sglang Request cancellation (#3465)
Signed-off-by: Indrajit Bhosale <iamindrajitb@gmail.com>
2d0dab74 · Indrajit Bhosale · GitHub · 6e8529fd · 2d0dab74 · 2d0dab74
Unverified Commit 2d0dab74 authored Oct 10, 2025 by Indrajit Bhosale Committed by GitHub Oct 10, 2025
10 changed files
--- a/components/src/dynamo/sglang/main.py
+++ b/components/src/dynamo/sglang/main.py
@@ -108,7 +108,7 @@ async def init(runtime: DistributedRuntime, config: Config):
    try:
        # Start endpoint immediately and register model concurrently
-        # Requests queue until ready_event is set
+        # Requests queue until ready_event is set (TODO: Part of new PR)
        await asyncio.gather(
            generate_endpoint.serve_endpoint(
                handler.generate,

--- a/components/src/dynamo/sglang/request_handlers/embedding/embedding_handler.py
+++ b/components/src/dynamo/sglang/request_handlers/embedding/embedding_handler.py
@@ -6,7 +6,7 @@ from typing import Optional
 import sglang as sgl
-from dynamo._core import Component
+from dynamo._core import Component, Context
 from dynamo.sglang.args import Config
 from dynamo.sglang.protocol import EmbeddingRequest
 from dynamo.sglang.publisher import DynamoSglangPublisher
@@ -29,7 +29,14 @@ class EmbeddingWorkerHandler(BaseWorkerHandler):
        logging.info("Engine shutdown")
        super().cleanup()
-    async def generate(self, request: dict):
+    async def generate(self, request: dict, context: Context):
+        """
+        Generate embeddings for the given input.
+        Args:
+            request: Embedding request dictionary.
+            context: Context object for cancellation handling.
+        """
        logging.debug(f"Embedding request: {request}")
        # Parse the embedding request - should only receive EmbeddingRequest format

--- a/components/src/dynamo/sglang/request_handlers/handler_base.py
+++ b/components/src/dynamo/sglang/request_handlers/handler_base.py
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
+import asyncio
+import logging
 import random
 import socket
 from abc import ABC, abstractmethod
-from typing import Any, Dict, Optional, Tuple
+from contextlib import asynccontextmanager
+from typing import Any, AsyncGenerator, Dict, Optional, Tuple
 import sglang as sgl
 from sglang.srt.utils import get_local_ip_auto
-from dynamo._core import Client, Component
+from dynamo._core import Client, Component, Context
 from dynamo.sglang.args import Config
 from dynamo.sglang.publisher import DynamoSglangPublisher
@@ -48,11 +51,12 @@ class BaseWorkerHandler(ABC):
        self.skip_tokenizer_init = config.server_args.skip_tokenizer_init
    @abstractmethod
-    async def generate(self, request: Dict[str, Any]):
+    async def generate(self, request: Dict[str, Any], context: Context):
        """Generate response from request.
        Args:
            request: Request dict with input and parameters.
+            context: Context object for cancellation handling.
        Yields:
            Response data (format varies by handler implementation).
@@ -112,3 +116,107 @@ class BaseWorkerHandler(ABC):
            bootstrap_host = get_local_ip_auto()
        return bootstrap_host, bootstrap_port
+    async def _handle_cancellation(
+        self, request_id_future: asyncio.Future, context: Context
+    ):
+        """Background task that aborts the SGLang request once the context is cancelled.
+        The task first waits for the SGLang request ID, then waits until the
+        context reports killed-or-stopped, and finally asks the engine's
+        tokenizer_manager to abort that request.
+        Args:
+            request_id_future: Future that will be set with the SGLang request ID
+                              when the first response arrives.
+            context: Context object for cancellation handling.
+        Raises:
+            asyncio.CancelledError: Re-raised when this monitor task itself is
+                cancelled, which is expected when generation completes first.
+        """
+        try:
+            logging.debug(f"Cancellation monitor started for Context: {context.id()}")
+            # Always wait for the request ID to ensure we can abort the request
+            sglang_request_id = await request_id_future
+            logging.debug(
+                f"Cancellation monitor received SGLang Request ID {sglang_request_id} for Context: {context.id()}"
+            )
+            # Block until the context signals cancellation (killed or stopped)
+            await context.async_killed_or_stopped()
+            logging.info(
+                f"Cancellation signal received for SGLang Request ID {sglang_request_id}, Context: {context.id()}"
+            )
+            # Call abort_request on the tokenizer_manager through the engine.
+            # getattr with a default covers engines that lack the attribute entirely.
+            tokenizer_manager = getattr(self.engine, "tokenizer_manager", None)
+            if tokenizer_manager:
+                logging.info(
+                    f"Calling SGLang abort_request for Request ID {sglang_request_id}"
+                )
+                tokenizer_manager.abort_request(rid=sglang_request_id, abort_all=False)
+                # This message carries the context ID (not the SGLang ID); the
+                # cancellation test polls the worker log for exactly this text.
+                logging.info(f"Aborted Request ID: {context.id()}")
+            else:
+                logging.error(
+                    f"SGLang tokenizer_manager not found for abort request: {context.id()}"
+                )
+        except asyncio.CancelledError:
+            # Task was cancelled, which is expected when generation completes
+            request_id = "unknown"
+            if request_id_future.done() and not request_id_future.cancelled():
+                try:
+                    request_id = request_id_future.result()
+                except Exception:
+                    # Best effort: the ID is only used in the log line below
+                    pass
+            logging.debug(
+                f"Cancellation monitor task cancelled for SGLang Request ID {request_id}, Context: {context.id()}"
+            )
+            raise
+    @asynccontextmanager
+    async def _cancellation_monitor(
+        self, request_id_future: asyncio.Future, context: Context
+    ) -> AsyncGenerator[asyncio.Task, None]:
+        """
+        Run a cancellation monitor task for the lifetime of an ``async with`` block.
+        On entry a background task running ``_handle_cancellation`` is created.
+        On exit the task is cancelled and awaited if it has not finished yet.
+        Args:
+            request_id_future: Future that will be set with the SGLang request ID
+                              when the first response arrives.
+            context: Context object for cancellation handling
+        Yields:
+            asyncio.Task: The cancellation monitoring task being managed
+        """
+        logging.info(f"Creating cancellation monitor task for Context: {context.id()}")
+        # Spawn the background monitor before handing control to the caller
+        monitor_task = asyncio.create_task(
+            self._handle_cancellation(request_id_future, context)
+        )
+        try:
+            yield monitor_task
+        finally:
+            # Resolve the SGLang request ID for the log lines below; it stays
+            # "unknown" when the future never received a result.
+            sglang_rid = "unknown"
+            id_resolved = (
+                request_id_future.done() and not request_id_future.cancelled()
+            )
+            if id_resolved:
+                try:
+                    sglang_rid = request_id_future.result()
+                except Exception:
+                    pass
+            if cancellation_done := monitor_task.done():
+                logging.debug(
+                    f"Cancellation monitor task already completed for SGLang Request ID {sglang_rid}, Context: {context.id()}"
+                )
+            if not cancellation_done:
+                logging.debug(
+                    f"Cancelling cancellation monitor task for SGLang Request ID {sglang_rid}, Context: {context.id()}"
+                )
+                monitor_task.cancel()
+                try:
+                    # Wait for the monitor to unwind so it is not left pending
+                    await monitor_task
+                except asyncio.CancelledError:
+                    pass
--- a/components/src/dynamo/sglang/request_handlers/llm/decode_handler.py
+++ b/components/src/dynamo/sglang/request_handlers/llm/decode_handler.py
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
+import asyncio
 import logging
 import time
 from typing import Any, AsyncGenerator, Dict, Optional
 import sglang as sgl
-from dynamo._core import Client, Component
+from dynamo._core import Client, Component, Context
 from dynamo.sglang.args import Config, DisaggregationMode
 from dynamo.sglang.protocol import DisaggPreprocessedRequest
 from dynamo.sglang.publisher import DynamoSglangPublisher
@@ -96,12 +97,13 @@ class DecodeWorkerHandler(BaseWorkerHandler):
        return {k: v for k, v in param_mapping.items() if v is not None}
    async def generate(
-        self, request: Dict[str, Any]
+        self, request: Dict[str, Any], context: Context
    ) -> AsyncGenerator[Dict[str, Any], None]:
        """Generate response in aggregated or disaggregated mode.
        Args:
            request: Request dict with input and sampling parameters.
+            context: Context object for cancellation handling.
        Yields:
            Response dicts with token_ids or OpenAI-formatted chunks.
@@ -109,6 +111,7 @@ class DecodeWorkerHandler(BaseWorkerHandler):
        Raises:
            RuntimeError: If no bootstrap info received from prefill worker.
        """
+        logging.debug(f"New Request ID: {context.id()}")
        sampling_params = self._build_sampling_params(request)
        input_param = self._get_input_param(request)
@@ -139,7 +142,8 @@ class DecodeWorkerHandler(BaseWorkerHandler):
                    DisaggPreprocessedRequest(
                        request=request,
                        sampling_params=sampling_params,
-                    ).model_dump()
+                    ).model_dump(),
+                    context=context,
                )
            bootstrap_info = None
@@ -160,10 +164,10 @@ class DecodeWorkerHandler(BaseWorkerHandler):
            )
            if self.skip_tokenizer_init:
-                async for out in self._process_token_stream(decode):
+                async for out in self._process_token_stream(decode, context):
                    yield out
            else:
-                async for out in self._process_text_stream(decode):
+                async for out in self._process_text_stream(decode, context):
                    yield out
        else:
            agg = await self.engine.async_generate(
@@ -172,76 +176,116 @@ class DecodeWorkerHandler(BaseWorkerHandler):
                stream=True,
            )
            if self.skip_tokenizer_init:
-                async for out in self._process_token_stream(agg):
+                async for out in self._process_token_stream(agg, context):
                    yield out
            else:
-                async for out in self._process_text_stream(agg):
+                async for out in self._process_text_stream(agg, context):
                    yield out
    async def _process_token_stream(
-        self, stream_source: AsyncGenerator[Dict[str, Any], None]
+        self,
+        stream_source: AsyncGenerator[Dict[str, Any], None],
+        context: Context,
    ) -> AsyncGenerator[Dict[str, Any], None]:
        """Process token-based stream output.
        Args:
            stream_source: Async generator from engine.async_generate.
+            context: Context object for cancellation handling.
        Yields:
            Dict with token_ids and optional finish_reason.
        """
        num_output_tokens_so_far = 0
-        async for res in stream_source:
+        # Use Future pattern for request ID - will be set when first response arrives
-            out = {}
+        request_id_future = asyncio.Future()
-            finish_reason = res["meta_info"]["finish_reason"]
+        async with self._cancellation_monitor(request_id_future, context):
-            if finish_reason:
+            async for res in stream_source:
-                out["finish_reason"] = finish_reason["type"]
+                # Extract SGLang request ID from the first response and set the future
+                if not request_id_future.done():
-            output_ids = res.get("output_ids", [])
+                    meta_info = res.get("meta_info", {})
-            # If request is not finished yet, but there are no outputs, return an error.
+                    sglang_request_id = meta_info.get("id")
-            if not output_ids and not finish_reason:
+                    if sglang_request_id:
-                yield {"finish_reason": "error", "token_ids": []}
+                        request_id_future.set_result(sglang_request_id)
-                break
+                        logging.debug(f"New SGLang Request ID: {sglang_request_id}")
-            next_total_toks = len(output_ids)
+                # Check cancellation before yielding to allow proper cleanup.
-            out["token_ids"] = output_ids[num_output_tokens_so_far:]
+                # This lets SGLang proceed to the second token generation, which causes an
-            num_output_tokens_so_far = next_total_toks
+                # async context switch and allows the abort monitor to signal cancellation.
-            yield out
+                # The loop should exit by itself when context.is_stopped() returns True.
+                out = {}
+                finish_reason = res["meta_info"]["finish_reason"]
+                if finish_reason:
+                    out["finish_reason"] = finish_reason["type"]
+                output_ids = res.get("output_ids", [])
+                # If request is not finished yet, but there are no outputs, return an error.
+                if not output_ids and not finish_reason:
+                    if not context.is_stopped():
+                        yield {"finish_reason": "error", "token_ids": []}
+                    break
+                next_total_toks = len(output_ids)
+                out["token_ids"] = output_ids[num_output_tokens_so_far:]
+                num_output_tokens_so_far = next_total_toks
+                if not context.is_stopped():
+                    yield out
    async def _process_text_stream(
-        self, stream_source: AsyncGenerator[Dict[str, Any], None]
+        self,
+        stream_source: AsyncGenerator[Dict[str, Any], None],
+        context: Context,
    ) -> AsyncGenerator[Dict[str, Any], None]:
        """Process text-based stream output in OpenAI format.
        Args:
            stream_source: Async generator from engine.async_generate.
+            context: Context object for cancellation handling.
        Yields:
            OpenAI-formatted chat completion chunk dicts.
        """
        count = 0
-        async for res in stream_source:
+        # Use Future pattern for request ID - will be set when first response arrives
-            index = res.get("index", 0)
+        request_id_future = asyncio.Future()
-            text = res.get("text", "")
+        async with self._cancellation_monitor(request_id_future, context):
+            async for res in stream_source:
-            finish_reason = res["meta_info"]["finish_reason"]
+                # Extract SGLang request ID from the first response and set the future
-            finish_reason_type = finish_reason["type"] if finish_reason else None
+                if not request_id_future.done():
-            next_count = len(text)
+                    meta_info = res.get("meta_info", {})
-            delta = text[count:]
+                    sglang_request_id = meta_info.get("id")
+                    if sglang_request_id:
-            choice_data = {
+                        request_id_future.set_result(sglang_request_id)
-                "index": index,
+                        logging.debug(f"New SGLang Request ID: {sglang_request_id}")
-                "delta": {"role": "assistant", "content": delta},
-                "finish_reason": finish_reason_type,
+                # Check cancellation before yielding to allow proper cleanup.
-            }
+                # This lets SGLang proceed to the second token generation, which causes an
+                # async context switch and allows the abort monitor to signal cancellation.
-            response = {
+                # The loop should exit by itself when context.is_stopped() returns True.
-                "id": res["meta_info"]["id"],
-                "created": int(time.time()),
+                index = res.get("index", 0)
-                "choices": [choice_data],
+                text = res.get("text", "")
-                "model": self.config.server_args.served_model_name,
-                "object": "chat.completion.chunk",
+                finish_reason = res["meta_info"]["finish_reason"]
-            }
+                finish_reason_type = finish_reason["type"] if finish_reason else None
-            yield response
+                next_count = len(text)
-            count = next_count
+                delta = text[count:]
+                choice_data = {
+                    "index": index,
+                    "delta": {"role": "assistant", "content": delta},
+                    "finish_reason": finish_reason_type,
+                }
+                response = {
+                    "id": res["meta_info"]["id"],
+                    "created": int(time.time()),
+                    "choices": [choice_data],
+                    "model": self.config.server_args.served_model_name,
+                    "object": "chat.completion.chunk",
+                }
+                if not context.is_stopped():
+                    yield response
+                count = next_count
--- a/components/src/dynamo/sglang/request_handlers/llm/prefill_handler.py
+++ b/components/src/dynamo/sglang/request_handlers/llm/prefill_handler.py
@@ -7,7 +7,7 @@ from typing import Any, AsyncGenerator, Dict
 import sglang as sgl
-from dynamo._core import Component
+from dynamo._core import Component, Context
 from dynamo.sglang.args import Config
 from dynamo.sglang.publisher import DynamoSglangPublisher
 from dynamo.sglang.request_handlers.handler_base import BaseWorkerHandler
@@ -34,27 +34,36 @@ class PrefillWorkerHandler(BaseWorkerHandler):
        self.engine = engine
        self.bootstrap_host, self.bootstrap_port = self._get_bootstrap_info(self.engine)
        super().__init__(component, engine, config, publisher)
+        self._consume_tasks = set()
        logging.info(
            f"Prefill worker handler initialized - bootstrap host: {self.bootstrap_host}, bootstrap port: {self.bootstrap_port}"
        )
    def cleanup(self) -> None:
        """Shutdown the prefill engine and cleanup resources."""
+        # Cancel all pending consume tasks
+        for task in self._consume_tasks:
+            if not task.done():
+                task.cancel()
+        self._consume_tasks.clear()
        self.engine.shutdown()
        logging.info("Prefill engine shutdown")
        super().cleanup()
    async def generate(
-        self, request: Dict[str, Any]
+        self, request: Dict[str, Any], context: Context
    ) -> AsyncGenerator[Dict[str, Any], None]:
        """Generate prefill output and provide bootstrap info for decode worker.
        Args:
            request: Request dict with 'request' and 'sampling_params' keys.
+            context: Context object for cancellation handling.
        Yields:
            Bootstrap info dict with host, port, and room for decode worker connection.
        """
+        logging.debug(f"New Request ID: {context.id()}")
        bootstrap_room = self._generate_bootstrap_room()
        bootstrap_info = {
@@ -76,13 +85,31 @@ class PrefillWorkerHandler(BaseWorkerHandler):
            bootstrap_room=bootstrap_room,
        )
-        asyncio.create_task(self._consume_results(results))
+        task = asyncio.create_task(self._consume_results(results, context))
+        self._consume_tasks.add(task)
+        task.add_done_callback(self._consume_tasks.discard)
-    async def _consume_results(self, results: AsyncGenerator[Any, None]) -> None:
+    async def _consume_results(
+        self, results: AsyncGenerator[Any, None], context: Context
+    ) -> None:
        """Consume async generator results without processing.
        Args:
            results: Async generator from engine.async_generate.
+            context: Context object for cancellation handling.
        """
-        async for _ in results:
+        # Use Future pattern for request ID - will be set when first response arrives
-            pass
+        request_id_future = asyncio.Future()
+        async with self._cancellation_monitor(request_id_future, context):
+            async for res in results:
+                # Extract SGLang request ID from the first response and set the future
+                if not request_id_future.done():
+                    meta_info = res.get("meta_info", {})
+                    sglang_request_id = meta_info.get("id")
+                    if sglang_request_id:
+                        request_id_future.set_result(sglang_request_id)
+                        logging.debug(f"New Prefill Request ID: {sglang_request_id}")
+                # Note: No explicit cancellation checks needed here.
+                # When abort_request is called by the cancellation monitor,
+                # SGLang will terminate this async generator automatically.
--- a/components/src/dynamo/sglang/request_handlers/multimodal/encode_worker_handler.py
+++ b/components/src/dynamo/sglang/request_handlers/multimodal/encode_worker_handler.py
@@ -9,7 +9,7 @@ from sglang.srt.parser.conversation import chat_templates
 from transformers import AutoImageProcessor, AutoModel, AutoTokenizer
 import dynamo.nixl_connect as connect
-from dynamo._core import Client, Component
+from dynamo._core import Client, Component, Context
 from dynamo.runtime import DistributedRuntime
 from dynamo.sglang.args import Config
 from dynamo.sglang.multimodal_utils import ImageLoader, encode_image_embeddings
@@ -90,7 +90,16 @@ class MultimodalEncodeWorkerHandler(BaseWorkerHandler):
    def cleanup(self):
        pass
-    async def generate(self, request: SglangMultimodalRequest) -> AsyncIterator[str]:
+    async def generate(
+        self, request: SglangMultimodalRequest, context: Context
+    ) -> AsyncIterator[str]:
+        """
+        Generate precomputed embeddings for multimodal input.
+        Args:
+            request: Multimodal request with image/video data.
+            context: Context object for cancellation handling.
+        """
        if not isinstance(request, SglangMultimodalRequest):
            if isinstance(request, str):
                request = SglangMultimodalRequest.model_validate_json(request)

--- a/components/src/dynamo/sglang/request_handlers/multimodal/processor_handler.py
+++ b/components/src/dynamo/sglang/request_handlers/multimodal/processor_handler.py
@@ -9,7 +9,7 @@ from typing import Any, Dict
 from transformers import AutoTokenizer
-from dynamo._core import Client, Component
+from dynamo._core import Client, Component, Context
 from dynamo.sglang.args import Config
 from dynamo.sglang.multimodal_utils import (
    multimodal_request_to_sglang,
@@ -54,7 +54,14 @@ class MultimodalProcessorHandler(BaseWorkerHandler):
    def cleanup(self):
        pass
-    async def generate(self, raw_request: MultiModalRequest):
+    async def generate(self, raw_request: MultiModalRequest, context: Context):
+        """
+        Process multimodal request and forward to encode worker.
+        Args:
+            raw_request: Raw multimodal request to process.
+            context: Context object for cancellation handling.
+        """
        if not isinstance(raw_request, MultiModalRequest):
            # If the request is not MultiModalRequest, convert it to MultiModalRequest
            raw_request = MultiModalRequest.model_validate(raw_request)

--- a/components/src/dynamo/sglang/request_handlers/multimodal/worker_handler.py
+++ b/components/src/dynamo/sglang/request_handlers/multimodal/worker_handler.py
@@ -10,7 +10,7 @@ import sglang as sgl
 import torch
 import dynamo.nixl_connect as connect
-from dynamo._core import Client, Component
+from dynamo._core import Client, Component, Context
 from dynamo.sglang.args import Config, DisaggregationMode
 from dynamo.sglang.protocol import (
    DisaggSglangMultimodalRequest,
@@ -275,10 +275,16 @@ class MultimodalWorkerHandler(BaseWorkerHandler):
                request = SglangMultimodalRequest.model_validate(request)
        return request
-    async def generate(self, request: SglangMultimodalRequest) -> AsyncIterator[str]:
+    async def generate(
+        self, request: SglangMultimodalRequest, context: Context
+    ) -> AsyncIterator[str]:
        """
        Generate response using SGLang with multimodal data
        Handles both aggregated and disaggregated modes (following regular SGLang DecodeWorkerHandler pattern)
+        Args:
+            request: Multimodal request with input and parameters.
+            context: Context object for cancellation handling.
        """
        try:
            request = self._validate_and_parse_request(request)
@@ -429,10 +435,14 @@ class MultimodalPrefillWorkerHandler(BaseWorkerHandler):
        await self.embeddings_processor.initialize()
    async def generate(
-        self, disagg_request: DisaggSglangMultimodalRequest
+        self, disagg_request: DisaggSglangMultimodalRequest, context: Context
    ) -> AsyncIterator[str]:
        """
        Handle prefill phase: process multimodal input and provide bootstrap info
+        Args:
+            disagg_request: Disaggregated multimodal request.
+            context: Context object for cancellation handling.
        """
        bootstrap_room = None
        try:

--- a/tests/fault_tolerance/cancellation/test_sglang.py
+++ b/tests/fault_tolerance/cancellation/test_sglang.py
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+import logging
+import os
+import shutil
+import time
+import pytest
+from tests.fault_tolerance.cancellation.utils import (
+    DynamoFrontendProcess,
+    poll_for_pattern,
+    read_streaming_responses,
+    send_cancellable_request,
+)
+from tests.utils.constants import FAULT_TOLERANCE_MODEL_NAME
+from tests.utils.engine_process import FRONTEND_PORT
+from tests.utils.managed_process import ManagedProcess
+from tests.utils.payloads import check_health_generate, check_models_api
+logger = logging.getLogger(__name__)
+class DynamoWorkerProcess(ManagedProcess):
+    """Managed Dynamo worker process running the SGLang backend."""
+    def __init__(self, request, mode: str = "agg"):
+        """
+        Assemble the worker command line, environment and health checks.
+        Args:
+            request: pytest request object
+            mode: One of "agg", "prefill", "decode"
+        """
+        # Pick the mode-specific arguments first
+        if mode == "agg":
+            # Aggregated mode - add skip-tokenizer-init like the serve test
+            mode_args = ["--skip-tokenizer-init"]
+        else:
+            # Disaggregated mode - add disaggregation arguments like disagg.sh
+            mode_args = [
+                "--disaggregation-mode",
+                mode,
+                "--disaggregation-bootstrap-port",
+                "12345",
+                "--host",
+                "0.0.0.0",
+                "--disaggregation-transfer-backend",
+                "nixl",
+            ]
+        command = [
+            "python3",
+            "-m",
+            "dynamo.sglang",
+            "--model-path",
+            FAULT_TOLERANCE_MODEL_NAME,
+            "--served-model-name",
+            FAULT_TOLERANCE_MODEL_NAME,
+            "--page-size",
+            "16",
+            "--tp",
+            "1",
+            "--trust-remote-code",
+            *mode_args,
+        ]
+        # Prefill uses port 8082; decode and agg both use 8081
+        port = "8082" if mode == "prefill" else "8081"
+        if mode in ("prefill", "decode"):
+            # Disaggregated workers are probed on their own /health endpoint
+            health_check_urls = [(f"http://localhost:{port}/health", self.is_ready)]
+        else:
+            # Otherwise readiness is checked through the frontend
+            health_check_urls = [
+                (f"http://localhost:{FRONTEND_PORT}/v1/models", check_models_api),
+                (f"http://localhost:{FRONTEND_PORT}/health", check_health_generate),
+            ]
+        # Debug logging and DYN_SYSTEM_* settings on top of the inherited environment
+        env = os.environ.copy()
+        env.update(
+            {
+                "DYN_LOG": "debug",
+                "DYN_SYSTEM_ENABLED": "true",
+                "DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS": '["generate"]',
+                "DYN_SYSTEM_PORT": port,
+            }
+        )
+        # GPU assignment for disaggregated mode (like disagg.sh): decode on
+        # GPU 1, prefill on GPU 0; any other mode keeps the default assignment
+        gpu_by_mode = {"decode": "1", "prefill": "0"}
+        if mode in gpu_by_mode:
+            env["CUDA_VISIBLE_DEVICES"] = gpu_by_mode[mode]
+        # One log directory per test and worker type
+        log_dir = f"{request.node.name}_{mode}_worker"
+        # Remove a leftover directory from a previous run; a missing one is fine
+        try:
+            shutil.rmtree(log_dir)
+        except FileNotFoundError:
+            pass
+        else:
+            logger.info(f"Cleaned up existing log directory: {log_dir}")
+        super().__init__(
+            command=command,
+            env=env,
+            health_check_urls=health_check_urls,
+            timeout=300,
+            display_output=True,
+            terminate_existing=False,
+            # Ensure any orphaned SGLang engine cores or child helpers are cleaned up
+            stragglers=["SGLANG:EngineCore"],
+            straggler_commands=["-m dynamo.sglang"],
+            log_dir=log_dir,
+        )
+        self.mode = mode
+    def get_pid(self):
+        """Return the worker's PID, or None when no process is attached."""
+        if not self.proc:
+            return None
+        return self.proc.pid
+    def is_ready(self, response) -> bool:
+        """Return True when the health response JSON reports status "ready"."""
+        try:
+            status = response.json().get("status")
+            worker_label = self.mode.capitalize()
+            if status == "ready":
+                logger.info(f"{worker_label} worker status is ready")
+                return True
+            logger.warning(f"{worker_label} worker status is not ready: {status}")
+        except ValueError:
+            # response.json() raised: the body was not valid JSON
+            logger.warning(
+                f"{self.mode.capitalize()} worker health response is not valid JSON"
+            )
+        return False
+@pytest.mark.e2e
+@pytest.mark.sglang
+@pytest.mark.gpu_1
+@pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
+@pytest.mark.xfail(strict=False)
+def test_request_cancellation_sglang_aggregated(
+    request, runtime_services, predownload_models
+):
+    """
+    Exercise client-side request cancellation against one aggregated (agg) worker.
+    For each request flavour (completion, chat completion, streaming chat
+    completion) the test sends a cancellable request, waits until the worker log
+    shows both the Dynamo and the SGLang request IDs, cancels the request, and
+    then expects an abort line in the worker log and a kill line in the
+    frontend log.
+    TODO: Test is currently flaky/failing due to SGLang limitations with prefill cancellation.
+    See: https://github.com/sgl-project/sglang/issues/11139
+    """
+    logger.info("Sanity check if latest test is getting executed")
+    # (request type, human-readable label used in log messages)
+    scenarios = (
+        ("completion", "Completion request cancellation"),
+        ("chat_completion", "Chat completion request cancellation"),
+        ("chat_completion_stream", "Chat completion stream request cancellation"),
+    )
+    # Bring up the frontend first, then the aggregated worker behind it.
+    with DynamoFrontendProcess(request) as frontend:
+        logger.info("Frontend started successfully")
+        with DynamoWorkerProcess(request, mode="agg") as worker:
+            logger.info(f"Aggregated Worker PID: {worker.get_pid()}")
+            # TODO: Why wait after worker ready fixes frontend 404 / 500 flakiness?
+            time.sleep(2)
+            # Log offsets carry over between scenarios so that each poll only
+            # scans output written after the previous match.
+            frontend_pos = 0
+            worker_pos = 0
+            for kind, label in scenarios:
+                logger.info(f"Testing {label.lower()}...")
+                # Fire the request without blocking on its completion.
+                pending = send_cancellable_request(kind)
+                # The Dynamo context ID shows up first in the worker log.
+                request_id, worker_pos = poll_for_pattern(
+                    process=worker,
+                    pattern="New Request ID: ",
+                    log_offset=worker_pos,
+                    match_type="contains",
+                )
+                # Streaming requests need one chunk consumed before the SGLang
+                # ID gets logged.
+                if kind == "chat_completion_stream":
+                    read_streaming_responses(pending, expected_count=1)
+                # Once the SGLang request ID is logged, SGLang owns the request.
+                _, worker_pos = poll_for_pattern(
+                    process=worker,
+                    pattern="New SGLang Request ID: ",
+                    log_offset=worker_pos,
+                    match_type="contains",
+                )
+                pending.cancel()
+                logger.info(f"Cancelled request ID: {request_id}")
+                # The worker must report the abort for this exact request ID.
+                _, worker_pos = poll_for_pattern(
+                    process=worker,
+                    pattern=f"Aborted Request ID: {request_id}",
+                    log_offset=worker_pos,
+                    max_wait_ms=2000,
+                )
+                # The frontend must report that it issued the kill message.
+                _, frontend_pos = poll_for_pattern(
+                    process=frontend,
+                    pattern="issued control message Kill to sender",
+                    log_offset=frontend_pos,
+                )
+                logger.info(f"{label} detected successfully")
+@pytest.mark.e2e
+@pytest.mark.sglang
+@pytest.mark.gpu_2
+@pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
+def test_request_cancellation_sglang_decode_cancel(
+    request, runtime_services, predownload_models
+):
+    """
+    Exercise cancellation of a streaming request in a disaggregated setup.
+    A frontend, a decode worker and a prefill worker are started. One streaming
+    chat completion is sent and cancelled after the decode worker has logged its
+    SGLang request ID. The test then expects an abort line for that request in
+    the decode worker log and a kill line in the frontend log.
+    Note: This test requires 2 GPUs to run decode and prefill workers on separate GPUs.
+    """
+    # Start order: frontend, then decode worker, then prefill worker.
+    with DynamoFrontendProcess(request) as frontend:
+        logger.info("Frontend started successfully")
+        with DynamoWorkerProcess(request, mode="decode") as decode_worker:
+            logger.info(f"Decode Worker PID: {decode_worker.get_pid()}")
+            with DynamoWorkerProcess(request, mode="prefill") as prefill_worker:
+                logger.info(f"Prefill Worker PID: {prefill_worker.get_pid()}")
+                # TODO: Why wait after worker ready fixes frontend 404 / 500 flakiness?
+                time.sleep(2)
+                logger.info(
+                    "Testing chat completion stream request cancellation during remote decode phase..."
+                )
+                # Fire the streaming request without blocking on its completion.
+                stream_req = send_cancellable_request("chat_completion_stream")
+                # The Dynamo context ID is taken from the decode worker log.
+                request_id, decode_pos = poll_for_pattern(
+                    process=decode_worker,
+                    pattern="New Request ID: ",
+                    match_type="contains",
+                )
+                # The prefill worker must log the very same request ID; the
+                # returned offset is not needed afterwards.
+                _, _ = poll_for_pattern(
+                    process=prefill_worker,
+                    pattern=f"New Request ID: {request_id}",
+                )
+                # Consume one chunk so the decode worker logs the SGLang ID.
+                read_streaming_responses(stream_req, expected_count=1)
+                # Once the SGLang request ID is logged, SGLang owns the request.
+                _, decode_pos = poll_for_pattern(
+                    process=decode_worker,
+                    pattern="New SGLang Request ID: ",
+                    log_offset=decode_pos,
+                    match_type="contains",
+                )
+                stream_req.cancel()
+                logger.info(f"Cancelled request ID: {request_id}")
+                # The decode worker must report the abort for this request ID.
+                _, decode_pos = poll_for_pattern(
+                    process=decode_worker,
+                    pattern=f"Aborted Request ID: {request_id}",
+                    log_offset=decode_pos,
+                )
+                # The frontend must report that it issued the kill message.
+                _, _ = poll_for_pattern(
+                    process=frontend,
+                    pattern="issued control message Kill to sender",
+                )
+                logger.info(
+                    "Chat completion stream cancellation in decode phase detected successfully"
+                )
--- a/tests/serve/test_sglang.py
+++ b/tests/serve/test_sglang.py
@@ -116,7 +116,7 @@ sglang_configs = {
                # NOTE: The response text may mention 'bus', 'train', 'streetcar', etc.
                # so we need something consistently found in the response, or a different
                # approach to validation for this test to be stable.
-                expected_response=["OUT OF SERVICE"],
+                expected_response=["image"],
                temperature=0.0,
            )
        ],