feat: Request Cancellation TRT-LLM (#3193)

Signed-off-by: Jacky <18255193+kthui@users.noreply.github.com>

feat: Request Cancellation TRT-LLM (#3193)
Signed-off-by: Jacky <18255193+kthui@users.noreply.github.com>
77e66ae5 · Jacky · GitHub · a13c4cb6 · 77e66ae5 · 77e66ae5
Unverified Commit 77e66ae5 authored Sep 26, 2025 by Jacky Committed by GitHub Sep 26, 2025
6 changed files
--- a/components/backends/trtllm/src/dynamo/trtllm/request_handlers/handler_base.py
+++ b/components/backends/trtllm/src/dynamo/trtllm/request_handlers/handler_base.py
@@ -13,17 +13,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import asyncio
 import copy
 import logging
 import os
+from contextlib import asynccontextmanager
 from dataclasses import asdict, dataclass
 from enum import Enum
-from typing import Optional, Union
+from typing import Any, AsyncGenerator, Optional, Union
 import torch
 from tensorrt_llm.llmapi import DisaggregatedParams as LlmDisaggregatedParams
 from tensorrt_llm.llmapi.llm import SamplingParams
+from dynamo._core import Context
 from dynamo.logits_processing.examples import HelloWorldLogitsProcessor
 from dynamo.nixl_connect import Connector
 from dynamo.runtime.logging import configure_dynamo_logging
@@ -100,14 +103,71 @@ class HandlerBase:
                result["finish_reason"] == "stop" or result["finish_reason"] == "error"
            )
+    async def _handle_cancellation(self, generation_result: Any, context: Context):
+        """
+        Background task that aborts the engine request once the context is cancelled.
+        Waits on ``context.async_killed_or_stopped()`` and then calls ``abort_request``
+        on the LLM's private ``_executor`` with the ``request_id`` attribute of
+        ``generation_result``. A missing executor or request ID is logged as an
+        error. A failure raised by ``abort_request`` is logged with its traceback
+        here, because the code that starts this task only awaits it while it is
+        still running and would otherwise never see the exception.
+        Args:
+            generation_result: Object returned by ``generate_async``; only its
+                ``request_id`` attribute is read.
+            context: Request context awaited for the kill/stop signal; its ``id()``
+                is used in log messages.
+        """
+        try:
+            # Wait asynchronously for cancellation signal instead of polling
+            await context.async_killed_or_stopped()
+        except asyncio.CancelledError:
+            # Task was cancelled, which is expected when generation completes
+            return
+        # Call abort_request on the executor through the LLM instance
+        executor = getattr(self.engine.llm, "_executor", None)
+        if not executor:
+            logging.error(
+                f"TensorRT-LLM executor not found for abort request: {context.id()}"
+            )
+            return
+        # Get the internal request ID from the generation result
+        internal_request_id = getattr(generation_result, "request_id", None)
+        if internal_request_id is None:
+            logging.error(
+                f"Could not retrieve internal request ID for abort: {context.id()}"
+            )
+            return
+        try:
+            # TODO: Can this be an official abort method in TRT-LLM?
+            executor.abort_request(internal_request_id)
+        except Exception:
+            # Surface the failure now; nothing retrieves this task's exception later
+            logging.exception(f"Failed to abort Request ID: {context.id()}")
+            return
+        logging.debug(f"Aborted Request ID: {context.id()}")
+    @asynccontextmanager
+    async def _cancellation_monitor(
+        self, generation_result: Any, context: Context
+    ) -> AsyncGenerator[asyncio.Task, None]:
+        """
+        Async context manager that watches for request cancellation.
+        On entry it schedules ``_handle_cancellation`` as a background task; on
+        exit it cancels that task if it has not finished yet and waits for it to
+        wind down. A task that already finished is left untouched.
+        Args:
+            generation_result: Forwarded unchanged to ``_handle_cancellation``.
+            context: Forwarded unchanged to ``_handle_cancellation``.
+        Yields:
+            asyncio.Task: The background monitoring task
+        """
+        monitor_coro = self._handle_cancellation(generation_result, context)
+        monitor_task = asyncio.create_task(monitor_coro)
+        try:
+            yield monitor_task
+        finally:
+            # Only a still-pending monitor needs to be torn down
+            still_pending = not monitor_task.done()
+            if still_pending:
+                monitor_task.cancel()
+                try:
+                    await monitor_task
+                except asyncio.CancelledError:
+                    # NOTE(review): this may also absorb a CancelledError aimed at
+                    # the task awaiting here - confirm that is intended
+                    pass
    async def generate_locally(
-        self, request: dict, embeddings: Optional[Union[torch.Tensor, dict]] = None
+        self,
+        request: dict,
+        context: Context,
+        embeddings: Optional[Union[torch.Tensor, dict]] = None,
    ):
        """
        Generate responses based on the disaggregation mode in the request.
        Args:
            request: The request dictionary containing generation parameters
+            context: Context object for cancellation handling
            embeddings: Optional tensor or dict containing embeddings for multimodal processing
        """
        logging.debug(f"Request: {request}")
@@ -192,50 +252,57 @@ class HandlerBase:
            sampling_params.logits_processor = adapters
        # NEW: Updated engine call to include multimodal data
-        async for res in self.engine.llm.generate_async(
+        generation_result = self.engine.llm.generate_async(
            inputs=processed_input,  # Use the correctly extracted inputs
            sampling_params=sampling_params,
            disaggregated_params=disaggregated_params,
            streaming=streaming,
-        ):
+        )
-            # TRTLLM engine needs to start generating tokens first before stats
-            # can be retrieved.
+        # Use the context manager to handle cancellation monitoring
-            if self.first_generation and self.publisher:
+        async with self._cancellation_monitor(generation_result, context):
-                self.publisher.start()
+            async for res in generation_result:
-                self.first_generation = False
+                # TRTLLM engine needs to start generating tokens first before stats
+                # can be retrieved.
-            # Upon completion, send a final chunk with "stop" as the finish reason.
+                if self.first_generation and self.publisher:
-            # This signals to the client that the stream has ended.
+                    self.publisher.start()
-            if res.finished and self.disaggregation_mode != DisaggregationMode.PREFILL:
+                    self.first_generation = False
+                # Upon completion, send a final chunk with "stop" as the finish reason.
+                # This signals to the client that the stream has ended.
+                if (
+                    res.finished
+                    and self.disaggregation_mode != DisaggregationMode.PREFILL
+                ):
+                    if self.multimodal_processor:
+                        final_out = self.multimodal_processor.get_stop_response(
+                            request_id, model_name
+                        )
+                        yield final_out
+                if not res.outputs:
+                    yield {"finish_reason": "error", "token_ids": []}
+                    break
+                output = res.outputs[0]
+                # The engine returns all tokens generated so far. We must calculate the new
+                # tokens generated in this iteration to create the "delta".
+                next_total_toks = len(output.token_ids)
                if self.multimodal_processor:
-                    final_out = self.multimodal_processor.get_stop_response(
+                    out = self.multimodal_processor.create_response_chunk(
-                        request_id, model_name
+                        output, num_output_tokens_so_far, request_id, model_name
                    )
-                    yield final_out
+                else:
+                    out = {"token_ids": output.token_ids[num_output_tokens_so_far:]}
-            if not res.outputs:
+                if output.finish_reason:
-                yield {"finish_reason": "error", "token_ids": []}
+                    out["finish_reason"] = output.finish_reason
-                break
+                if output.stop_reason:
+                    out["stop_reason"] = output.stop_reason
-            output = res.outputs[0]
+                if self.disaggregation_mode == DisaggregationMode.PREFILL:
-            # The engine returns all tokens generated so far. We must calculate the new
+                    # Return the disaggregated params only when operating in prefill mode.
-            # tokens generated in this iteration to create the "delta".
+                    out["disaggregated_params"] = asdict(
-            next_total_toks = len(output.token_ids)
+                        DisaggregatedParamsCodec.encode(output.disaggregated_params)
-            if self.multimodal_processor:
+                    )
-                out = self.multimodal_processor.create_response_chunk(
+                # Yield the chunk to the client and update the token count for the next iteration.
-                    output, num_output_tokens_so_far, request_id, model_name
+                yield out
-                )
+                num_output_tokens_so_far = next_total_toks
-            else:
-                out = {"token_ids": output.token_ids[num_output_tokens_so_far:]}
-            if output.finish_reason:
-                out["finish_reason"] = output.finish_reason
-            if output.stop_reason:
-                out["stop_reason"] = output.stop_reason
-            if self.disaggregation_mode == DisaggregationMode.PREFILL:
-                # Return the disaggregated params only when operating in prefill mode.
-                out["disaggregated_params"] = asdict(
-                    DisaggregatedParamsCodec.encode(output.disaggregated_params)
-                )
-            # Yield the chunk to the client and update the token count for the next iteration.
-            yield out
-            num_output_tokens_so_far = next_total_toks
--- a/components/backends/trtllm/src/dynamo/trtllm/request_handlers/handlers.py
+++ b/components/backends/trtllm/src/dynamo/trtllm/request_handlers/handlers.py
@@ -4,6 +4,7 @@
 import copy
 import logging
+from dynamo._core import Context
 from dynamo.runtime.logging import configure_dynamo_logging
 from dynamo.trtllm.encode_helper import EncodeHelper
 from dynamo.trtllm.request_handlers.handler_base import (
@@ -66,9 +67,10 @@ class AggregatedHandler(HandlerBase):
    def __init__(self, config: RequestHandlerConfig):
        super().__init__(config)
-    async def generate(self, request: dict):
+    async def generate(self, request: dict, context: Context):
+        logging.debug(f"New Request ID: {context.id()}")
        # Implement all steps locally.
-        async for res in self.generate_locally(request):
+        async for res in self.generate_locally(request, context):
            yield res
@@ -80,7 +82,8 @@ class EncodeHandler(HandlerBase):
    def __init__(self, config: RequestHandlerConfig):
        super().__init__(config)
-    async def generate(self, request: dict):
+    async def generate(self, request: dict, context: Context):
+        logging.debug(f"New Request ID: {context.id()}")
        if self.connector:
            # Use helper method to process embedding request
            async for response in EncodeHelper.process_embedding_request(
@@ -122,11 +125,12 @@ class PrefillHandler(HandlerBase):
            encode_response, self.connector
        )
-    async def remote_decode(self, request: dict):
+    async def remote_decode(self, request: dict, context: Context):
-        async for res in await self.next_client.round_robin(request):
+        async for res in await self.next_client.round_robin(request, context=context):
            yield res.data()
-    async def generate(self, request: dict):
+    async def generate(self, request: dict, context: Context):
+        logging.debug(f"New Request ID: {context.id()}")
        logging.debug(f"PrefillHandler.generate received request: {request}")
        embeddings_tensor = None
@@ -145,12 +149,18 @@ class PrefillHandler(HandlerBase):
        prefill_request = copy.deepcopy(request)
        prefill_response = None
        response_count = 0
-        async for res in self.generate_locally(prefill_request, embeddings_tensor):
+        async for res in self.generate_locally(
+            prefill_request, context, embeddings_tensor
+        ):
            prefill_response = res
            response_count += 1
            if response_count > 1:
                raise ValueError("Prefill response should be generated only once.")
+        if context.is_stopped() or context.is_killed():
+            # Local generate abort monitor will print debug log, so only returning here.
+            return
        if (
            self.disaggregation_strategy == DisaggregationStrategy.PREFILL_FIRST
            and not self.check_error(prefill_response)
@@ -161,8 +171,12 @@ class PrefillHandler(HandlerBase):
                request["disaggregated_params"] = prefill_response[
                    "disaggregated_params"
                ]
-            async for res in self.remote_decode(request):
+            async for res in self.remote_decode(request, context):
                yield res
+            if context.is_stopped() or context.is_killed():
+                logging.debug(f"Aborted Remote Request ID: {context.id()}")
+                return
        else:
            # Return response to the decode handler.
            yield prefill_response
@@ -176,11 +190,12 @@ class DecodeHandler(HandlerBase):
    def __init__(self, config: RequestHandlerConfig):
        super().__init__(config)
-    async def remote_prefill(self, request: dict):
+    async def remote_prefill(self, request: dict, context: Context):
-        async for res in await self.next_client.round_robin(request):
+        async for res in await self.next_client.round_robin(request, context=context):
            yield res
-    async def generate(self, request: dict):
+    async def generate(self, request: dict, context: Context):
+        logging.debug(f"New Request ID: {context.id()}")
        if self.disaggregation_strategy == DisaggregationStrategy.DECODE_FIRST:
            prefill_response = None
            # If operating under decode_first strategy, the decode handler needs to trigger
@@ -188,12 +203,16 @@ class DecodeHandler(HandlerBase):
            response_count = 0
            # Do not yield the prefill response directly.
            # Instead, capture it and extract the state.
-            async for res in self.remote_prefill(request):
+            async for res in self.remote_prefill(request, context):
                prefill_response = res
                response_count += 1
                if response_count > 1:
                    raise ValueError("Prefill response should be generated only once.")
+            if context.is_stopped() or context.is_killed():
+                logging.debug(f"Aborted Remote Request ID: {context.id()}")
+                return
            response_data = (
                prefill_response.data() if prefill_response is not None else None
            )
@@ -204,5 +223,5 @@ class DecodeHandler(HandlerBase):
            if prefill_response is not None and response_data is not None:
                request["disaggregated_params"] = response_data["disaggregated_params"]
-        async for res in self.generate_locally(request):
+        async for res in self.generate_locally(request, context):
            yield res
--- a/tests/fault_tolerance/cancellation/__init__.py
+++ b/tests/fault_tolerance/cancellation/__init__.py
--- a/tests/fault_tolerance/cancellation/test_trtllm.py
+++ b/tests/fault_tolerance/cancellation/test_trtllm.py
--- a/tests/fault_tolerance/test_request_cancellation.py
+++ b/tests/fault_tolerance/test_request_cancellation.py
@@ -3,13 +3,17 @@
 import logging
 import os
-import re
 import shutil
 import time
 import pytest
-import requests
+from tests.fault_tolerance.cancellation.utils import (
+    DynamoFrontendProcess,
+    read_log_content,
+    send_request_and_cancel,
+    strip_ansi_codes,
+)
 from tests.utils.constants import FAULT_TOLERANCE_MODEL_NAME
 from tests.utils.engine_process import FRONTEND_PORT
 from tests.utils.managed_process import ManagedProcess
@@ -18,35 +22,6 @@ from tests.utils.payloads import check_health_generate, check_models_api
 logger = logging.getLogger(__name__)
-class DynamoFrontendProcess(ManagedProcess):
-    """Process manager for Dynamo frontend"""
-    def __init__(self, request):
-        command = ["python", "-m", "dynamo.frontend"]
-        # Set debug logging environment
-        env = os.environ.copy()
-        env["DYN_LOG"] = "debug"
-        log_dir = f"{request.node.name}_frontend"
-        # Clean up any existing log directory from previous runs
-        try:
-            shutil.rmtree(log_dir)
-            logger.info(f"Cleaned up existing log directory: {log_dir}")
-        except FileNotFoundError:
-            # Directory doesn't exist, which is fine
-            pass
-        super().__init__(
-            command=command,
-            env=env,
-            display_output=True,
-            terminate_existing=True,
-            log_dir=log_dir,
-        )
 class DynamoWorkerProcess(ManagedProcess):
    """Process manager for Dynamo worker with vLLM backend"""
@@ -137,138 +112,6 @@ class DynamoWorkerProcess(ManagedProcess):
        return False
-def send_completion_request(
-    prompt: str, max_tokens: int, timeout: int | float = 120
-) -> requests.Response:
-    """Send a completion request to the frontend"""
-    payload = {
-        "model": FAULT_TOLERANCE_MODEL_NAME,
-        "prompt": prompt,
-        "max_tokens": max_tokens,
-    }
-    headers = {"Content-Type": "application/json"}
-    logger.info(
-        f"Sending completion request with prompt: '{prompt[:50]}...' and max_tokens: {max_tokens}"
-    )
-    session = requests.Session()
-    try:
-        response = session.post(
-            "http://localhost:8000/v1/completions",
-            headers=headers,
-            json=payload,
-            timeout=timeout,
-        )
-        logger.info(f"Received response with status code: {response.status_code}")
-        return response
-    except requests.exceptions.Timeout:
-        logger.error(f"Request timed out after {timeout} seconds")
-        raise
-    except requests.exceptions.RequestException as e:
-        logger.error(f"Request failed with error: {e}")
-        raise
-def send_chat_completion_request(
-    prompt: str, max_tokens: int, timeout: int | float = 120, stream: bool = False
-) -> requests.Response:
-    """Send a chat completion request to the frontend"""
-    payload = {
-        "model": FAULT_TOLERANCE_MODEL_NAME,
-        "messages": [{"role": "user", "content": prompt}],
-        "max_tokens": max_tokens,
-        "stream": stream,
-    }
-    headers = {"Content-Type": "application/json"}
-    logger.info(
-        f"Sending chat completion request (stream={stream}) with prompt: '{prompt[:50]}...' and max_tokens: {max_tokens}"
-    )
-    session = requests.Session()
-    try:
-        response = session.post(
-            "http://localhost:8000/v1/chat/completions",
-            headers=headers,
-            json=payload,
-            timeout=timeout,
-            stream=stream,
-        )
-        logger.info(f"Received response with status code: {response.status_code}")
-        return response
-    except requests.exceptions.Timeout:
-        logger.error(f"Request timed out after {timeout} seconds")
-        raise
-    except requests.exceptions.RequestException as e:
-        logger.error(f"Request failed with error: {e}")
-        raise
-def send_request_and_cancel(
-    request_type: str = "completion",
-    timeout: int | float = 1,
-    use_long_prompt: bool = False,
-):
-    """Send a request with short timeout to trigger cancellation"""
-    logger.info(f"Sending {request_type} request to be cancelled...")
-    prompt = "Tell me a very long and detailed story about the history of artificial intelligence, including all major milestones, researchers, and breakthroughs?"
-    if use_long_prompt:
-        prompt += " Make sure it is" + " long" * 8000 + "!"
-    try:
-        if request_type == "completion":
-            response = send_completion_request(prompt, 8000, timeout)
-        elif request_type == "chat_completion":
-            response = send_chat_completion_request(prompt, 8000, timeout, False)
-        elif request_type == "chat_completion_stream":
-            response = send_chat_completion_request(prompt, 8000, timeout, True)
-            # Read a few responses and then disconnect
-            if response.status_code == 200:
-                itr_count, max_itr = 0, 5
-                try:
-                    for res in response.iter_lines():
-                        logger.info(f"Received response {itr_count + 1}: {res[:50]}...")
-                        itr_count += 1
-                        if itr_count >= max_itr:
-                            break
-                        time.sleep(0.1)
-                except Exception as e:
-                    pytest.fail(f"Stream reading failed: {e}")
-            response.close()
-            raise Exception("Closed response")
-        else:
-            pytest.fail(f"Unknown request type: {request_type}")
-        pytest.fail(
-            f"{request_type} request completed unexpectedly - should have been cancelled"
-        )
-    except Exception as e:
-        logger.info(f"{request_type} request was cancelled: {e}")
-def read_log_content(log_path: str | None) -> str:
-    """Read log content from a file"""
-    if log_path is None:
-        pytest.fail("Log path is None - cannot read log content")
-    try:
-        with open(log_path, "r") as f:
-            return f.read()
-    except Exception as e:
-        pytest.fail(f"Could not read log file {log_path}: {e}")
-def strip_ansi_codes(text: str) -> str:
-    """Remove ANSI color codes from text"""
-    ansi_escape = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])")
-    return ansi_escape.sub("", text)
 def verify_request_cancelled(
    frontend_process: DynamoFrontendProcess,
    worker_process: DynamoWorkerProcess,

--- a/tests/fault_tolerance/cancellation/utils.py
+++ b/tests/fault_tolerance/cancellation/utils.py
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+import logging
+import os
+import re
+import shutil
+import time
+import pytest
+import requests
+from tests.utils.constants import FAULT_TOLERANCE_MODEL_NAME
+from tests.utils.engine_process import FRONTEND_PORT
+from tests.utils.managed_process import ManagedProcess
+logger = logging.getLogger(__name__)
+class DynamoFrontendProcess(ManagedProcess):
+    """Managed process that runs ``python -m dynamo.frontend`` with debug logging"""
+    def __init__(self, request):
+        """
+        Prepare a fresh log directory and hand the launch settings to ManagedProcess.
+        Args:
+            request: Object whose ``request.node.name`` is used to name the log
+                directory (presumably the pytest ``request`` fixture - confirm
+                against callers).
+        """
+        frontend_log_dir = f"{request.node.name}_frontend"
+        # Drop whatever an earlier run left behind so logs start empty
+        try:
+            shutil.rmtree(frontend_log_dir)
+        except FileNotFoundError:
+            # No previous directory, nothing to remove
+            pass
+        else:
+            logger.info(f"Cleaned up existing log directory: {frontend_log_dir}")
+        # Copy of the current environment with debug logging switched on
+        frontend_env = {**os.environ, "DYN_LOG": "debug"}
+        super().__init__(
+            command=["python", "-m", "dynamo.frontend"],
+            env=frontend_env,
+            display_output=True,
+            terminate_existing=True,
+            log_dir=frontend_log_dir,
+        )
+def send_completion_request(
+    prompt: str, max_tokens: int, timeout: int | float = 120
+) -> requests.Response:
+    """
+    Send a completion request to the frontend
+    POSTs a JSON payload for FAULT_TOLERANCE_MODEL_NAME to ``/v1/completions`` on
+    localhost at FRONTEND_PORT.
+    Args:
+        prompt: Prompt text placed in the payload.
+        max_tokens: Value of the ``max_tokens`` payload field.
+        timeout: Timeout passed to the HTTP call.
+    Returns:
+        The response object, whatever its status code.
+    Raises:
+        requests.exceptions.RequestException: Logged and re-raised when the
+            request times out or otherwise fails.
+    """
+    payload = {
+        "model": FAULT_TOLERANCE_MODEL_NAME,
+        "prompt": prompt,
+        "max_tokens": max_tokens,
+    }
+    headers = {"Content-Type": "application/json"}
+    logger.info(
+        f"Sending completion request with prompt: '{prompt[:50]}...' and max_tokens: {max_tokens}"
+    )
+    try:
+        # Close the session on exit so its connection pool is not leaked
+        with requests.Session() as session:
+            response = session.post(
+                f"http://localhost:{FRONTEND_PORT}/v1/completions",
+                headers=headers,
+                json=payload,
+                timeout=timeout,
+            )
+        logger.info(f"Received response with status code: {response.status_code}")
+        return response
+    except requests.exceptions.Timeout:
+        logger.error(f"Request timed out after {timeout} seconds")
+        raise
+    except requests.exceptions.RequestException as e:
+        logger.error(f"Request failed with error: {e}")
+        raise
+def send_chat_completion_request(
+    prompt: str, max_tokens: int, timeout: int | float = 120, stream: bool = False
+) -> requests.Response:
+    """
+    Send a chat completion request to the frontend
+    POSTs a single user message for FAULT_TOLERANCE_MODEL_NAME to
+    ``/v1/chat/completions`` on localhost at FRONTEND_PORT.
+    Args:
+        prompt: Content of the single ``user`` message in the payload.
+        max_tokens: Value of the ``max_tokens`` payload field.
+        timeout: Timeout passed to the HTTP call.
+        stream: Sent in the payload and also passed to the HTTP call.
+    Returns:
+        The response object, whatever its status code.
+    Raises:
+        requests.exceptions.RequestException: Logged and re-raised when the
+            request times out or otherwise fails.
+    """
+    payload = {
+        "model": FAULT_TOLERANCE_MODEL_NAME,
+        "messages": [{"role": "user", "content": prompt}],
+        "max_tokens": max_tokens,
+        "stream": stream,
+    }
+    headers = {"Content-Type": "application/json"}
+    logger.info(
+        f"Sending chat completion request (stream={stream}) with prompt: '{prompt[:50]}...' and max_tokens: {max_tokens}"
+    )
+    try:
+        # Close the session on exit so its connection pool is not leaked.
+        # NOTE(review): assumes a streamed response stays readable afterwards, as
+        # with module-level requests.post(..., stream=True) - confirm
+        with requests.Session() as session:
+            response = session.post(
+                f"http://localhost:{FRONTEND_PORT}/v1/chat/completions",
+                headers=headers,
+                json=payload,
+                timeout=timeout,
+                stream=stream,
+            )
+        logger.info(f"Received response with status code: {response.status_code}")
+        return response
+    except requests.exceptions.Timeout:
+        logger.error(f"Request timed out after {timeout} seconds")
+        raise
+    except requests.exceptions.RequestException as e:
+        logger.error(f"Request failed with error: {e}")
+        raise
+def send_request_and_cancel(
+    request_type: str = "completion",
+    timeout: int | float = 1,
+    use_long_prompt: bool = False,
+):
+    """
+    Fire a request that is expected to be cut short and log how it ended.
+    Non-streaming requests are expected to fail with an exception (e.g. the
+    short timeout); if one completes, the test is failed. A streaming request is
+    read for at most five lines, then its response is closed and a marker
+    exception is raised. Any ``Exception`` raised along the way is logged as the
+    cancellation.
+    Args:
+        request_type: "completion", "chat_completion" or "chat_completion_stream";
+            any other value fails the test.
+        timeout: Timeout forwarded to the request helper.
+        use_long_prompt: When true, pads the prompt with 8000 repetitions of " long".
+    """
+    logger.info(f"Sending {request_type} request to be cancelled...")
+    prompt = "Tell me a very long and detailed story about the history of artificial intelligence, including all major milestones, researchers, and breakthroughs?"
+    if use_long_prompt:
+        padding = " long" * 8000
+        prompt = prompt + " Make sure it is" + padding + "!"
+    try:
+        if request_type == "completion":
+            send_completion_request(prompt, 8000, timeout)
+        elif request_type == "chat_completion":
+            send_chat_completion_request(prompt, 8000, timeout, stream=False)
+        elif request_type == "chat_completion_stream":
+            stream_response = send_chat_completion_request(
+                prompt, 8000, timeout, stream=True
+            )
+            # Consume a handful of lines, then hang up on the server
+            if stream_response.status_code == 200:
+                line_limit = 5
+                try:
+                    for seen, line in enumerate(stream_response.iter_lines(), start=1):
+                        logger.info(f"Received response {seen}: {line[:50]}...")
+                        if seen >= line_limit:
+                            break
+                        time.sleep(0.1)
+                except Exception as e:
+                    pytest.fail(f"Stream reading failed: {e}")
+            stream_response.close()
+            # Jump to the shared handler below that logs the cancellation
+            raise Exception("Closed response")
+        else:
+            pytest.fail(f"Unknown request type: {request_type}")
+        # Only reached when a non-streaming request returned normally
+        pytest.fail(
+            f"{request_type} request completed unexpectedly - should have been cancelled"
+        )
+    except Exception as e:
+        logger.info(f"{request_type} request was cancelled: {e}")
+def read_log_content(log_path: str | None) -> str:
+    """
+    Return the whole text of the log file at ``log_path``.
+    Calls ``pytest.fail`` when ``log_path`` is None or the file cannot be read.
+    """
+    if log_path is None:
+        pytest.fail("Log path is None - cannot read log content")
+    try:
+        with open(log_path, "r") as log_file:
+            log_text = log_file.read()
+    except Exception as e:
+        pytest.fail(f"Could not read log file {log_path}: {e}")
+    return log_text
+def strip_ansi_codes(text: str) -> str:
+    """
+    Return ``text`` with ANSI escape sequences removed.
+    Matches ESC followed by either one character in ``@-Z`` or ``\\-_``, or a
+    ``[``-introduced sequence such as the color codes ``ESC[31m`` / ``ESC[0m``.
+    """
+    return re.sub(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])", "", text)