fix: Disable calls to abort() at TRT-LLM backend - temporary (#5827)

Signed-off-by: Jacky <18255193+kthui@users.noreply.github.com>

fix: Disable calls to abort() at TRT-LLM backend - temporary (#5827)
Signed-off-by: Jacky <18255193+kthui@users.noreply.github.com>
934d03c5 · Jacky · GitHub · f6d6b34e · 934d03c5 · 934d03c5
Unverified commit 934d03c5, authored Jan 30, 2026 by Jacky; committed by GitHub on Jan 30, 2026
3 changed files
--- a/components/src/dynamo/trtllm/request_handlers/handler_base.py
+++ b/components/src/dynamo/trtllm/request_handlers/handler_base.py
@@ -198,11 +198,12 @@ class HandlerBase:
            )

            # Abort the generation
-            # Temporary: Disabled on DECODE workers to prevent engine hangs in
-            # disaggregated setups where abort() may cause the engine to get stuck
-            if self.disaggregation_mode != DisaggregationMode.DECODE:
-                generation_result.abort()
-                logging.debug(f"Aborted Request ID: {context.id()}")
+            # Temporary:
+            #   Do not call abort() on the engine: the engine may get stuck if a
+            #   sufficiently large number of concurrent requests is cancelled.
+            # Note to restore:
+            #   call `generation_result.abort()`; and then
+            #   log `logging.debug(f"Aborted Request ID: {context.id()}")`

            # Clean up any remaining background task
            for task in pending:

--- a/docs/reference/feature-matrix.md
+++ b/docs/reference/feature-matrix.md
@@ -96,7 +96,7 @@ TensorRT-LLM delivers maximum inference performance and optimization, with full
 | **KV Block Manager** | ✅ | ✅ | ✅ | — | | | | | | |
 | **Multimodal** | ✅<sup>1</sup> | <sup>2</sup> | — | ✅ | — | | | | | |
 | **Request Migration** | 🚧<sup>3</sup> | ✅ | ✅ | ✅ | 🚧 | — | | | | |
-| **Request Cancellation** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | — | | | |
+| **Request Cancellation** | ✅<sup>5</sup> | ✅<sup>5</sup> | ✅<sup>5</sup> | ✅<sup>5</sup> | ✅<sup>5</sup> | ✅<sup>5</sup> | — | | | |
 | **LoRA** | | | | | | | | — | | |
 | **Tool Calling** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | — | |
 | **Speculative Decoding** | ✅ | ✅ | — | ✅ | — | ✅ | ✅ | | ✅ | — |
@@ -106,6 +106,7 @@ TensorRT-LLM delivers maximum inference performance and optimization, with full
 > 2. **Multimodal + KV-Aware Routing**: Not supported. The KV router currently tracks token-based blocks only. ([Source][kv-routing])
 > 3. **Request Migration**: Supported on **Decode/Aggregated** workers only. **Prefill** workers do not support migration. ([Source][trtllm-readme])
 > 4. **Speculative Decoding**: Llama 4 + Eagle support documented. ([Source][trtllm-eagle])
+> 5. **Request Cancellation**: Due to known issues, the TensorRT-LLM engine is temporarily not notified of request cancellations, so the resources allocated to a cancelled request are not freed on cancellation.

 ---


--- a/tests/fault_tolerance/cancellation/test_trtllm.py
+++ b/tests/fault_tolerance/cancellation/test_trtllm.py
@@ -38,6 +38,7 @@ pytestmark = [
    pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
    pytest.mark.post_merge,  # post_merge to pinpoint failure commit
    pytest.mark.parametrize("request_plane", ["nats", "tcp"], indirect=True),
+    pytest.mark.xfail(reason="Cancellation is temporarily disabled", strict=True),
 ]


@@ -253,9 +254,6 @@ def test_request_cancellation_trtllm_aggregated(
                logger.info(f"{description} detected successfully")


-@pytest.mark.xfail(
-    reason="Decode worker cancellation is temporarily disabled", strict=True
-)
 @pytest.mark.timeout(195)  # 3x average
 def test_request_cancellation_trtllm_decode_cancel(
    request, runtime_services_dynamic_ports, predownload_models
@@ -432,9 +430,6 @@ def test_request_cancellation_trtllm_prefill_cancel(
                )


-@pytest.mark.xfail(
-    reason="Decode worker cancellation is temporarily disabled", strict=True
-)
 @pytest.mark.xfail(reason="Test fails only on CI", strict=False)
 @pytest.mark.timeout(195)  # 3x average
 def test_request_cancellation_trtllm_kv_transfer_cancel(