Commit 934d03c5 (unverified), authored by Jacky, committed by GitHub
Browse files

fix: Disable calls to abort() at TRT-LLM backend - temporary (#5827)


Signed-off-by: Jacky <18255193+kthui@users.noreply.github.com>
parent f6d6b34e
......@@ -198,11 +198,12 @@ class HandlerBase:
)
# Abort the generation
# Temporary: Disabled on DECODE workers to prevent engine hangs in
# disaggregated setups where abort() may cause the engine to get stuck
if self.disaggregation_mode != DisaggregationMode.DECODE:
generation_result.abort()
logging.debug(f"Aborted Request ID: {context.id()}")
# Temporary:
# Disable calling abort() on the engine, which may get stuck if a
# sufficiently large number of concurrent requests is cancelled.
# Note to restore:
# call `generation_result.abort()`; and then
# log `logging.debug(f"Aborted Request ID: {context.id()}")`
# Clean up any remaining background task
for task in pending:
......
......@@ -96,7 +96,7 @@ TensorRT-LLM delivers maximum inference performance and optimization, with full
| **KV Block Manager** | ✅ | ✅ | ✅ | — | | | | | | |
| **Multimodal** | ✅<sup>1</sup> | <sup>2</sup> | — | ✅ | — | | | | | |
| **Request Migration** | 🚧<sup>3</sup> | ✅ | ✅ | ✅ | 🚧 | — | | | | |
| **Request Cancellation** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | — | | | |
| **Request Cancellation** | ✅<sup>5</sup> | ✅<sup>5</sup> | ✅<sup>5</sup> | ✅<sup>5</sup> | ✅<sup>5</sup> | ✅<sup>5</sup> | — | | | |
| **LoRA** | | | | | | | | — | | |
| **Tool Calling** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | — | |
| **Speculative Decoding** | ✅ | ✅ | — | ✅ | — | ✅ | ✅ | | ✅ | — |
......@@ -106,6 +106,7 @@ TensorRT-LLM delivers maximum inference performance and optimization, with full
> 2. **Multimodal + KV-Aware Routing**: Not supported. The KV router currently tracks token-based blocks only. ([Source][kv-routing])
> 3. **Request Migration**: Supported on **Decode/Aggregated** workers only. **Prefill** workers do not support migration. ([Source][trtllm-readme])
> 4. **Speculative Decoding**: Llama 4 + Eagle support documented. ([Source][trtllm-eagle])
> 5. **Request Cancellation**: Due to known issues, the TensorRT-LLM engine is temporarily not notified of request cancellations, meaning allocated resources for cancelled requests are not freed.
---
......
......@@ -38,6 +38,7 @@ pytestmark = [
pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
pytest.mark.post_merge, # post_merge to pinpoint failure commit
pytest.mark.parametrize("request_plane", ["nats", "tcp"], indirect=True),
pytest.mark.xfail(reason="Cancellation is temporarily disabled", strict=True),
]
......@@ -253,9 +254,6 @@ def test_request_cancellation_trtllm_aggregated(
logger.info(f"{description} detected successfully")
@pytest.mark.xfail(
reason="Decode worker cancellation is temporarily disabled", strict=True
)
@pytest.mark.timeout(195) # 3x average
def test_request_cancellation_trtllm_decode_cancel(
request, runtime_services_dynamic_ports, predownload_models
......@@ -432,9 +430,6 @@ def test_request_cancellation_trtllm_prefill_cancel(
)
@pytest.mark.xfail(
reason="Decode worker cancellation is temporarily disabled", strict=True
)
@pytest.mark.xfail(reason="Test fails only on CI", strict=False)
@pytest.mark.timeout(195) # 3x average
def test_request_cancellation_trtllm_kv_transfer_cancel(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment