Unverified commit 934d03c5, authored by Jacky, committed by GitHub
Browse files

fix: Disable calls to abort() at TRT-LLM backend - temporary (#5827)


Signed-off-by: Jacky <18255193+kthui@users.noreply.github.com>
parent f6d6b34e
...@@ -198,11 +198,12 @@ class HandlerBase: ...@@ -198,11 +198,12 @@ class HandlerBase:
) )
# Abort the generation # Abort the generation
# Temporary: Disabled on DECODE workers to prevent engine hangs in # Temporary:
# disaggregated setups where abort() may cause the engine to get stuck # Disable calling abort() on the engine, which may get stuck if a
if self.disaggregation_mode != DisaggregationMode.DECODE: # sufficiently large number of concurrent requests is cancelled.
generation_result.abort() # Note to restore:
logging.debug(f"Aborted Request ID: {context.id()}") # call `generation_result.abort()`; and then
# log `logging.debug(f"Aborted Request ID: {context.id()}")`
# Clean up any remaining background task # Clean up any remaining background task
for task in pending: for task in pending:
......
...@@ -96,7 +96,7 @@ TensorRT-LLM delivers maximum inference performance and optimization, with full ...@@ -96,7 +96,7 @@ TensorRT-LLM delivers maximum inference performance and optimization, with full
| **KV Block Manager** | ✅ | ✅ | ✅ | — | | | | | | | | **KV Block Manager** | ✅ | ✅ | ✅ | — | | | | | | |
| **Multimodal** | ✅<sup>1</sup> | <sup>2</sup> | — | ✅ | — | | | | | | | **Multimodal** | ✅<sup>1</sup> | <sup>2</sup> | — | ✅ | — | | | | | |
| **Request Migration** | 🚧<sup>3</sup> | ✅ | ✅ | ✅ | 🚧 | — | | | | | | **Request Migration** | 🚧<sup>3</sup> | ✅ | ✅ | ✅ | 🚧 | — | | | | |
| **Request Cancellation** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | — | | | | | **Request Cancellation** | ✅<sup>5</sup> | ✅<sup>5</sup> | ✅<sup>5</sup> | ✅<sup>5</sup> | ✅<sup>5</sup> | ✅<sup>5</sup> | — | | | |
| **LoRA** | | | | | | | | — | | | | **LoRA** | | | | | | | | — | | |
| **Tool Calling** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | — | | | **Tool Calling** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | — | |
| **Speculative Decoding** | ✅ | ✅ | — | ✅ | — | ✅ | ✅ | | ✅ | — | | **Speculative Decoding** | ✅ | ✅ | — | ✅ | — | ✅ | ✅ | | ✅ | — |
...@@ -106,6 +106,7 @@ TensorRT-LLM delivers maximum inference performance and optimization, with full ...@@ -106,6 +106,7 @@ TensorRT-LLM delivers maximum inference performance and optimization, with full
> 2. **Multimodal + KV-Aware Routing**: Not supported. The KV router currently tracks token-based blocks only. ([Source][kv-routing]) > 2. **Multimodal + KV-Aware Routing**: Not supported. The KV router currently tracks token-based blocks only. ([Source][kv-routing])
> 3. **Request Migration**: Supported on **Decode/Aggregated** workers only. **Prefill** workers do not support migration. ([Source][trtllm-readme]) > 3. **Request Migration**: Supported on **Decode/Aggregated** workers only. **Prefill** workers do not support migration. ([Source][trtllm-readme])
> 4. **Speculative Decoding**: Llama 4 + Eagle support documented. ([Source][trtllm-eagle]) > 4. **Speculative Decoding**: Llama 4 + Eagle support documented. ([Source][trtllm-eagle])
> 5. **Request Cancellation**: Due to known issues, the TensorRT-LLM engine is temporarily not notified of request cancellations, meaning allocated resources for cancelled requests are not freed.
--- ---
......
...@@ -38,6 +38,7 @@ pytestmark = [ ...@@ -38,6 +38,7 @@ pytestmark = [
pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME), pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
pytest.mark.post_merge, # post_merge to pinpoint failure commit pytest.mark.post_merge, # post_merge to pinpoint failure commit
pytest.mark.parametrize("request_plane", ["nats", "tcp"], indirect=True), pytest.mark.parametrize("request_plane", ["nats", "tcp"], indirect=True),
pytest.mark.xfail(reason="Cancellation is temporarily disabled", strict=True),
] ]
...@@ -253,9 +254,6 @@ def test_request_cancellation_trtllm_aggregated( ...@@ -253,9 +254,6 @@ def test_request_cancellation_trtllm_aggregated(
logger.info(f"{description} detected successfully") logger.info(f"{description} detected successfully")
@pytest.mark.xfail(
reason="Decode worker cancellation is temporarily disabled", strict=True
)
@pytest.mark.timeout(195) # 3x average @pytest.mark.timeout(195) # 3x average
def test_request_cancellation_trtllm_decode_cancel( def test_request_cancellation_trtllm_decode_cancel(
request, runtime_services_dynamic_ports, predownload_models request, runtime_services_dynamic_ports, predownload_models
...@@ -432,9 +430,6 @@ def test_request_cancellation_trtllm_prefill_cancel( ...@@ -432,9 +430,6 @@ def test_request_cancellation_trtllm_prefill_cancel(
) )
@pytest.mark.xfail(
reason="Decode worker cancellation is temporarily disabled", strict=True
)
@pytest.mark.xfail(reason="Test fails only on CI", strict=False) @pytest.mark.xfail(reason="Test fails only on CI", strict=False)
@pytest.mark.timeout(195) # 3x average @pytest.mark.timeout(195) # 3x average
def test_request_cancellation_trtllm_kv_transfer_cancel( def test_request_cancellation_trtllm_kv_transfer_cancel(
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment