Unverified commit 32eaecb4, authored by Jacky, committed by GitHub
Browse files

fix: Set max_tokens_in_buffer on TRT-LLM cache transceiver (#4703)


Signed-off-by: Jacky <18255193+kthui@users.noreply.github.com>
parent 59e6873e
...@@ -2,12 +2,12 @@ ...@@ -2,12 +2,12 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
""" """
Test Execution Times (Last Run: 2025-12-09): Test Execution Times (Last Run: 2025-12-13):
- test_request_cancellation_trtllm_aggregated: ~45s (gpu_1) - test_request_cancellation_trtllm_aggregated: ~45s (gpu_1)
- test_request_cancellation_trtllm_decode_cancel: ~115s (gpu_1) - test_request_cancellation_trtllm_decode_cancel: ~65s (gpu_1)
- test_request_cancellation_trtllm_prefill_cancel: ~115s (gpu_1) - test_request_cancellation_trtllm_prefill_cancel: ~65s (gpu_1)
- test_request_cancellation_trtllm_kv_transfer_cancel: ~115s (gpu_1, xfail) - test_request_cancellation_trtllm_kv_transfer_cancel: ~65s (gpu_1)
- Total: ~390s (0:06:30) - Total: ~240s x2 request planes = ~480s (0:08:00)
""" """
import logging import logging
...@@ -72,8 +72,6 @@ class DynamoWorkerProcess(ManagedProcess): ...@@ -72,8 +72,6 @@ class DynamoWorkerProcess(ManagedProcess):
FAULT_TOLERANCE_MODEL_NAME, FAULT_TOLERANCE_MODEL_NAME,
"--disaggregation-mode", "--disaggregation-mode",
mode, mode,
"--free-gpu-memory-fraction",
"0.45",
"--max-seq-len", "--max-seq-len",
"16384", "16384",
"--max-num-tokens", "--max-num-tokens",
...@@ -83,8 +81,11 @@ class DynamoWorkerProcess(ManagedProcess): ...@@ -83,8 +81,11 @@ class DynamoWorkerProcess(ManagedProcess):
] ]
if mode != "prefill_and_decode": if mode != "prefill_and_decode":
with open("test_request_cancellation_trtllm_config.yaml", "w") as f: with open("test_request_cancellation_trtllm_config.yaml", "w") as f:
f.write("cache_transceiver_config:\n backend: DEFAULT\n") f.write(
"cache_transceiver_config:\n backend: DEFAULT\n max_tokens_in_buffer: 16384\n"
)
f.write("disable_overlap_scheduler: true\n") f.write("disable_overlap_scheduler: true\n")
f.write("kv_cache_config:\n max_tokens: 16384\n")
command += [ command += [
"--extra-engine-args", "--extra-engine-args",
"test_request_cancellation_trtllm_config.yaml", "test_request_cancellation_trtllm_config.yaml",
...@@ -164,7 +165,7 @@ class DynamoWorkerProcess(ManagedProcess): ...@@ -164,7 +165,7 @@ class DynamoWorkerProcess(ManagedProcess):
return super().__exit__(exc_type, exc_val, exc_tb) return super().__exit__(exc_type, exc_val, exc_tb)
@pytest.mark.timeout(140) # 3x average @pytest.mark.timeout(135) # 3x average
def test_request_cancellation_trtllm_aggregated( def test_request_cancellation_trtllm_aggregated(
request, runtime_services_dynamic_ports, predownload_models request, runtime_services_dynamic_ports, predownload_models
): ):
...@@ -251,7 +252,7 @@ def test_request_cancellation_trtllm_aggregated( ...@@ -251,7 +252,7 @@ def test_request_cancellation_trtllm_aggregated(
logger.info(f"{description} detected successfully") logger.info(f"{description} detected successfully")
@pytest.mark.timeout(350) # 3x average @pytest.mark.timeout(195) # 3x average
def test_request_cancellation_trtllm_decode_cancel( def test_request_cancellation_trtllm_decode_cancel(
request, runtime_services_dynamic_ports, predownload_models request, runtime_services_dynamic_ports, predownload_models
): ):
...@@ -335,7 +336,7 @@ def test_request_cancellation_trtllm_decode_cancel( ...@@ -335,7 +336,7 @@ def test_request_cancellation_trtllm_decode_cancel(
) )
@pytest.mark.timeout(350) # 3x average @pytest.mark.timeout(195) # 3x average
def test_request_cancellation_trtllm_prefill_cancel( def test_request_cancellation_trtllm_prefill_cancel(
request, runtime_services_dynamic_ports, predownload_models request, runtime_services_dynamic_ports, predownload_models
): ):
...@@ -427,11 +428,8 @@ def test_request_cancellation_trtllm_prefill_cancel( ...@@ -427,11 +428,8 @@ def test_request_cancellation_trtllm_prefill_cancel(
) )
@pytest.mark.timeout(350) # 3x average @pytest.mark.xfail(reason="Test fails only on CI", strict=False)
@pytest.mark.xfail( @pytest.mark.timeout(195) # 3x average
reason="May fail due to unknown reason with TRT-LLM or backend implementation",
strict=False,
)
def test_request_cancellation_trtllm_kv_transfer_cancel( def test_request_cancellation_trtllm_kv_transfer_cancel(
request, runtime_services_dynamic_ports, predownload_models request, runtime_services_dynamic_ports, predownload_models
): ):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment