fix: Fix message truncation in disagg flow (#1572)

84454ab4 · Tanmay Verma · GitHub · 4abab20f · 84454ab4 · 84454ab4
Unverified commit 84454ab4, authored Jun 17, 2025 by Tanmay Verma; committed by GitHub on Jun 17, 2025.
2 changed files
--- a/examples/tensorrt_llm/configs/llmapi_disagg_configs/single_node_config.yaml
+++ b/examples/tensorrt_llm/configs/llmapi_disagg_configs/single_node_config.yaml
@@ -30,9 +30,7 @@ context_servers:
  max_batch_size: 16
  enable_chunked_prefill: false
  kv_cache_config:
-    free_gpu_memory_fraction: 0.40
-  cache_transceiver_config:
-    max_num_tokens: 10240
+    free_gpu_memory_fraction: 0.75
  # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
  # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
  # Overlap scheduler not currently supported in context-only
@@ -47,9 +45,7 @@ generation_servers:
  max_num_tokens: 256
  max_batch_size: 256
  kv_cache_config:
-    free_gpu_memory_fraction: 0.40
-  cache_transceiver_config:
-    max_num_tokens: 256
+    free_gpu_memory_fraction: 0.75
  # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
  # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
  disable_overlap_scheduler: false

--- a/examples/tensorrt_llm/configs/llmapi_disagg_router_configs/single_node_config.yaml
+++ b/examples/tensorrt_llm/configs/llmapi_disagg_router_configs/single_node_config.yaml
@@ -30,11 +30,9 @@ context_servers:
  max_batch_size: 16
  enable_chunked_prefill: false
  kv_cache_config:
-    free_gpu_memory_fraction: 0.40
+    free_gpu_memory_fraction: 0.75
    event_buffer_max_size: 1024
    enable_block_reuse: true
-  cache_transceiver_config:
-    max_num_tokens: 10240
  # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
  # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
  # Overlap scheduler not currently supported in context-only
@@ -50,11 +48,9 @@ generation_servers:
  max_num_tokens: 256
  max_batch_size: 256
  kv_cache_config:
-    free_gpu_memory_fraction: 0.40
+    free_gpu_memory_fraction: 0.75
    event_buffer_max_size: 1024
    enable_block_reuse: true
-  cache_transceiver_config:
-    max_num_tokens: 256
  # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
  # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
  disable_overlap_scheduler: false