Unverified commit 0086ebc6, authored by Ziqi Fan, committed by GitHub

fix: add dedicated llmapi config for trtllm disagg kv routing example (#916)

parent 49517f2a
@@ -143,8 +143,6 @@ dynamo serve graphs.disagg_router:Frontend -f ./configs/disagg_router.yaml
 We are defining TRTLLM_USE_UCX_KVCACHE so that TRTLLM uses UCX for transferring the KV
 cache between the context and generation workers.
-NOTE: currently disaggregated serving with KV Routing may not work; the prefix cache hit rate shows 0 when it should not.
 ### Client
 See the [client](../llm/README.md#client) section to learn how to send requests to the deployment.
...
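The TRTLLM_USE_UCX_KVCACHE variable mentioned above has to be present in the worker environment before the TRT-LLM engine starts. A minimal sketch of exporting it from Python around the `dynamo serve` command shown in the hunk header; enabling the flag with the value `"1"` is an assumption here, so check the example's launch scripts for the exact setting:

```python
import os
import subprocess

# Assumption: enabling the flag with "1"; TRT-LLM reads it at engine
# startup, so it must be set before the worker process is launched.
env = dict(os.environ, TRTLLM_USE_UCX_KVCACHE="1")
subprocess.run(
    ["dynamo", "serve", "graphs.disagg_router:Frontend",
     "-f", "./configs/disagg_router.yaml"],
    env=env,
    check=True,
)
```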
@@ -335,6 +335,10 @@ class BaseTensorrtLLMEngine:
         # Using 0 as default value. If later data has
         # lora_id, we need to verify if this is correct.
         lora_id = data.get("lora_id", 0)
+        logger.debug(
+            f"publish stored event: event_id: {event_id}, token_ids: {token_ids}, num_block_tokens: {num_block_tokens}, block_hashes: {block_hashes}, lora_id: {lora_id}, parent_hash: {parent_hash}"
+        )
         self._kv_event_publisher.publish_stored(
             event_id,
             token_ids,
@@ -353,6 +357,10 @@ class BaseTensorrtLLMEngine:
                 self._partial_block_hashes.remove(block_hash)
                 continue
             block_hashes.append(block_hash)
+        logger.debug(
+            f"publish removed event: event_id: {event_id}, block_hashes: {block_hashes}"
+        )
         self._kv_event_publisher.publish_removed(event_id, block_hashes)
         return True
...
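The two `logger.debug` calls added above are emitted at DEBUG level, so they stay hidden under the default log level. A minimal sketch of surfacing them when debugging KV routing; the logging setup is an assumption, not code from this repository:

```python
import logging

# Assumption: raising the root log level is enough to see the new
# "publish stored event" / "publish removed event" messages from
# BaseTensorrtLLMEngine; adjust the logger name if the worker
# configures its own.
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s %(name)s %(levelname)s %(message)s",
)
```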
@@ -31,7 +31,7 @@ Router:
 TensorRTLLMWorker:
   engine_args: "configs/llm_api_config_disagg_router.yaml"
-  llmapi-disaggregated-config: "configs/llmapi_disagg_configs/single_node_config.yaml"
+  llmapi-disaggregated-config: "configs/llmapi_disagg_router_configs/single_node_config.yaml"
   remote-prefill: true
   min-prefill-workers: 1
   router: kv
@@ -42,7 +42,7 @@ TensorRTLLMWorker:
 TensorRTLLMPrefillWorker:
   engine_args: "configs/llm_api_config_disagg_router.yaml"
-  llmapi-disaggregated-config: "configs/llmapi_disagg_configs/single_node_config.yaml"
+  llmapi-disaggregated-config: "configs/llmapi_disagg_router_configs/single_node_config.yaml"
   router: round-robin
   ServiceArgs:
     workers: 1
...
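The fix above repoints both workers from the shared `llmapi_disagg_configs` directory to a dedicated `llmapi_disagg_router_configs` directory, so the KV-routing example no longer reuses the non-router disaggregated config. A hypothetical sanity check (not part of the commit) that both worker sections reference the dedicated file:

```python
import yaml  # PyYAML

# Path assumed from the `dynamo serve ... -f ./configs/disagg_router.yaml`
# command shown earlier in the README excerpt.
with open("configs/disagg_router.yaml") as f:
    cfg = yaml.safe_load(f)

for worker in ("TensorRTLLMWorker", "TensorRTLLMPrefillWorker"):
    path = cfg[worker]["llmapi-disaggregated-config"]
    assert "llmapi_disagg_router_configs" in path, (worker, path)
    print(f"{worker}: {path}")
```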
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This will overwrite the llm_api_config.yaml
# TODO: Specifying the context and generation servers in the config file is
# a bit confusing. Investigate if we can clean this up.
hostname: localhost
port: 8080
trust_remote_code: true
backend: pytorch
context_servers:
num_instances: 1
tensor_parallel_size: 1
max_num_tokens: 10240
max_batch_size: 16
enable_chunked_prefill: false
kv_cache_config:
free_gpu_memory_fraction: 0.40
event_buffer_max_size: 1024
enable_block_reuse: true
pytorch_backend_config:
enable_overlap_scheduler: false
use_cuda_graph: false
enable_iter_perf_stats: true
urls:
- "localhost:8001"
generation_servers:
num_instances: 1
tensor_parallel_size: 1
max_num_tokens: 256
max_batch_size: 256
kv_cache_config:
free_gpu_memory_fraction: 0.40
event_buffer_max_size: 1024
enable_block_reuse: true
pytorch_backend_config:
enable_overlap_scheduler: true
use_cuda_graph: false
enable_iter_perf_stats: true
urls:
- "localhost:8002"
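Since this single-node config runs the context server (port 8001) and the generation server (port 8002) on the same host, each reserving `free_gpu_memory_fraction: 0.40` for KV cache, a quick check that the combined reservation stays below 1.0 can be useful when tuning these values. The file path and the assumption that both server groups may share a GPU are mine, not the commit's:

```python
import yaml  # PyYAML

# Assumed location of the new config added by this commit.
with open("configs/llmapi_disagg_router_configs/single_node_config.yaml") as f:
    cfg = yaml.safe_load(f)

total = sum(
    cfg[group]["kv_cache_config"]["free_gpu_memory_fraction"]
    for group in ("context_servers", "generation_servers")
)
print(f"combined KV-cache reservation: {total:.2f}")  # 0.80 for this config
assert total < 1.0, "fractions must leave headroom for model weights"
```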