Unverified commit 0086ebc6, authored by Ziqi Fan, committed by GitHub

fix: add dedicated llmapi config for trtllm disagg kv routing example (#916)

parent 49517f2a
@@ -143,8 +143,6 @@ dynamo serve graphs.disagg_router:Frontend -f ./configs/disagg_router.yaml
 We are defining TRTLLM_USE_UCX_KVCACHE so that TRTLLM uses UCX for transferring the KV
 cache between the context and generation workers.
-NOTE: currently disaggregated serving with KV Routing may not work; the prefix cache hit rate shows 0 when it should not.
 ### Client
 See the [client](../llm/README.md#client) section to learn how to send requests to the deployment.
...
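The TRTLLM_USE_UCX_KVCACHE variable mentioned above has to be present in the worker environment before the TRT-LLM engine starts. A minimal sketch of exporting it from Python around the `dynamo serve` command shown in the hunk header; enabling the flag with the value `"1"` is an assumption here, so check the example's launch scripts for the exact setting:

```python
import os
import subprocess

# Assumption: enabling the flag with "1"; TRT-LLM reads it at engine
# startup, so it must be set before the worker process is launched.
env = dict(os.environ, TRTLLM_USE_UCX_KVCACHE="1")
subprocess.run(
    ["dynamo", "serve", "graphs.disagg_router:Frontend",
     "-f", "./configs/disagg_router.yaml"],
    env=env,
    check=True,
)
```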
@@ -335,6 +335,10 @@ class BaseTensorrtLLMEngine:
         # Using 0 as default value. If later data has
         # lora_id, we need to verify if this is correct.
         lora_id = data.get("lora_id", 0)
+        logger.debug(
+            f"publish stored event: event_id: {event_id}, token_ids: {token_ids}, num_block_tokens: {num_block_tokens}, block_hashes: {block_hashes}, lora_id: {lora_id}, parent_hash: {parent_hash}"
+        )
         self._kv_event_publisher.publish_stored(
             event_id,
             token_ids,
@@ -353,6 +357,10 @@ class BaseTensorrtLLMEngine:
                 self._partial_block_hashes.remove(block_hash)
                 continue
             block_hashes.append(block_hash)
+        logger.debug(
+            f"publish removed event: event_id: {event_id}, block_hashes: {block_hashes}"
+        )
         self._kv_event_publisher.publish_removed(event_id, block_hashes)
         return True
...
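The two `logger.debug` calls added above are emitted at DEBUG level, so they stay hidden under the default log level. A minimal sketch of surfacing them when debugging KV routing; the logging setup is an assumption, not code from this repository:

```python
import logging

# Assumption: raising the root log level is enough to see the new
# "publish stored event" / "publish removed event" messages from
# BaseTensorrtLLMEngine; adjust the logger name if the worker
# configures its own.
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s %(name)s %(levelname)s %(message)s",
)
```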
@@ -31,7 +31,7 @@ Router:
 TensorRTLLMWorker:
   engine_args: "configs/llm_api_config_disagg_router.yaml"
-  llmapi-disaggregated-config: "configs/llmapi_disagg_configs/single_node_config.yaml"
+  llmapi-disaggregated-config: "configs/llmapi_disagg_router_configs/single_node_config.yaml"
   remote-prefill: true
   min-prefill-workers: 1
   router: kv
@@ -42,7 +42,7 @@ TensorRTLLMWorker:
 TensorRTLLMPrefillWorker:
   engine_args: "configs/llm_api_config_disagg_router.yaml"
-  llmapi-disaggregated-config: "configs/llmapi_disagg_configs/single_node_config.yaml"
+  llmapi-disaggregated-config: "configs/llmapi_disagg_router_configs/single_node_config.yaml"
   router: round-robin
   ServiceArgs:
     workers: 1
...
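The fix above repoints both workers from the shared `llmapi_disagg_configs` directory to a dedicated `llmapi_disagg_router_configs` directory, so the KV-routing example no longer reuses the non-router disaggregated config. A hypothetical sanity check (not part of the commit) that both worker sections reference the dedicated file:

```python
import yaml  # PyYAML

# Path assumed from the `dynamo serve ... -f ./configs/disagg_router.yaml`
# command shown earlier in the README excerpt.
with open("configs/disagg_router.yaml") as f:
    cfg = yaml.safe_load(f)

for worker in ("TensorRTLLMWorker", "TensorRTLLMPrefillWorker"):
    path = cfg[worker]["llmapi-disaggregated-config"]
    assert "llmapi_disagg_router_configs" in path, (worker, path)
    print(f"{worker}: {path}")
```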
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This will overwrite the llm_api_config.yaml
# TODO: Specifying the context and generation servers in the config file is
# a bit confusing. Investigate if we can clean this up.
hostname: localhost
port: 8080
trust_remote_code: true
backend: pytorch
context_servers:
num_instances: 1
tensor_parallel_size: 1
max_num_tokens: 10240
max_batch_size: 16
enable_chunked_prefill: false
kv_cache_config:
free_gpu_memory_fraction: 0.40
event_buffer_max_size: 1024
enable_block_reuse: true
pytorch_backend_config:
enable_overlap_scheduler: false
use_cuda_graph: false
enable_iter_perf_stats: true
urls:
- "localhost:8001"
generation_servers:
num_instances: 1
tensor_parallel_size: 1
max_num_tokens: 256
max_batch_size: 256
kv_cache_config:
free_gpu_memory_fraction: 0.40
event_buffer_max_size: 1024
enable_block_reuse: true
pytorch_backend_config:
enable_overlap_scheduler: true
use_cuda_graph: false
enable_iter_perf_stats: true
urls:
- "localhost:8002"
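Since this single-node config runs the context server (port 8001) and the generation server (port 8002) on the same host, each reserving `free_gpu_memory_fraction: 0.40` for KV cache, a quick check that the combined reservation stays below 1.0 can be useful when tuning these values. The file path and the assumption that both server groups may share a GPU are mine, not the commit's:

```python
import yaml  # PyYAML

# Assumed location of the new config added by this commit.
with open("configs/llmapi_disagg_router_configs/single_node_config.yaml") as f:
    cfg = yaml.safe_load(f)

total = sum(
    cfg[group]["kv_cache_config"]["free_gpu_memory_fraction"]
    for group in ("context_servers", "generation_servers")
)
print(f"combined KV-cache reservation: {total:.2f}")  # 0.80 for this config
assert total < 1.0, "fractions must leave headroom for model weights"
```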