docs: update dynamo serve trtllm agg example yaml files (#600)

34be4418 · Ziqi Fan · GitHub · bb4e819c · 34be4418 · 34be4418
Commit 34be4418, authored Apr 10, 2025 by Ziqi Fan; committed by GitHub Apr 10, 2025
4 changed files
--- a/examples/tensorrt_llm/configs/agg.yaml
+++ b/examples/tensorrt_llm/configs/agg.yaml
@@ -20,12 +20,11 @@ Frontend:
 Processor:
  engine_args: "configs/llm_api_config.yaml"
-  block-size: 64
  router: round-robin
 TensorRTLLMWorker:
  engine_args: "configs/llm_api_config.yaml"
-  router: random
+  router: round-robin
  ServiceArgs:
    workers: 1
    resources:

--- a/examples/tensorrt_llm/configs/agg_router.yaml
+++ b/examples/tensorrt_llm/configs/agg_router.yaml
@@ -20,7 +20,6 @@ Frontend:
 Processor:
  engine_args: "configs/llm_api_config.yaml"
-  block-size: 64
  router: kv
 Router:
@@ -28,7 +27,7 @@ Router:
  min-workers: 1
 TensorRTLLMWorker:
-  engine_args: "configs/llm_api_config.yaml"
+  engine_args: "configs/llm_api_config_router.yaml"
  router: kv
  ServiceArgs:
    workers: 1

--- a/examples/tensorrt_llm/configs/llm_api_config.yaml
+++ b/examples/tensorrt_llm/configs/llm_api_config.yaml
@@ -22,19 +22,15 @@ model_path: null
 tensor_parallel_size: 1
 moe_expert_parallel_size: 1
 enable_attention_dp: false
-max_num_tokens: 10240
+max_num_tokens: 8192
 max_batch_size: 16
 trust_remote_code: true
 backend: pytorch
+enable_chunked_prefill: true
 kv_cache_config:
  free_gpu_memory_fraction: 0.95
-  # Uncomment to enable kv cache event collection
-  #event_buffer_max_size: 1024
-  #enable_block_reuse: true
 pytorch_backend_config:
-  enable_overlap_scheduler: false
+  enable_overlap_scheduler: true
-  use_cuda_graph: false
+  use_cuda_graph: true
-  # Uncomment to enable iter perf stats
-  #enable_iter_perf_stats: true
\ No newline at end of file
--- a/examples/tensorrt_llm/configs/llm_api_config_router.yaml
+++ b/examples/tensorrt_llm/configs/llm_api_config_router.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# In the case of a disaggregated deployment, this config applies to each server
+# and its settings are overridden by the disaggregated config file.
+model_name: "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
+model_path: null
+tensor_parallel_size: 1
+moe_expert_parallel_size: 1
+enable_attention_dp: false
+max_num_tokens: 8192
+max_batch_size: 16
+trust_remote_code: true
+backend: pytorch
+enable_chunked_prefill: true
+kv_cache_config:
+  free_gpu_memory_fraction: 0.95
+  event_buffer_max_size: 1024
+  enable_block_reuse: true
+pytorch_backend_config:
+  enable_overlap_scheduler: true
+  use_cuda_graph: true
+  enable_iter_perf_stats: true