refactor: Refactor the TRTLLM examples remove dynamo SDK (#1884)

901715b5 · Tanmay Verma · GitHub · 5bf23d54 · 901715b5 · 901715b5
Unverified Commit 901715b5 authored Jul 14, 2025 by Tanmay Verma Committed by GitHub Jul 15, 2025
20 changed files
--- a/examples/tensorrt_llm/configs/engine_configs/decode_config.yaml
+++ b/examples/tensorrt_llm/configs/engine_configs/decode_config.yaml
@@ -24,4 +24,3 @@ disable_overlap_scheduler: false
 use_cuda_graph: true
 kv_cache_config:
  free_gpu_memory_fraction: 0.95
-
--- a/examples/tensorrt_llm/configs/deepseek_r1/mtp/engine_configs/agg_config.yaml
+++ b/examples/tensorrt_llm/configs/deepseek_r1/mtp/engine_configs/agg_config.yaml
--- a/examples/tensorrt_llm/configs/deepseek_r1/mtp/engine_configs/decode_config.yaml
+++ b/examples/tensorrt_llm/configs/deepseek_r1/mtp/engine_configs/decode_config.yaml
--- a/examples/tensorrt_llm/configs/deepseek_r1/mtp/engine_configs/prefill_config.yaml
+++ b/examples/tensorrt_llm/configs/deepseek_r1/mtp/engine_configs/prefill_config.yaml
--- a/examples/tensorrt_llm/configs/deepseek_r1/engine_configs/agg_config.yaml
+++ b/examples/tensorrt_llm/configs/deepseek_r1/engine_configs/agg_config.yaml
--- a/examples/tensorrt_llm/configs/deepseek_r1/engine_configs/decode_config.yaml
+++ b/examples/tensorrt_llm/configs/deepseek_r1/engine_configs/decode_config.yaml
--- a/examples/tensorrt_llm/configs/deepseek_r1/engine_configs/prefill_config.yaml
+++ b/examples/tensorrt_llm/configs/deepseek_r1/engine_configs/prefill_config.yaml
--- a/examples/tensorrt_llm/configs/deepseek_r1/multinode/engine_configs/dep16_agg.yaml
+++ b/examples/tensorrt_llm/configs/deepseek_r1/multinode/engine_configs/dep16_agg.yaml
--- a/examples/tensorrt_llm/configs/deepseek_r1/multinode/engine_configs/eplb.yaml
+++ b/examples/tensorrt_llm/configs/deepseek_r1/multinode/engine_configs/eplb.yaml
--- a/examples/tensorrt_llm/configs/deepseek_r1/multinode/engine_configs/wide_ep_agg.yaml
+++ b/examples/tensorrt_llm/configs/deepseek_r1/multinode/engine_configs/wide_ep_agg.yaml
@@ -10,7 +10,7 @@ moe_backend: WideEP
 #   moe_max_num_tokens = max_batch_size * moe_expert_parallel_size
 #   4096 = 256 * 16
 # moe_max_num_tokens: 4096
-moe_load_balancer: /mnt/engine_configs/eplb.yaml
+moe_load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml
 tensor_parallel_size: 16
 moe_expert_parallel_size: 16


--- a/examples/tensorrt_llm/configs/deepseek_r1/multinode/engine_configs/wide_ep_decode.yaml
+++ b/examples/tensorrt_llm/configs/deepseek_r1/multinode/engine_configs/wide_ep_decode.yaml
@@ -16,7 +16,7 @@ backend: pytorch

 # WideEP related settings
 moe_backend: WideEP
-moe_load_balancer: /mnt/engine_configs/eplb.yaml
+moe_load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml

 # TP/EP/PP/DP
 tensor_parallel_size: 16

--- a/examples/tensorrt_llm/configs/deepseek_r1/multinode/engine_configs/wide_ep_prefill.yaml
+++ b/examples/tensorrt_llm/configs/deepseek_r1/multinode/engine_configs/wide_ep_prefill.yaml
@@ -16,7 +16,7 @@ backend: pytorch

 # WideEP related settings
 moe_backend: WideEP
-moe_load_balancer: /mnt/engine_configs/eplb.yaml
+moe_load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml

 # TP/EP/PP/DP
 tensor_parallel_size: 16

--- a/examples/tensorrt_llm/configs/llama4/eagle/engine_configs/agg_config.yaml
+++ b/examples/tensorrt_llm/configs/llama4/eagle/engine_configs/agg_config.yaml
--- a/examples/tensorrt_llm/configs/llama4/eagle/engine_configs/decode_config.yaml
+++ b/examples/tensorrt_llm/configs/llama4/eagle/engine_configs/decode_config.yaml
--- a/examples/tensorrt_llm/configs/llama4/eagle/engine_configs/prefill_config.yaml
+++ b/examples/tensorrt_llm/configs/llama4/eagle/engine_configs/prefill_config.yaml
--- a/examples/tensorrt_llm/configs/engine_configs/prefill_config.yaml
+++ b/examples/tensorrt_llm/configs/engine_configs/prefill_config.yaml
--- a/examples/tensorrt_llm/graphs/agg.py
+++ b/examples/tensorrt_llm/graphs/agg.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from components.frontend import Frontend
-from components.worker import TensorRTLLMWorker
-
-Frontend.link(TensorRTLLMWorker)
--- a/examples/tensorrt_llm/graphs/disagg.py
+++ b/examples/tensorrt_llm/graphs/disagg.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from components.frontend import Frontend
-from components.prefill_worker import TensorRTLLMPrefillWorker
-from components.worker import TensorRTLLMWorker
-
-Frontend.link(TensorRTLLMWorker).link(TensorRTLLMPrefillWorker)
--- a/examples/tensorrt_llm/kv-cache-tranfer.md
+++ b/examples/tensorrt_llm/kv-cache-tranfer.md
+<!--
+SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: Apache-2.0
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+
+
+# KV Cache Transfer in Disaggregated Serving
+
+In disaggregated serving architectures, KV cache must be transferred between prefill and decode workers. TensorRT-LLM supports two methods for this transfer:
+
+## Default Method: UCX
+By default, TensorRT-LLM uses UCX (Unified Communication X) for KV cache transfer between prefill and decode workers. UCX provides high-performance communication optimized for GPU-to-GPU transfers.
+
+## Experimental Method: NIXL
+TensorRT-LLM also provides experimental support for using **NIXL** (NVIDIA Inference Xfer Library) for KV cache transfer. [NIXL](https://github.com/ai-dynamo/nixl) is NVIDIA's high-performance communication library designed for efficient data transfer in distributed GPU environments.
+
+**Note:** NIXL support in TensorRT-LLM is experimental and is not suitable for production environments yet.
+
+## Using NIXL for KV Cache Transfer
+
+**Note:** NIXL backend for TensorRT-LLM is currently only supported on AMD64 (x86_64) architecture. If you're running on ARM64, you'll need to use the default UCX method for KV cache transfer.
+
+To enable NIXL for KV cache transfer in disaggregated serving:
+
+1. **Build the container with NIXL support:**
+   The TensorRT-LLM wheel must be built from source with NIXL support. The `./container/build.sh` script caches previously built TensorRT-LLM wheels to reduce build time. If you have previously built a TensorRT-LLM wheel without NIXL support, you must delete the cached wheel to force a rebuild with NIXL support.
+
+   **Remove cached TensorRT-LLM wheel (only if previously built without NIXL support):**
+   ```bash
+   rm -rf /tmp/trtllm_wheel
+   ```
+
+   **Build the container with NIXL support:**
+   ```bash
+   ./container/build.sh --framework tensorrtllm \
+     --use-default-experimental-tensorrtllm-commit \
+     --trtllm-use-nixl-kvcache-experimental
+   ```
+
+   **Note:** Both `--use-default-experimental-tensorrtllm-commit` and `--trtllm-use-nixl-kvcache-experimental` flags are required to enable NIXL support.
+
+2. **Run the containerized environment:**
+   See the [run container](./README.md#run-container) section to learn how to start the container image built in the previous step.
+
+3. **Start the disaggregated service:**
+   See the [disaggregated serving](./README.md#disaggregated-serving) section to learn how to start the deployment.
+
+4. **Send the request:**
+   See the [client](./README.md#client) section to learn how to send a request to the deployment.
+
+**Important:** Ensure that ETCD and NATS services are running before starting the service.
+
+The container automatically sets the appropriate environment variable (`TRTLLM_USE_NIXL_KVCACHE=1`) when built with the NIXL flag. The same container image can still use UCX for KV cache transfer; to do so, override the setting before starting the service:
+```bash
+unset TRTLLM_USE_NIXL_KVCACHE
+export TRTLLM_USE_UCX_KVCACHE=1
+```
\ No newline at end of file
--- a/examples/tensorrt_llm/launch/agg.sh
+++ b/examples/tensorrt_llm/launch/agg.sh
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Environment variables with defaults
+export MODEL_PATH=${MODEL_PATH:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"}
+export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"}
+export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"engine_configs/agg.yaml"}
+
+# Setup cleanup trap
+cleanup() {
+    echo "Cleaning up background processes..."
+    kill $DYNAMO_PID 2>/dev/null || true
+    wait $DYNAMO_PID 2>/dev/null || true
+    echo "Cleanup complete."
+}
+trap cleanup EXIT INT TERM
+
+# run clear_namespace
+python3 utils/clear_namespace.py --namespace dynamo
+
+# run ingress
+dynamo run in=http out=dyn --http-port=8000 &
+DYNAMO_PID=$!
+
+# run worker
+python3 components/worker.py \
+  --model-path "$MODEL_PATH" \
+  --served-model-name "$SERVED_MODEL_NAME" \
+  --extra-engine-args "$AGG_ENGINE_ARGS"