feat: Add experimental WideEP + EPLB aggregated example for TRTLLM (#1652)

5fe5a950 · Ryan McCormick · GitHub · f11fc3f3 · 5fe5a950 · 5fe5a950
Unverified Commit 5fe5a950 authored Jun 27, 2025 by Ryan McCormick Committed by GitHub Jun 26, 2025
9 changed files
--- a/container/Dockerfile.tensorrt_llm
+++ b/container/Dockerfile.tensorrt_llm
@@ -14,7 +14,7 @@
 # limitations under the License.

 ARG BASE_IMAGE="nvcr.io/nvidia/pytorch"
-ARG BASE_IMAGE_TAG="25.04-py3"
+ARG BASE_IMAGE_TAG="25.05-py3"
 ARG RELEASE_BUILD

 # Define general architecture ARGs for supporting both x86 and aarch64 builds.

--- a/container/build.sh
+++ b/container/build.sh
@@ -58,7 +58,7 @@ BUILD_CONTEXT=$(dirname "$(readlink -f "$SOURCE_DIR")")

 # Base Images
 TENSORRTLLM_BASE_IMAGE=nvcr.io/nvidia/pytorch
-TENSORRTLLM_BASE_IMAGE_TAG=25.04-py3
+TENSORRTLLM_BASE_IMAGE_TAG=25.05-py3

 # Important Note: Because of ABI compatibility issues between TensorRT-LLM and NGC PyTorch,
 # we need to build the TensorRT-LLM wheel from source.
@@ -94,7 +94,7 @@ TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL="0"

 # TensorRT-LLM PyPI index URL
 TENSORRTLLM_INDEX_URL="https://pypi.python.org/simple"
-DEFAULT_TENSORRTLLM_PIP_WHEEL="tensorrt-llm==0.21.0rc0"
+DEFAULT_TENSORRTLLM_PIP_WHEEL="tensorrt-llm==1.0.0rc0"
 TENSORRTLLM_PIP_WHEEL=""



--- a/examples/tensorrt_llm/common/parser.py
+++ b/examples/tensorrt_llm/common/parser.py
@@ -52,7 +52,6 @@ class LLMAPIConfig:

    def to_dict(self) -> Dict[str, Any]:
        data = {
-            "pytorch_backend_config": self.pytorch_backend_config,
            "kv_cache_config": self.kv_cache_config,
            "speculative_config": self.speculative_config,
            "skip_tokenizer_init": self.skip_tokenizer_init,
@@ -62,6 +61,8 @@ class LLMAPIConfig:
        return data

    def update_sub_configs(self, other_config: Dict[str, Any]):
+        # TODO: Consider removing pytorch_backend_config parsing as this section
+        # was collapsed to top level config fields in recent TRTLLM versions.
        if "pytorch_backend_config" in other_config:
            self.pytorch_backend_config = PyTorchConfig(
                **other_config["pytorch_backend_config"]

--- a/examples/tensorrt_llm/configs/deepseek_r1/multinode/README.md
+++ b/examples/tensorrt_llm/configs/deepseek_r1/multinode/README.md
@@ -110,6 +110,15 @@ export SERVED_MODEL_NAME="nvidia/DeepSeek-R1-FP4"
 # the container. See the MOUNTS variable in srun_script.sh
 export ENGINE_CONFIG="/mnt/agg_DEP16_dsr1.yaml"

+# Customize NUM_NODES to match the desired parallelism in ENGINE_CONFIG
+# The product of NUM_NODES*NUM_GPUS_PER_NODE should match the number of
+# total GPUs necessary to satisfy the requested parallelism. For example,
+# 4 nodes x 4 gpus/node = 16 gpus total for TP16/EP16.
+export NUM_NODES=4
+
+# GB200 nodes have 4 gpus per node, but for other types of nodes you can configure this.
+export NUM_GPUS_PER_NODE=4
+
 # Launches frontend + etcd/nats on current (head) node.
 # Launches one large trtllm worker across multiple nodes via MPI tasks.
 ./srun_script.sh
@@ -198,3 +207,5 @@ pkill srun
  H100 nodes with FP8 weights, but this hasn't been tested yet.
 - This example only tests an aggregated model setup for now. A disaggregated
  serving example will be added in the near future.
+- WideEP configs in this directory are still being tested. A WideEP-specific
+  example with documentation will be added once ready.
--- a/examples/tensorrt_llm/configs/deepseek_r1/multinode/agg_DEP16_dsr1.yaml
+++ b/examples/tensorrt_llm/configs/deepseek_r1/multinode/agg_DEP16_dsr1.yaml
@@ -8,7 +8,7 @@ max_batch_size: 256
 max_num_tokens: 256
 max_seq_len: 8448
 kv_cache_config:
-  free_gpu_memory_fraction: 0.8
+  free_gpu_memory_fraction: 0.7
 use_cuda_graph: true
 cuda_graph_padding_enabled: true
 cuda_graph_batch_sizes:

--- a/examples/tensorrt_llm/configs/deepseek_r1/multinode/agg_wide_ep.yaml
+++ b/examples/tensorrt_llm/configs/deepseek_r1/multinode/agg_wide_ep.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+backend: pytorch
+
+# WideEP related settings
+moe_backend: WideEP
+# moe_max_num_tokens will default to max_num_tokens if left unspecified.
+#
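+# If you want to set this value explicitly, one recommendation is below:
+#   moe_max_num_tokens = max_batch_size * moe_expert_parallel_size
+#   e.g. 4096 = 256 * 16 for an EP16 deployment; with the EP36 setting used
+#   in this file, the same formula gives 9216 = 256 * 36.
+# moe_max_num_tokens: 4096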
+moe_load_balancer: /mnt/eplb.yaml
+# 36 TP/EP following example from:
+# https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/ep_load_balancer/README.md
+tensor_parallel_size: 36
+moe_expert_parallel_size: 36
+
+enable_attention_dp: true
+max_batch_size: 256
+max_num_tokens: 256
+max_seq_len: 8448
+kv_cache_config:
+  free_gpu_memory_fraction: 0.7
+use_cuda_graph: true
+cuda_graph_padding_enabled: true
+cuda_graph_batch_sizes:
+- 1
+- 2
+- 4
+- 8
+- 16
+- 32
+- 64
+- 128
+- 256
+kv_cache_dtype: fp8
--- a/examples/tensorrt_llm/configs/deepseek_r1/multinode/eplb.yaml
+++ b/examples/tensorrt_llm/configs/deepseek_r1/multinode/eplb.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# moe_load_balancer settings for TRTLLM based on:
+# https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/ep_load_balancer/README.md#online-ep-load-balancer
+num_slots: 288
+layer_updates_per_iter: 2
--- a/examples/tensorrt_llm/configs/deepseek_r1/multinode/srun_script.sh
+++ b/examples/tensorrt_llm/configs/deepseek_r1/multinode/srun_script.sh
@@ -14,9 +14,9 @@ DEFAULT_MOUNT="${PWD}:/mnt"
 MOUNTS="${MOUNTS:-${DEFAULT_MOUNT}}"

 # Example values, assuming 4 nodes with 4 GPUs on each node, such as 4xGB200 nodes.
-# For 8xH100 nodes as an example, you may set this to 2 nodes x 16 gpus, or 4 nodes x 32 gpus instead.
-NUM_NODES=4
-NUM_GPUS_TOTAL=16
+# For 8xH100 nodes as an example, you may set this to 2 nodes x 8 gpus/node instead.
+NUM_NODES=${NUM_NODES:-4}
+NUM_GPUS_PER_NODE=${NUM_GPUS_PER_NODE:-4}

 # Automate settings of certain variables for convenience, but you are free
 # to manually set these for more control as well.
@@ -66,6 +66,6 @@ srun \
  -A "${ACCOUNT}" \
  -J "${ACCOUNT}-dynamo.trtllm" \
  --nodes "${NUM_NODES}" \
-  --ntasks "${NUM_GPUS_TOTAL}" \
+  --ntasks-per-node "${NUM_GPUS_PER_NODE}" \
  --jobid "${SLURM_JOB_ID}" \
  /mnt/start_trtllm_worker.sh &
--- a/launch/dynamo-run/src/subprocess/trtllm_inc.py
+++ b/launch/dynamo-run/src/subprocess/trtllm_inc.py
@@ -355,10 +355,6 @@ async def init(runtime: DistributedRuntime, config: Config):
        "tensor_parallel_size": config.tensor_parallel_size,
        "backend": "pytorch",
        "skip_tokenizer_init": True,
-        "disable_log_requests": True,
-        "enable_prefix_caching": True,
-        # KV routing relies on logging KV metrics
-        "disable_log_stats": False,
    }
    if config.extra_engine_args != "":
        # TODO: Support extra engine args from json file as well.