Unverified Commit 5fe5a950 authored by Ryan McCormick's avatar Ryan McCormick Committed by GitHub
Browse files

feat: Add experimental WideEP + EPLB aggregated example for TRTLLM (#1652)

parent f11fc3f3
......@@ -14,7 +14,7 @@
# limitations under the License.
ARG BASE_IMAGE="nvcr.io/nvidia/pytorch"
ARG BASE_IMAGE_TAG="25.04-py3"
ARG BASE_IMAGE_TAG="25.05-py3"
ARG RELEASE_BUILD
# Define general architecture ARGs for supporting both x86 and aarch64 builds.
......
......@@ -58,7 +58,7 @@ BUILD_CONTEXT=$(dirname "$(readlink -f "$SOURCE_DIR")")
# Base Images
TENSORRTLLM_BASE_IMAGE=nvcr.io/nvidia/pytorch
TENSORRTLLM_BASE_IMAGE_TAG=25.04-py3
TENSORRTLLM_BASE_IMAGE_TAG=25.05-py3
# Important Note: Because of ABI compatibility issues between TensorRT-LLM and NGC PyTorch,
# we need to build the TensorRT-LLM wheel from source.
......@@ -94,7 +94,7 @@ TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL="0"
# TensorRT-LLM PyPI index URL
TENSORRTLLM_INDEX_URL="https://pypi.python.org/simple"
DEFAULT_TENSORRTLLM_PIP_WHEEL="tensorrt-llm==0.21.0rc0"
DEFAULT_TENSORRTLLM_PIP_WHEEL="tensorrt-llm==1.0.0rc0"
TENSORRTLLM_PIP_WHEEL=""
......
......@@ -52,7 +52,6 @@ class LLMAPIConfig:
def to_dict(self) -> Dict[str, Any]:
data = {
"pytorch_backend_config": self.pytorch_backend_config,
"kv_cache_config": self.kv_cache_config,
"speculative_config": self.speculative_config,
"skip_tokenizer_init": self.skip_tokenizer_init,
......@@ -62,6 +61,8 @@ class LLMAPIConfig:
return data
def update_sub_configs(self, other_config: Dict[str, Any]):
# TODO: Consider removing pytorch_backend_config parsing as this section
# was collapsed to top level config fields in recent TRTLLM versions.
if "pytorch_backend_config" in other_config:
self.pytorch_backend_config = PyTorchConfig(
**other_config["pytorch_backend_config"]
......
......@@ -110,6 +110,15 @@ export SERVED_MODEL_NAME="nvidia/DeepSeek-R1-FP4"
# the container. See the MOUNTS variable in srun_script.sh
export ENGINE_CONFIG="/mnt/agg_DEP16_dsr1.yaml"
# Customize NUM_NODES to match the desired parallelism in ENGINE_CONFIG
# The product of NUM_NODES*NUM_GPUS_PER_NODE should match the number of
# total GPUs necessary to satisfy the requested parallelism. For example,
# 4 nodes x 4 gpus/node = 16 gpus total for TP16/EP16.
export NUM_NODES=4
# GB200 nodes have 4 gpus per node, but for other types of nodes you can configure this.
export NUM_GPUS_PER_NODE=4
# Launches frontend + etcd/nats on current (head) node.
# Launches one large trtllm worker across multiple nodes via MPI tasks.
./srun_script.sh
......@@ -198,3 +207,5 @@ pkill srun
H100 nodes with FP8 weights, but this hasn't been tested yet.
- This example only tests an aggregated model setup for now. A disaggregated
serving example will be added in the near future.
- WideEP configs in this directory are still being tested. A WideEP-specific
example with documentation will be added once ready.
......@@ -8,7 +8,7 @@ max_batch_size: 256
max_num_tokens: 256
max_seq_len: 8448
kv_cache_config:
free_gpu_memory_fraction: 0.8
free_gpu_memory_fraction: 0.7
use_cuda_graph: true
cuda_graph_padding_enabled: true
cuda_graph_batch_sizes:
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
backend: pytorch
# WideEP related settings
moe_backend: WideEP
# moe_max_num_tokens will default to max_num_tokens if left unspecified.
#
# If you want to set this value explicitly, one recommendation is below:
# moe_max_num_tokens = max_batch_size * moe_expert_parallel_size
# 4096 = 256 * 16
# moe_max_num_tokens: 4096
moe_load_balancer: /mnt/eplb.yaml
# 36 TP/EP following example from:
# https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/ep_load_balancer/README.md
tensor_parallel_size: 36
moe_expert_parallel_size: 36
enable_attention_dp: true
max_batch_size: 256
max_num_tokens: 256
max_seq_len: 8448
kv_cache_config:
free_gpu_memory_fraction: 0.7
use_cuda_graph: true
cuda_graph_padding_enabled: true
cuda_graph_batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
kv_cache_dtype: fp8
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# moe_load_balancer settings for TRTLLM based on:
# https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/ep_load_balancer/README.md#online-ep-load-balancer
num_slots: 288
layer_updates_per_iter: 2
......@@ -14,9 +14,9 @@ DEFAULT_MOUNT="${PWD}:/mnt"
MOUNTS="${MOUNTS:-${DEFAULT_MOUNT}}"
# Example values, assuming 4 nodes with 4 GPUs on each node, such as 4xGB200 nodes.
# For 8xH100 nodes as an example, you may set this to 2 nodes x 16 gpus, or 4 nodes x 32 gpus instead.
NUM_NODES=4
NUM_GPUS_TOTAL=16
# For 8xH100 nodes as an example, you may set this to 2 nodes x 8 gpus/node instead.
NUM_NODES=${NUM_NODES:-4}
NUM_GPUS_PER_NODE=${NUM_GPUS_PER_NODE:-4}
# Automate settings of certain variables for convenience, but you are free
# to manually set these for more control as well.
......@@ -66,6 +66,6 @@ srun \
-A "${ACCOUNT}" \
-J "${ACCOUNT}-dynamo.trtllm" \
--nodes "${NUM_NODES}" \
--ntasks "${NUM_GPUS_TOTAL}" \
--ntasks-per-node "${NUM_GPUS_PER_NODE}" \
--jobid "${SLURM_JOB_ID}" \
/mnt/start_trtllm_worker.sh &
......@@ -355,10 +355,6 @@ async def init(runtime: DistributedRuntime, config: Config):
"tensor_parallel_size": config.tensor_parallel_size,
"backend": "pytorch",
"skip_tokenizer_init": True,
"disable_log_requests": True,
"enable_prefix_caching": True,
# KV routing relies on logging KV metrics
"disable_log_stats": False,
}
if config.extra_engine_args != "":
# TODO: Support extra engine args from json file as well.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment