Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
5fe5a950
Unverified
Commit
5fe5a950
authored
Jun 27, 2025
by
Ryan McCormick
Committed by
GitHub
Jun 26, 2025
Browse files
feat: Add experimental WideEP + EPLB aggregated example for TRTLLM (#1652)
parent
f11fc3f3
Changes
9
Show whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
65 additions
and
13 deletions
+65
-13
container/Dockerfile.tensorrt_llm
container/Dockerfile.tensorrt_llm
+1
-1
container/build.sh
container/build.sh
+2
-2
examples/tensorrt_llm/common/parser.py
examples/tensorrt_llm/common/parser.py
+2
-1
examples/tensorrt_llm/configs/deepseek_r1/multinode/README.md
...ples/tensorrt_llm/configs/deepseek_r1/multinode/README.md
+11
-0
examples/tensorrt_llm/configs/deepseek_r1/multinode/agg_DEP16_dsr1.yaml
...rrt_llm/configs/deepseek_r1/multinode/agg_DEP16_dsr1.yaml
+1
-1
examples/tensorrt_llm/configs/deepseek_r1/multinode/agg_wide_ep.yaml
...nsorrt_llm/configs/deepseek_r1/multinode/agg_wide_ep.yaml
+37
-0
examples/tensorrt_llm/configs/deepseek_r1/multinode/eplb.yaml
...ples/tensorrt_llm/configs/deepseek_r1/multinode/eplb.yaml
+7
-0
examples/tensorrt_llm/configs/deepseek_r1/multinode/srun_script.sh
...tensorrt_llm/configs/deepseek_r1/multinode/srun_script.sh
+4
-4
launch/dynamo-run/src/subprocess/trtllm_inc.py
launch/dynamo-run/src/subprocess/trtllm_inc.py
+0
-4
No files found.
container/Dockerfile.tensorrt_llm
View file @
5fe5a950
...
...
@@ -14,7 +14,7 @@
# limitations under the License.
ARG BASE_IMAGE="nvcr.io/nvidia/pytorch"
ARG BASE_IMAGE_TAG="25.0
4
-py3"
ARG BASE_IMAGE_TAG="25.0
5
-py3"
ARG RELEASE_BUILD
# Define general architecture ARGs for supporting both x86 and aarch64 builds.
...
...
container/build.sh
View file @
5fe5a950
...
...
@@ -58,7 +58,7 @@ BUILD_CONTEXT=$(dirname "$(readlink -f "$SOURCE_DIR")")
# Base Images
TENSORRTLLM_BASE_IMAGE
=
nvcr.io/nvidia/pytorch
TENSORRTLLM_BASE_IMAGE_TAG
=
25.0
4
-py3
TENSORRTLLM_BASE_IMAGE_TAG
=
25.0
5
-py3
# Important Note: Because of ABI compatibility issues between TensorRT-LLM and NGC PyTorch,
# we need to build the TensorRT-LLM wheel from source.
...
...
@@ -94,7 +94,7 @@ TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL="0"
# TensorRT-LLM PyPI index URL
TENSORRTLLM_INDEX_URL
=
"https://pypi.python.org/simple"
DEFAULT_TENSORRTLLM_PIP_WHEEL
=
"tensorrt-llm==
0.21
.0rc0"
DEFAULT_TENSORRTLLM_PIP_WHEEL
=
"tensorrt-llm==
1.0
.0rc0"
TENSORRTLLM_PIP_WHEEL
=
""
...
...
examples/tensorrt_llm/common/parser.py
View file @
5fe5a950
...
...
@@ -52,7 +52,6 @@ class LLMAPIConfig:
def
to_dict
(
self
)
->
Dict
[
str
,
Any
]:
data
=
{
"pytorch_backend_config"
:
self
.
pytorch_backend_config
,
"kv_cache_config"
:
self
.
kv_cache_config
,
"speculative_config"
:
self
.
speculative_config
,
"skip_tokenizer_init"
:
self
.
skip_tokenizer_init
,
...
...
@@ -62,6 +61,8 @@ class LLMAPIConfig:
return
data
def
update_sub_configs
(
self
,
other_config
:
Dict
[
str
,
Any
]):
# TODO: Consider removing pytorch_backend_config parsing as this section
# was collapsed to top level config fields in recent TRTLLM versions.
if
"pytorch_backend_config"
in
other_config
:
self
.
pytorch_backend_config
=
PyTorchConfig
(
**
other_config
[
"pytorch_backend_config"
]
...
...
examples/tensorrt_llm/configs/deepseek_r1/multinode/README.md
View file @
5fe5a950
...
...
@@ -110,6 +110,15 @@ export SERVED_MODEL_NAME="nvidia/DeepSeek-R1-FP4"
# the container. See the MOUNTS variable in srun_script.sh
export
ENGINE_CONFIG
=
"/mnt/agg_DEP16_dsr1.yaml"
# Customize NUM_NODES to match the desired parallelism in ENGINE_CONFIG
# The product of NUM_NODES*NUM_GPUS_PER_NODE should match the number of
# total GPUs necessary to satisfy the requested parallelism. For example,
# 4 nodes x 4 gpus/node = 16 gpus total for TP16/EP16.
export
NUM_NODES
=
4
# GB200 nodes have 4 gpus per node, but for other types of nodes you can configure this.
export
NUM_GPUS_PER_NODE
=
4
# Launches frontend + etcd/nats on current (head) node.
# Launches one large trtllm worker across multiple nodes via MPI tasks.
./srun_script.sh
...
...
@@ -198,3 +207,5 @@ pkill srun
H100 nodes with FP8 weights, but this hasn't been tested yet.
- This example only tests an aggregated model setup for now. A disaggregated
serving example will be added in the near future.
- WideEP configs in this directory are still being tested. A WideEP-specific
example with documentation will be added once ready.
examples/tensorrt_llm/configs/deepseek_r1/multinode/agg_DEP16_dsr1.yaml
View file @
5fe5a950
...
...
@@ -8,7 +8,7 @@ max_batch_size: 256
max_num_tokens
:
256
max_seq_len
:
8448
kv_cache_config
:
free_gpu_memory_fraction
:
0.
8
free_gpu_memory_fraction
:
0.
7
use_cuda_graph
:
true
cuda_graph_padding_enabled
:
true
cuda_graph_batch_sizes
:
...
...
examples/tensorrt_llm/configs/deepseek_r1/multinode/agg_wide_ep.yaml
0 → 100644
View file @
5fe5a950
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
backend
:
pytorch
# WideEP related settings
moe_backend
:
WideEP
# moe_max_num_tokens will default to max_num_tokens if left unspecified.
#
# If you want to set this value explicitly, one recommendation is below:
# moe_max_num_tokens = max_batch_size * moe_expert_parallel_size
# e.g. 4096 = 256 * 16 for EP16; with the EP36 settings below this would be 9216 = 256 * 36
# moe_max_num_tokens: 4096
moe_load_balancer
:
/mnt/eplb.yaml
# 36 TP/EP following example from:
# https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/ep_load_balancer/README.md
tensor_parallel_size
:
36
moe_expert_parallel_size
:
36
enable_attention_dp
:
true
max_batch_size
:
256
max_num_tokens
:
256
max_seq_len
:
8448
kv_cache_config
:
free_gpu_memory_fraction
:
0.7
use_cuda_graph
:
true
cuda_graph_padding_enabled
:
true
cuda_graph_batch_sizes
:
-
1
-
2
-
4
-
8
-
16
-
32
-
64
-
128
-
256
kv_cache_dtype
:
fp8
examples/tensorrt_llm/configs/deepseek_r1/multinode/eplb.yaml
0 → 100644
View file @
5fe5a950
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# moe_load_balancer settings for TRTLLM based on:
# https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/ep_load_balancer/README.md#online-ep-load-balancer
num_slots
:
288
layer_updates_per_iter
:
2
examples/tensorrt_llm/configs/deepseek_r1/multinode/srun_script.sh
View file @
5fe5a950
...
...
@@ -14,9 +14,9 @@ DEFAULT_MOUNT="${PWD}:/mnt"
MOUNTS
=
"
${
MOUNTS
:-${
DEFAULT_MOUNT
}}
"
# Example values, assuming 4 nodes with 4 GPUs on each node, such as 4xGB200 nodes.
# For 8xH100 nodes as an example, you may set this to 2 nodes x
16
gpus
, or 4 nodes x 32 gpus
instead.
NUM_NODES
=
4
NUM_GPUS_
TOTAL
=
16
# For 8xH100 nodes as an example, you may set this to 2 nodes x
8
gpus
/node
instead.
NUM_NODES
=
${
NUM_NODES
:-
4
}
NUM_GPUS_
PER_NODE
=
${
NUM_GPUS_PER_NODE
:-
4
}
# Automate settings of certain variables for convenience, but you are free
# to manually set these for more control as well.
...
...
@@ -66,6 +66,6 @@ srun \
-A
"
${
ACCOUNT
}
"
\
-J
"
${
ACCOUNT
}
-dynamo.trtllm"
\
--nodes
"
${
NUM_NODES
}
"
\
--ntasks
"
${
NUM_GPUS_
TOTAL
}
"
\
--ntasks
-per-node
"
${
NUM_GPUS_
PER_NODE
}
"
\
--jobid
"
${
SLURM_JOB_ID
}
"
\
/mnt/start_trtllm_worker.sh &
launch/dynamo-run/src/subprocess/trtllm_inc.py
View file @
5fe5a950
...
...
@@ -355,10 +355,6 @@ async def init(runtime: DistributedRuntime, config: Config):
"tensor_parallel_size"
:
config
.
tensor_parallel_size
,
"backend"
:
"pytorch"
,
"skip_tokenizer_init"
:
True
,
"disable_log_requests"
:
True
,
"enable_prefix_caching"
:
True
,
# KV routing relies on logging KV metrics
"disable_log_stats"
:
False
,
}
if
config
.
extra_engine_args
!=
""
:
# TODO: Support extra engine args from json file as well.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment