Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
b6774b88
Unverified
Commit
b6774b88
authored
May 15, 2025
by
Ryan McCormick
Committed by
GitHub
May 15, 2025
Browse files
chore: Add example TRTLLM configs for Deepseek R1 (GB200) (#1099)
parent
a462280e
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
228 additions
and
0 deletions
+228
-0
examples/tensorrt_llm/configs/deepseek_r1/agg.yaml
examples/tensorrt_llm/configs/deepseek_r1/agg.yaml
+36
-0
examples/tensorrt_llm/configs/deepseek_r1/agg_llm_api_config.yaml
.../tensorrt_llm/configs/deepseek_r1/agg_llm_api_config.yaml
+58
-0
examples/tensorrt_llm/configs/deepseek_r1/disagg.yaml
examples/tensorrt_llm/configs/deepseek_r1/disagg.yaml
+53
-0
examples/tensorrt_llm/configs/deepseek_r1/disagg_llm_api_config.yaml
...nsorrt_llm/configs/deepseek_r1/disagg_llm_api_config.yaml
+81
-0
No files found.
examples/tensorrt_llm/configs/deepseek_r1/agg.yaml
0 → 100644
View file @
b6774b88
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Aggregated (single worker type) Dynamo service graph for DeepSeek-R1 FP4.
# Each top-level key configures one service.

Frontend:
  # This is the client-facing model name, you can set this to anything you'd like.
  served_model_name: "nvidia/DeepSeek-R1-FP4"
  endpoint_chat: dynamo.Processor.chat/completions
  endpoint_completions: dynamo.Processor.completions
  port: 8000

Processor:
  engine_args: "configs/deepseek_r1/agg_llm_api_config.yaml"
  router: round-robin
  # Parallelize preprocessing/tokenization to avoid bottlenecks
  ServiceArgs:
    workers: 5

TensorRTLLMWorker:
  # Same engine config file as the Processor above.
  engine_args: "configs/deepseek_r1/agg_llm_api_config.yaml"
  router: round-robin
  ServiceArgs:
    workers: 1
    resources:
      # NOTE(review): presumably must cover the parallelism configured in the
      # engine config (tensor_parallel_size) - confirm when changing either.
      gpu: 4
examples/tensorrt_llm/configs/deepseek_r1/agg_llm_api_config.yaml
0 → 100644
View file @
b6774b88
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# NOTE: FP4 only supported starting with Blackwell GPUs.
# https://huggingface.co/nvidia/DeepSeek-R1-FP4
# You can also specify the full path to locally downloaded weights
# instead of a HuggingFace ID here.
model_name: "nvidia/DeepSeek-R1-FP4"
backend: pytorch

# TP/EP/PP/DP
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
enable_attention_dp: false

max_batch_size: 256
# 8448 = 8192 ISL + 256 OSL
max_num_tokens: 8448
max_seq_len: 8448

kv_cache_config:
  # With dp attention disabled: high free_gpu_memory_fraction is fine.
  free_gpu_memory_fraction: 0.85
  # With dp attention enabled: large ISL at high concurrency may need
  # free_gpu_memory_fraction low to have enough available memory.
  # free_gpu_memory_fraction: 0.30

pytorch_backend_config:
  use_cuda_graph: true
  cuda_graph_padding_enabled: true
  # NOTE: For larger max batch size, you may want to add larger cuda graph
  # batch sizes below to match.
  # The largest entry currently equals max_batch_size (256).
  cuda_graph_batch_sizes:
    - 1
    - 2
    - 4
    - 8
    - 16
    - 32
    - 64
    - 128
    - 256
  print_iter_log: true
  enable_overlap_scheduler: true
  kv_cache_dtype: fp8
examples/tensorrt_llm/configs/deepseek_r1/disagg.yaml
0 → 100644
View file @
b6774b88
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Disaggregated (separate prefill and decode workers) Dynamo service graph for
# DeepSeek-R1 FP4. Each top-level key configures one service.

Frontend:
  # This is the client-facing model name, you can set this to anything you'd like.
  served_model_name: "nvidia/DeepSeek-R1-FP4"
  endpoint_chat: dynamo.Processor.chat/completions
  endpoint_completions: dynamo.Processor.completions
  port: 8000

Processor:
  # NOTE(review): every service in this file points engine_args at the
  # aggregated config; the workers additionally reference the disaggregated
  # config via llmapi-disaggregated-config. Confirm this pairing is intended.
  engine_args: "configs/deepseek_r1/agg_llm_api_config.yaml"
  router: round-robin
  remote-prefill: true
  # Parallelize preprocessing/tokenization to avoid bottlenecks
  ServiceArgs:
    workers: 5

TensorRTLLMWorker:
  engine_args: "configs/deepseek_r1/agg_llm_api_config.yaml"
  llmapi-disaggregated-config: "configs/deepseek_r1/disagg_llm_api_config.yaml"
  remote-prefill: true
  # NOTE: When testing/benchmarking multiple prefill workers, you can set
  # this number to the exact amount of prefill workers if you want Dynamo to
  # wait until all the prefill workers are ready before marking the decode
  # worker ready.
  min-prefill-workers: 1
  router: round-robin
  ServiceArgs:
    workers: 1
    resources:
      gpu: 4

TensorRTLLMPrefillWorker:
  engine_args: "configs/deepseek_r1/agg_llm_api_config.yaml"
  llmapi-disaggregated-config: "configs/deepseek_r1/disagg_llm_api_config.yaml"
  router: round-robin
  ServiceArgs:
    workers: 1
    resources:
      gpu: 4
examples/tensorrt_llm/configs/deepseek_r1/disagg_llm_api_config.yaml
0 → 100644
View file @
b6774b88
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Example Configs for Context & Generation on GB200 nodes
# - Context on 1xGB200 (4xB200)
# - Generation on 1xGB200 (4xB200)
# NOTE: Fields like hostname, ports, urls, num_instances, etc. only used by trtllm-serve, not by dynamo
backend: pytorch

# Settings applied to the context (prefill) side.
context_servers:
  # Context/prefill processes many tokens at once, so for a large ISL, a large
  # batch size may not be needed to saturate GPU utilization.
  max_batch_size: 1
  max_num_tokens: 8192
  max_seq_len: 8192

  # TP/EP/PP/DP
  tensor_parallel_size: 4
  moe_expert_parallel_size: 4
  pipeline_parallel_size: 1
  enable_attention_dp: true

  free_gpu_memory_fraction: 0.75

  pytorch_backend_config:
    print_iter_log: true
    # NOTE: This dtype must match in both context/generation configs
    kv_cache_dtype: fp8

# Settings applied to the generation (decode) side.
generation_servers:
  # Generation/decode processes one token per request at a time, so a larger
  # batch size helps to saturate GPU utilization.
  max_batch_size: 256
  max_num_tokens: 256
  # 8448 = 8192 ISL + 256 OSL
  max_seq_len: 8448

  # TP/EP/PP/DP
  tensor_parallel_size: 4
  moe_expert_parallel_size: 4
  pipeline_parallel_size: 1
  enable_attention_dp: false

  # With dp attention disabled: high free_gpu_memory_fraction is fine.
  free_gpu_memory_fraction: 0.85
  # With dp attention enabled: large ISL at high concurrency may need
  # free_gpu_memory_fraction low to have enough available memory.
  # free_gpu_memory_fraction: 0.30

  pytorch_backend_config:
    use_cuda_graph: true
    cuda_graph_padding_enabled: true
    # NOTE: For larger max batch size, you may want to add larger cuda graph
    # batch sizes below to match.
    cuda_graph_batch_sizes:
      - 1
      - 2
      - 4
      - 8
      - 16
      - 32
      - 64
      - 128
      - 256
    print_iter_log: true
    enable_overlap_scheduler: true
    # NOTE: This dtype must match in both context/generation configs
    kv_cache_dtype: fp8
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment