Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
03d976c7
Unverified
Commit
03d976c7
authored
Jun 26, 2025
by
Tanmay Verma
Committed by
GitHub
Jun 26, 2025
Browse files
refactor: Refactor the TRTLLM example components and improve UI (#1654)
Signed-off-by:
Tanmay Verma
<
tanmayv@nvidia.com
>
parent
8a2d6529
Changes
29
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
52 additions
and
258 deletions
+52
-258
examples/tensorrt_llm/configs/disagg.yaml
examples/tensorrt_llm/configs/disagg.yaml
+13
-6
examples/tensorrt_llm/configs/disagg_router.yaml
examples/tensorrt_llm/configs/disagg_router.yaml
+12
-6
examples/tensorrt_llm/configs/engine_configs/agg_config.yaml
examples/tensorrt_llm/configs/engine_configs/agg_config.yaml
+0
-9
examples/tensorrt_llm/configs/engine_configs/decode_config.yaml
...es/tensorrt_llm/configs/engine_configs/decode_config.yaml
+12
-5
examples/tensorrt_llm/configs/engine_configs/prefill_config.yaml
...s/tensorrt_llm/configs/engine_configs/prefill_config.yaml
+13
-4
examples/tensorrt_llm/configs/llmapi_disagg_configs/single_node_config.yaml
...llm/configs/llmapi_disagg_configs/single_node_config.yaml
+0
-54
examples/tensorrt_llm/configs/llmapi_disagg_router_configs/single_node_config.yaml
...figs/llmapi_disagg_router_configs/single_node_config.yaml
+0
-60
examples/tensorrt_llm/engines/trtllm_engine.py
examples/tensorrt_llm/engines/trtllm_engine.py
+0
-112
tests/serve/test_dynamo_serve.py
tests/serve/test_dynamo_serve.py
+2
-2
No files found.
examples/tensorrt_llm/configs/disagg.yaml
View file @
03d976c7
...
...
@@ -20,22 +20,29 @@ Frontend:
router
:
round-robin
TensorRTLLMWorker
:
# Path to disk model or HuggingFace model identifier to load
model-path
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
# Name to serve the model under
served_model_name
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
engine_args
:
"
configs/llm_api_config.yaml"
llmapi-disaggregated-config
:
"
configs/llmapi_disagg_configs/single_node_config.yaml"
# Path to a YAML file containing additional keyword arguments to pass to the TRTLLM engine.
# The fields in `extra-engine-args` take priority over the TRTLLM engine fields above.
extra-engine-args
:
"
configs/engine_configs/decode_config.yaml"
enable-disagg
:
true
router
:
round-robin
remote-prefill
:
true
min-prefill-workers
:
1
ServiceArgs
:
workers
:
1
resources
:
gpu
:
1
TensorRTLLMPrefillWorker
:
engine_args
:
"
configs/llm_api_config.yaml"
llmapi-disaggregated-config
:
"
configs/llmapi_disagg_configs/single_node_config.yaml"
# Path to disk model or HuggingFace model identifier to load
model-path
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
# Path to a YAML file containing additional keyword arguments to pass to the TRTLLM engine.
# The fields in `extra-engine-args` take priority over the TRTLLM engine fields above.
extra-engine-args
:
"
configs/engine_configs/prefill_config.yaml"
router
:
round-robin
ServiceArgs
:
workers
:
1
resources
:
gpu
:
1
examples/tensorrt_llm/configs/disagg_router.yaml
View file @
03d976c7
...
...
@@ -20,20 +20,26 @@ Frontend:
router
:
kv
TensorRTLLMWorker
:
# Path to disk model or HuggingFace model identifier to load
model-path
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
# Name to serve the model under
served_model_name
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
engine_args
:
"
configs/llm_api_config_router.yaml"
llmapi-disaggregated-config
:
"
configs/llmapi_disagg_router_configs/single_node_config.yaml"
# Path to a YAML file containing additional keyword arguments to pass to the TRTLLM engine.
# The fields in `extra-engine-args` take priority over the TRTLLM engine fields above.
extra-engine-args
:
"
configs/engine_configs/decode_config.yaml"
enable-disagg
:
true
router
:
kv
remote-prefill
:
true
min-prefill-workers
:
1
ServiceArgs
:
workers
:
1
resources
:
gpu
:
1
TensorRTLLMPrefillWorker
:
engine_args
:
"
configs/llm_api_config_router.yaml"
llmapi-disaggregated-config
:
"
configs/llmapi_disagg_router_configs/single_node_config.yaml"
# Path to disk model or HuggingFace model identifier to load
model-path
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
# Path to a YAML file containing additional keyword arguments to pass to the TRTLLM engine.
# The fields in `extra-engine-args` take priority over the TRTLLM engine fields above.
extra-engine-args
:
"
configs/engine_configs/prefill_config.yaml"
router
:
round-robin
ServiceArgs
:
workers
:
1
...
...
examples/tensorrt_llm/configs/
llm_api
_config.yaml
→
examples/tensorrt_llm/configs/
engine_configs/agg
_config.yaml
View file @
03d976c7
...
...
@@ -12,15 +12,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# In the case of disaggregated deployment, this config will apply to each server
# and will be overwritten by the disaggregated config file
# TODO: figure out how to generate this from the service config or vice versa
model_name
:
"
deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
model_path
:
null
tensor_parallel_size
:
1
moe_expert_parallel_size
:
1
enable_attention_dp
:
false
...
...
examples/tensorrt_llm/
graphs/disagg_router.py
→
examples/tensorrt_llm/
configs/engine_configs/decode_config.yaml
View file @
03d976c7
...
...
@@ -12,9 +12,16 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
tensor_parallel_size
:
1
moe_expert_parallel_size
:
1
enable_attention_dp
:
false
max_num_tokens
:
8192
max_batch_size
:
16
trust_remote_code
:
true
backend
:
pytorch
enable_chunked_prefill
:
true
disable_overlap_scheduler
:
false
use_cuda_graph
:
true
kv_cache_config
:
free_gpu_memory_fraction
:
0.95
from
components.frontend
import
Frontend
from
components.prefill_worker
import
TensorRTLLMPrefillWorker
from
components.worker
import
TensorRTLLMWorker
Frontend
.
link
(
TensorRTLLMWorker
).
link
(
TensorRTLLMPrefillWorker
)
examples/tensorrt_llm/
graphs/agg_router.py
→
examples/tensorrt_llm/
configs/engine_configs/prefill_config.yaml
View file @
03d976c7
...
...
@@ -12,8 +12,17 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
tensor_parallel_size
:
1
moe_expert_parallel_size
:
1
enable_attention_dp
:
false
max_num_tokens
:
8192
max_batch_size
:
16
trust_remote_code
:
true
backend
:
pytorch
enable_chunked_prefill
:
true
# Overlap scheduler not currently supported in prefill only workers.
disable_overlap_scheduler
:
true
use_cuda_graph
:
false
from
components.frontend
import
Frontend
from
components.worker
import
TensorRTLLMWorker
Frontend
.
link
(
TensorRTLLMWorker
)
kv_cache_config
:
free_gpu_memory_fraction
:
0.95
examples/tensorrt_llm/configs/llmapi_disagg_configs/single_node_config.yaml
deleted
100644 → 0
View file @
8a2d6529
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This will overwrite the llm_api_config.yaml
# TODO: Specifying the context and generation servers in the config file is
# a bit confusing. Investigate whether we can clean this up.
hostname
:
localhost
port
:
8080
trust_remote_code
:
true
backend
:
pytorch
context_servers
:
num_instances
:
1
tensor_parallel_size
:
1
max_num_tokens
:
10240
max_batch_size
:
16
enable_chunked_prefill
:
false
kv_cache_config
:
free_gpu_memory_fraction
:
0.75
# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
# NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
# Overlap scheduler not currently supported in context-only
disable_overlap_scheduler
:
true
use_cuda_graph
:
false
urls
:
-
"
localhost:8001"
generation_servers
:
num_instances
:
1
tensor_parallel_size
:
1
max_num_tokens
:
256
max_batch_size
:
256
kv_cache_config
:
free_gpu_memory_fraction
:
0.75
# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
# NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
disable_overlap_scheduler
:
false
use_cuda_graph
:
false
urls
:
-
"
localhost:8002"
examples/tensorrt_llm/configs/llmapi_disagg_router_configs/single_node_config.yaml
deleted
100644 → 0
View file @
8a2d6529
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This will overwrite the llm_api_config.yaml
# TODO: Specifying the context and generation servers in the config file is
# a bit confusing. Investigate whether we can clean this up.
hostname
:
localhost
port
:
8080
trust_remote_code
:
true
backend
:
pytorch
context_servers
:
num_instances
:
1
tensor_parallel_size
:
1
max_num_tokens
:
10240
max_batch_size
:
16
enable_chunked_prefill
:
false
kv_cache_config
:
free_gpu_memory_fraction
:
0.75
event_buffer_max_size
:
1024
enable_block_reuse
:
true
# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
# NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
# Overlap scheduler not currently supported in context-only
disable_overlap_scheduler
:
true
use_cuda_graph
:
false
enable_iter_perf_stats
:
true
urls
:
-
"
localhost:8001"
generation_servers
:
num_instances
:
1
tensor_parallel_size
:
1
max_num_tokens
:
256
max_batch_size
:
256
kv_cache_config
:
free_gpu_memory_fraction
:
0.75
event_buffer_max_size
:
1024
enable_block_reuse
:
true
# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
# NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
disable_overlap_scheduler
:
false
use_cuda_graph
:
false
enable_iter_perf_stats
:
true
urls
:
-
"
localhost:8002"
examples/tensorrt_llm/engines/trtllm_engine.py
deleted
100644 → 0
View file @
8a2d6529
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
IMPORTANT:
- This is only supposed to be used by dynamo-run launcher.
- It is part of bring-your-own-engine python feature in dynamo-run.
"""
import
json
import
os
import
sys
from
pathlib
import
Path
from
tensorrt_llm.logger
import
logger
from
dynamo.runtime
import
dynamo_endpoint
# Make the directory one level above this file's folder importable, so the
# `common.*` imports that follow resolve when this file is run as a script.
project_root = str(Path(__file__).parents[1])
if not (project_root in sys.path):
    sys.path.append(project_root)
from
common.base_engine
import
BaseTensorrtLLMEngine
,
get_sampling_params
# noqa: E402
from
common.chat_processor
import
ChatProcessorMixin
# noqa: E402
from
common.parser
import
LLMAPIConfig
,
parse_dynamo_run_args
# noqa: E402
from
common.protocol
import
(
# noqa: E402
DynamoTRTLLMChatCompletionRequest
,
DynamoTRTLLMChatCompletionStreamResponse
,
)
from
common.utils
import
ServerType
# noqa: E402
# Log verbosity is taken from DYN_TRTLLM_LOG_LEVEL and falls back to "info".
logger.set_level(os.environ.get("DYN_TRTLLM_LOG_LEVEL", "info"))
class Processor(ChatProcessorMixin):
    """Chat processor that always enables ``using_engine_generator`` on the mixin."""

    def __init__(self, engine_config: LLMAPIConfig):
        """Initialize the mixin with ``engine_config`` and ``using_engine_generator=True``."""
        super().__init__(engine_config, using_engine_generator=True)

    def preprocess(self, request):
        """Delegate request preprocessing to ``ChatProcessorMixin`` unchanged."""
        return super().preprocess(request)

    def postprocess(self, engine_generator, request, conversation):
        """Delegate postprocessing of the engine output to ``ChatProcessorMixin`` unchanged."""
        return super().postprocess(engine_generator, request, conversation)
async def chat_generator(engine: BaseTensorrtLLMEngine, request):
    """Stream chat-completion chunks for ``request`` from ``engine``.

    The request is preprocessed by ``engine.processor.chat_processor``, handed to
    ``engine._llm_engine.generate_async`` with ``streaming=True`` and no
    disaggregated params, and each postprocessed chunk is parsed with
    ``DynamoTRTLLMChatCompletionStreamResponse.model_validate_json`` before it
    is yielded.

    Args:
        engine: Engine wrapper exposing ``_llm_engine`` and ``processor``.
        request: Incoming chat request, forwarded unchanged to the chat processor.

    Yields:
        The response chunk re-serialized with ``exclude_unset=True`` and decoded
        back into plain JSON-compatible Python data.

    Raises:
        RuntimeError: If ``engine`` is ``None`` or its ``_llm_engine`` is ``None``.
    """
    # The module-level `engine` is None until init_global_engine() has run, so a
    # missing wrapper is reported the same way as a missing underlying LLM engine
    # rather than surfacing as an AttributeError on None.
    if engine is None or engine._llm_engine is None:
        raise RuntimeError("Engine not initialized")

    logger.debug(f"Received chat request: {request}")
    chat_processor = engine.processor.chat_processor
    preprocessed_request = await chat_processor.preprocess(request)

    engine_generator = engine._llm_engine.generate_async(
        inputs=preprocessed_request.prompt,
        sampling_params=get_sampling_params(preprocessed_request.sampling_params),
        disaggregated_params=None,
        streaming=True,
    )

    async for raw_response in chat_processor.postprocess(
        engine_generator, request, preprocessed_request.conversation
    ):
        response = DynamoTRTLLMChatCompletionStreamResponse.model_validate_json(
            raw_response
        )
        # Round-trip through JSON so the caller receives plain JSON-compatible data.
        yield json.loads(response.model_dump_json(exclude_unset=True))
class DynamoTRTLLMEngine(BaseTensorrtLLMEngine):
    """
    Request handler for the generate endpoint
    """

    def __init__(self, engine_config: LLMAPIConfig):
        """Configure the base engine as ``ServerType.DYN_RUN`` and start it.

        A ``Processor`` built from the same ``engine_config`` is attached as
        ``self.processor`` before ``_init_engine()`` is called.
        """
        super().__init__(
            engine_config=engine_config,
            server_type=ServerType.DYN_RUN,
        )
        self.processor = Processor(engine_config)
        # Initialize the engine as the final construction step.
        self._init_engine()
engine
=
None
# Global variable to store the engine instance. This is initialized in the main function.
def init_global_engine(args, engine_config):
    """Create the module-level ``engine`` from ``engine_config``.

    Args:
        args: Parsed launcher arguments; only logged at debug level here.
        engine_config: Engine configuration passed to ``DynamoTRTLLMEngine``.
    """
    global engine

    logger.debug(f"Received args: {args}")
    logger.info(f"Initializing global engine with engine config: {engine_config}")

    engine = DynamoTRTLLMEngine(engine_config)
@dynamo_endpoint(
    DynamoTRTLLMChatCompletionRequest,
    DynamoTRTLLMChatCompletionStreamResponse,
)
async def generate(request):
    """Endpoint entry point: relay every chunk ``chat_generator`` yields for the global engine."""
    async for chunk in chat_generator(engine, request):
        yield chunk
if __name__ == "__main__":
    # Script entry: parse the dynamo-run arguments, then build the global engine from them.
    args, engine_config = parse_dynamo_run_args()
    init_global_engine(args, engine_config)
tests/serve/test_dynamo_serve.py
View file @
03d976c7
...
...
@@ -199,7 +199,7 @@ deployment_graphs = {
),
"trtllm_agg_router"
:
(
DeploymentGraph
(
module
=
"graphs.agg
_router
:Frontend"
,
module
=
"graphs.agg:Frontend"
,
config
=
"configs/agg_router.yaml"
,
directory
=
"/workspace/examples/tensorrt_llm"
,
endpoints
=
[
"v1/chat/completions"
,
"v1/completions"
],
...
...
@@ -231,7 +231,7 @@ deployment_graphs = {
),
"trtllm_disagg_router"
:
(
DeploymentGraph
(
module
=
"graphs.disagg
_router
:Frontend"
,
module
=
"graphs.disagg:Frontend"
,
config
=
"configs/disagg_router.yaml"
,
directory
=
"/workspace/examples/tensorrt_llm"
,
endpoints
=
[
"v1/chat/completions"
,
"v1/completions"
],
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment