refactor: Refactor the TRTLLM example components and improve UI (#1654)

Signed-off-by: Tanmay Verma <tanmayv@nvidia.com>

refactor: Refactor the TRTLLM example components and improve UI (#1654)
Signed-off-by: Tanmay Verma <tanmayv@nvidia.com>
03d976c7 · Tanmay Verma · GitHub · 8a2d6529 · 03d976c7 · 03d976c7
Unverified Commit 03d976c7 authored Jun 26, 2025 by Tanmay Verma Committed by GitHub Jun 26, 2025
9 changed files
--- a/examples/tensorrt_llm/configs/disagg.yaml
+++ b/examples/tensorrt_llm/configs/disagg.yaml
@@ -20,22 +20,29 @@ Frontend:
  router: round-robin

 TensorRTLLMWorker:
+  # Path to disk model or HuggingFace model identifier to load
+  model-path: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  # Name to serve the model under
  served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
-  engine_args: "configs/llm_api_config.yaml"
-  llmapi-disaggregated-config: "configs/llmapi_disagg_configs/single_node_config.yaml"
+  # Path to a YAML file containing additional keyword arguments to pass to the TRTLLM engine.
+  # Fields in `extra-engine-args` take precedence over the TRTLLM engine fields above.
+  extra-engine-args: "configs/engine_configs/decode_config.yaml"
+  enable-disagg: true
  router: round-robin
-  remote-prefill: true
-  min-prefill-workers: 1
  ServiceArgs:
    workers: 1
    resources:
      gpu: 1

 TensorRTLLMPrefillWorker:
-  engine_args: "configs/llm_api_config.yaml"
-  llmapi-disaggregated-config: "configs/llmapi_disagg_configs/single_node_config.yaml"
+  # Path to disk model or HuggingFace model identifier to load
+  model-path: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  # Path to a YAML file containing additional keyword arguments to pass to the TRTLLM engine.
+  # Fields in `extra-engine-args` take precedence over the TRTLLM engine fields above.
+  extra-engine-args: "configs/engine_configs/prefill_config.yaml"
  router: round-robin
  ServiceArgs:
    workers: 1
    resources:
      gpu: 1
+
--- a/examples/tensorrt_llm/configs/disagg_router.yaml
+++ b/examples/tensorrt_llm/configs/disagg_router.yaml
@@ -20,20 +20,26 @@ Frontend:
  router: kv

 TensorRTLLMWorker:
+  # Path to disk model or HuggingFace model identifier to load
+  model-path: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  # Name to serve the model under
  served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
-  engine_args: "configs/llm_api_config_router.yaml"
-  llmapi-disaggregated-config: "configs/llmapi_disagg_router_configs/single_node_config.yaml"
+  # Path to a YAML file containing additional keyword arguments to pass to the TRTLLM engine.
+  # Fields in `extra-engine-args` take precedence over the TRTLLM engine fields above.
+  extra-engine-args: "configs/engine_configs/decode_config.yaml"
+  enable-disagg: true
  router: kv
-  remote-prefill: true
-  min-prefill-workers: 1
  ServiceArgs:
    workers: 1
    resources:
      gpu: 1

 TensorRTLLMPrefillWorker:
-  engine_args: "configs/llm_api_config_router.yaml"
-  llmapi-disaggregated-config: "configs/llmapi_disagg_router_configs/single_node_config.yaml"
+  # Path to disk model or HuggingFace model identifier to load
+  model-path: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  # Path to a YAML file containing additional keyword arguments to pass to the TRTLLM engine.
+  # Fields in `extra-engine-args` take precedence over the TRTLLM engine fields above.
+  extra-engine-args: "configs/engine_configs/prefill_config.yaml"
  router: round-robin
  ServiceArgs:
    workers: 1

--- a/examples/tensorrt_llm/configs/llm_api_config.yaml
+++ b/examples/tensorrt_llm/configs/llm_api_config.yaml
@@ -12,15 +12,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-
-# In the case of disaggregated deployment, this config will apply to each server
-# and will be overwritten by the disaggregated config file
-
-# TODO: figure out how to generate this from the service config or vice versa
-
-model_name: "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
-model_path: null
 tensor_parallel_size: 1
 moe_expert_parallel_size: 1
 enable_attention_dp: false

--- a/examples/tensorrt_llm/graphs/disagg_router.py
+++ b/examples/tensorrt_llm/graphs/disagg_router.py
@@ -12,9 +12,16 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+tensor_parallel_size: 1
+moe_expert_parallel_size: 1
+enable_attention_dp: false
+max_num_tokens: 8192
+max_batch_size: 16
+trust_remote_code: true
+backend: pytorch
+enable_chunked_prefill: true
+disable_overlap_scheduler: false
+use_cuda_graph: true
+kv_cache_config:
+  free_gpu_memory_fraction: 0.95

-from components.frontend import Frontend
-from components.prefill_worker import TensorRTLLMPrefillWorker
-from components.worker import TensorRTLLMWorker
-
-Frontend.link(TensorRTLLMWorker).link(TensorRTLLMPrefillWorker)
--- a/examples/tensorrt_llm/graphs/agg_router.py
+++ b/examples/tensorrt_llm/graphs/agg_router.py
@@ -12,8 +12,17 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+tensor_parallel_size: 1
+moe_expert_parallel_size: 1
+enable_attention_dp: false
+max_num_tokens: 8192
+max_batch_size: 16
+trust_remote_code: true
+backend: pytorch
+enable_chunked_prefill: true
+# The overlap scheduler is not currently supported in prefill-only workers.
+disable_overlap_scheduler: true
+use_cuda_graph: false

-from components.frontend import Frontend
-from components.worker import TensorRTLLMWorker
-
-Frontend.link(TensorRTLLMWorker)
+kv_cache_config:
+  free_gpu_memory_fraction: 0.95
--- a/examples/tensorrt_llm/configs/llmapi_disagg_configs/single_node_config.yaml
+++ b/examples/tensorrt_llm/configs/llmapi_disagg_configs/single_node_config.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# This will overwrite the llm_api_config.yaml
-
-# TODO: Specifying the context and generation servers in the config file is
-# bit confusing. Investigate if we can clean this up.
-
-hostname: localhost
-port: 8080
-trust_remote_code: true
-backend: pytorch
-
-context_servers:
-  num_instances: 1
-  tensor_parallel_size: 1
-  max_num_tokens: 10240
-  max_batch_size: 16
-  enable_chunked_prefill: false
-  kv_cache_config:
-    free_gpu_memory_fraction: 0.75
-  # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
-  # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
-  # Overlap scheduler not currently supported in context-only
-  disable_overlap_scheduler: true
-  use_cuda_graph: false
-  urls:
-      - "localhost:8001"
-
-generation_servers:
-  num_instances: 1
-  tensor_parallel_size: 1
-  max_num_tokens: 256
-  max_batch_size: 256
-  kv_cache_config:
-    free_gpu_memory_fraction: 0.75
-  # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
-  # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
-  disable_overlap_scheduler: false
-  use_cuda_graph: false
-  urls:
-      - "localhost:8002"
--- a/examples/tensorrt_llm/configs/llmapi_disagg_router_configs/single_node_config.yaml
+++ b/examples/tensorrt_llm/configs/llmapi_disagg_router_configs/single_node_config.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# This will overwrite the llm_api_config.yaml
-
-# TODO: Specifying the context and generation servers in the config file is
-# bit confusing. Investigate if we can clean this up.
-
-hostname: localhost
-port: 8080
-trust_remote_code: true
-backend: pytorch
-
-context_servers:
-  num_instances: 1
-  tensor_parallel_size: 1
-  max_num_tokens: 10240
-  max_batch_size: 16
-  enable_chunked_prefill: false
-  kv_cache_config:
-    free_gpu_memory_fraction: 0.75
-    event_buffer_max_size: 1024
-    enable_block_reuse: true
-  # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
-  # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
-  # Overlap scheduler not currently supported in context-only
-  disable_overlap_scheduler: true
-  use_cuda_graph: false
-  enable_iter_perf_stats: true
-  urls:
-      - "localhost:8001"
-
-generation_servers:
-  num_instances: 1
-  tensor_parallel_size: 1
-  max_num_tokens: 256
-  max_batch_size: 256
-  kv_cache_config:
-    free_gpu_memory_fraction: 0.75
-    event_buffer_max_size: 1024
-    enable_block_reuse: true
-  # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
-  # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
-  disable_overlap_scheduler: false
-  use_cuda_graph: false
-  enable_iter_perf_stats: true
-  urls:
-      - "localhost:8002"
--- a/examples/tensorrt_llm/engines/trtllm_engine.py
+++ b/examples/tensorrt_llm/engines/trtllm_engine.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-"""
-IMPORTANT:
- This is only supposed to be used by dynamo-run launcher.
- It is part of bring-your-own-engine python feature in dynamo-run.
-"""
-import json
-import os
-import sys
-from pathlib import Path
-
-from tensorrt_llm.logger import logger
-
-from dynamo.runtime import dynamo_endpoint
-
-# Add the project root to the Python path
-project_root = str(Path(__file__).parents[1])  # Go up to llm directory
-if project_root not in sys.path:
-    sys.path.append(project_root)
-
-from common.base_engine import BaseTensorrtLLMEngine, get_sampling_params  # noqa: E402
-from common.chat_processor import ChatProcessorMixin  # noqa: E402
-from common.parser import LLMAPIConfig, parse_dynamo_run_args  # noqa: E402
-from common.protocol import (  # noqa: E402
-    DynamoTRTLLMChatCompletionRequest,
-    DynamoTRTLLMChatCompletionStreamResponse,
-)
-from common.utils import ServerType  # noqa: E402
-
-logger.set_level(os.getenv("DYN_TRTLLM_LOG_LEVEL", "info"))
-
-
-class Processor(ChatProcessorMixin):
-    def __init__(self, engine_config: LLMAPIConfig):
-        super().__init__(engine_config, using_engine_generator=True)
-
-    def preprocess(self, request):
-        return super().preprocess(request)
-
-    def postprocess(self, engine_generator, request, conversation):
-        return super().postprocess(engine_generator, request, conversation)
-
-
-async def chat_generator(engine: BaseTensorrtLLMEngine, request):
-    if engine._llm_engine is None:
-        raise RuntimeError("Engine not initialized")
-
-    logger.debug(f"Received chat request: {request}")
-    preprocessed_request = await engine.processor.chat_processor.preprocess(request)
-    engine_generator = engine._llm_engine.generate_async(
-        inputs=preprocessed_request.prompt,
-        sampling_params=get_sampling_params(preprocessed_request.sampling_params),
-        disaggregated_params=None,
-        streaming=True,
-    )
-    async for raw_response in engine.processor.chat_processor.postprocess(
-        engine_generator, request, preprocessed_request.conversation
-    ):
-        response = DynamoTRTLLMChatCompletionStreamResponse.model_validate_json(
-            raw_response
-        )
-        yield json.loads(response.model_dump_json(exclude_unset=True))
-
-
-class DynamoTRTLLMEngine(BaseTensorrtLLMEngine):
-    """
-    Request handler for the generate endpoint
-    """
-
-    def __init__(self, engine_config: LLMAPIConfig):
-        super().__init__(engine_config=engine_config, server_type=ServerType.DYN_RUN)
-        self.processor = Processor(engine_config)
-        # Initialize the engine
-        self._init_engine()
-
-
-engine = None  # Global variable to store the engine instance. This is initialized in the main function.
-
-
-def init_global_engine(args, engine_config):
-    global engine
-    logger.debug(f"Received args: {args}")
-    logger.info(f"Initializing global engine with engine config: {engine_config}")
-    engine = DynamoTRTLLMEngine(engine_config)
-
-
-@dynamo_endpoint(
-    DynamoTRTLLMChatCompletionRequest, DynamoTRTLLMChatCompletionStreamResponse
-)
-async def generate(request):
-    async for response in chat_generator(engine, request):
-        yield response
-
-
-if __name__ == "__main__":
-    args, engine_config = parse_dynamo_run_args()
-    init_global_engine(args, engine_config)
--- a/tests/serve/test_dynamo_serve.py
+++ b/tests/serve/test_dynamo_serve.py
@@ -199,7 +199,7 @@ deployment_graphs = {
    ),
    "trtllm_agg_router": (
        DeploymentGraph(
-            module="graphs.agg_router:Frontend",
+            module="graphs.agg:Frontend",
            config="configs/agg_router.yaml",
            directory="/workspace/examples/tensorrt_llm",
            endpoints=["v1/chat/completions", "v1/completions"],
@@ -231,7 +231,7 @@ deployment_graphs = {
    ),
    "trtllm_disagg_router": (
        DeploymentGraph(
-            module="graphs.disagg_router:Frontend",
+            module="graphs.disagg:Frontend",
            config="configs/disagg_router.yaml",
            directory="/workspace/examples/tensorrt_llm",
            endpoints=["v1/chat/completions", "v1/completions"],