refactor: Refactor the TRTLLM example components and improve UI (#1654)

Signed-off-by: Tanmay Verma <tanmayv@nvidia.com>

refactor: Refactor the TRTLLM example components and improve UI (#1654)
Signed-off-by: Tanmay Verma <tanmayv@nvidia.com>
03d976c7 · Tanmay Verma · GitHub · 8a2d6529 · 03d976c7 · 03d976c7
Unverified Commit 03d976c7 authored Jun 26, 2025 by Tanmay Verma Committed by GitHub Jun 26, 2025
20 changed files
--- a/examples/tensorrt_llm/README.md
+++ b/examples/tensorrt_llm/README.md
@@ -110,7 +110,7 @@ dynamo serve graphs.agg:Frontend -f ./configs/agg.yaml
 #### Aggregated serving with KV Routing
 ```bash
 cd /workspace/examples/tensorrt_llm
-dynamo serve graphs.agg_router:Frontend -f ./configs/agg_router.yaml
+dynamo serve graphs.agg:Frontend -f ./configs/agg_router.yaml
 ```

 #### Disaggregated serving
@@ -122,7 +122,7 @@ dynamo serve graphs.disagg:Frontend -f ./configs/disagg.yaml
 #### Disaggregated serving with KV Routing
 ```bash
 cd /workspace/examples/tensorrt_llm
-dynamo serve graphs.disagg_router:Frontend -f ./configs/disagg_router.yaml
+dynamo serve graphs.disagg:Frontend -f ./configs/disagg_router.yaml
 ```

 #### Aggregated serving with Multi-Token Prediction (MTP) and DeepSeek R1

--- a/examples/tensorrt_llm/common/base_engine.py
+++ b/examples/tensorrt_llm/common/base_engine.py
--- a/examples/tensorrt_llm/common/parser.py
+++ b/examples/tensorrt_llm/common/parser.py
@@ -14,136 +14,28 @@
 # limitations under the License.

 import argparse
-import os
-from dataclasses import dataclass
-from pathlib import Path
-from typing import Any, Dict, Tuple
-
-import yaml
-from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
-from tensorrt_llm.llmapi import KvCacheConfig
-from tensorrt_llm.llmapi.llm_args import DecodingBaseConfig
-
-
-@dataclass
-class LLMAPIConfig:
-    def __init__(
-        self,
-        model_name: str,
-        model_path: str | None = None,
-        pytorch_backend_config: PyTorchConfig | None = None,
-        kv_cache_config: KvCacheConfig | None = None,
-        speculative_config: DecodingBaseConfig | None = None,
-        **kwargs,
-    ):
-        self.model_name = model_name
-        self.model_path = model_path
-        self.pytorch_backend_config = pytorch_backend_config
-        self.kv_cache_config = kv_cache_config
-        self.speculative_config = speculative_config
-        self.extra_args = kwargs
-
-        # Hardcoded to skip tokenizer init for now.
-        # We will handle the tokenization/detokenization
-        # in the base engine.
-        if "skip_tokenizer_init" in self.extra_args:
-            self.extra_args.pop("skip_tokenizer_init")
-        self.skip_tokenizer_init = True
-
-    def to_dict(self) -> Dict[str, Any]:
-        data = {
-            "kv_cache_config": self.kv_cache_config,
-            "speculative_config": self.speculative_config,
-            "skip_tokenizer_init": self.skip_tokenizer_init,
-        }
-        if self.extra_args:
-            data.update(self.extra_args)
-        return data
-
-    def update_sub_configs(self, other_config: Dict[str, Any]):
-        # TODO: Consider removing pytorch_backend_config parsing as this section
-        # was collapsed to top level config fields in recent TRTLLM versions.
-        if "pytorch_backend_config" in other_config:
-            self.pytorch_backend_config = PyTorchConfig(
-                **other_config["pytorch_backend_config"]
-            )
-            self.extra_args.pop("pytorch_backend_config", None)
-
-        if "kv_cache_config" in other_config:
-            self.kv_cache_config = KvCacheConfig(**other_config["kv_cache_config"])
-            self.extra_args.pop("kv_cache_config", None)
-
-        if "speculative_config" in other_config:
-            self.speculative_config = DecodingBaseConfig.from_dict(
-                other_config["speculative_config"]
-            )
-            self.extra_args.pop("speculative_config", None)
-
-
-def _get_llm_args(engine_config):
-    # Only do model validation checks and leave other checks to LLMAPI
-    if "model_name" not in engine_config:
-        raise ValueError("Model name is required in the TRT-LLM engine config.")
-
-    if engine_config.get("model_path", ""):
-        if os.path.exists(engine_config.get("model_path", "")):
-            engine_config["model_path"] = Path(engine_config["model_path"])
-        else:
-            raise ValueError(f"Model path {engine_config['model_path']} does not exist")
-
-    model_name = engine_config["model_name"]
-    model_path = engine_config.get("model_path", None)
-
-    engine_config.pop("model_name")
-    engine_config.pop("model_path", None)
-
-    # Store all other args as kwargs
-    llm_api_config = LLMAPIConfig(
-        model_name=model_name,
-        model_path=model_path,
-        **engine_config,
-    )
-    # Parse supported sub configs and remove from kwargs
-    llm_api_config.update_sub_configs(engine_config)
-
-    return llm_api_config
-
-
-def _init_engine_args(engine_args_filepath):
-    """Initialize engine arguments from config file."""
-    if not os.path.isfile(engine_args_filepath):
-        raise ValueError(
-            "'YAML file containing TRT-LLM engine args must be provided in when launching the worker."
-        )
-
-    try:
-        with open(engine_args_filepath) as file:
-            trtllm_engine_config = yaml.safe_load(file)
-    except yaml.YAMLError as e:
-        raise RuntimeError(f"Failed to parse engine config: {e}")
-
-    return _get_llm_args(trtllm_engine_config)


 def parse_tensorrt_llm_args(
    config_args,
-) -> Tuple[Any, Tuple[Dict[str, Any], Dict[str, Any]]]:
+) -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="A TensorRT-LLM Worker parser")
    parser.add_argument(
-        "--engine_args", type=str, required=True, help="Path to the engine args file"
+        "--extra-engine-args",
+        type=str,
+        default="",
+        help="Path to a YAML file containing additional keyword arguments to pass to the TRTLLM engine.",
    )
    parser.add_argument(
-        "--served_model_name",
+        "--model-path",
        type=str,
-        help="Name of the model to serve",
        default=None,
+        help="Path to disk model or HuggingFace model identifier to load.",
    )
    parser.add_argument(
-        "--llmapi-disaggregated-config",
-        "-c",
+        "--served_model_name",
        type=str,
-        help="Path to the llmapi disaggregated config file",
-        default=None,
+        help="Name to serve the model under.",
    )
    parser.add_argument(
        "--router",
@@ -152,46 +44,19 @@ def parse_tensorrt_llm_args(
        default="random",
        help="Router type to use for scheduling requests to workers",
    )
+
    parser.add_argument(
-        "--min-workers",
-        type=int,
-        default=1,
-        help="Minimum number of workers for aggregated (monolith) server",
-    )
-    parser.add_argument(
-        "--min-prefill-workers",
-        type=int,
-        default=1,
-        help="Minimum number of prefill workers for disaggregated server",
-    )
-    parser.add_argument(
-        "--block-size",
+        "--kv-block-size",
        type=int,
        default=32,
        help="Number of tokens per KV block in TRTLLM worker. Default is 32 for pytorch backend.",
    )
-    parser.add_argument(
-        "--remote-prefill",
-        action="store_true",
-        help="Use remote prefill workers for generation server in Disaggregated mode.",
-    )
-
-    args = parser.parse_args(config_args)
-    return (args, _init_engine_args(args.engine_args))

-
-def parse_dynamo_run_args() -> Tuple[Any, Tuple[Dict[str, Any], Dict[str, Any]]]:
-    parser = argparse.ArgumentParser(
-        description="A TensorRT-LLM Dynamo-run engine parser"
-    )
    parser.add_argument(
-        "--engine_args", type=str, required=True, help="Path to the engine args file"
-    )
-    parser.add_argument(
-        "--publish-kv-cache-events",
+        "--enable-disagg",
        action="store_true",
-        help="Publish KV cache events from TensorRT-LLM. Currently, only supported for context worker in Disaggregated mode.",
+        help="Enable remote prefill for the worker",
    )

-    args, _ = parser.parse_known_args()
-    return (args, _init_engine_args(args.engine_args))
+    args = parser.parse_args(config_args)
+    return args
--- a/examples/tensorrt_llm/common/utils.py
+++ b/examples/tensorrt_llm/common/utils.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import asyncio
-import logging
-import threading
-import traceback
-import weakref
-from enum import Enum
-from queue import Queue
-from typing import Any, Callable, Coroutine, Optional, TypedDict, Union
-
-logger = logging.getLogger(__name__)
-
-AsyncTask = Union[Callable[..., Coroutine[Any, Any, bool]], weakref.WeakMethod]
-
-
-class RoutingStrategy(Enum):
-    ROUND_ROBIN = "round_robin"
-    RANDOM = "random"
-    PREFIX = "prefix"
-
-
-class RequestType(Enum):
-    CHAT = "chat"
-    COMPLETION = "completion"
-
-
-class ServerType(Enum):
-    # Generation server used for disaggregated and aggregated requests
-    GEN = "gen"
-    # Context server used for disaggregated requests
-    CTX = "ctx"
-    # Dynamo run server used for Dynamo run requests
-    DYN_RUN = "dyn_run"
-
-
-class ConversationMessage(TypedDict):
-    role: str
-    content: str
-
-
-class ManagedThread(threading.Thread):
-    def __init__(
-        self,
-        task: Optional[AsyncTask],
-        error_queue: Optional[Queue] = None,
-        name: Optional[str] = None,
-        loop: Optional[asyncio.AbstractEventLoop] = None,
-        **kwargs,
-    ):
-        super().__init__(name=name)
-        self.task = task
-        self.error_queue = error_queue
-        self.kwargs = kwargs
-        self.loop = loop
-        self.daemon = True
-
-        self.stop_event = threading.Event()
-
-    def set_loop(self, loop: asyncio.AbstractEventLoop):
-        self.loop = loop
-
-    def run(self):
-        while not self.stop_event.is_set():
-            task: Optional[AsyncTask] = self.task
-            if isinstance(task, weakref.WeakMethod):
-                task = task()
-                if task is None:
-                    # Normally, this should not happen.
-                    logger.warning("WeakMethod is expired.")
-                    break
-
-            if task is None:
-                break
-
-            try:
-                if self.loop is None:
-                    logger.error("[ManagedThread] Loop not initialized!")
-                    break
-                future = asyncio.run_coroutine_threadsafe(
-                    task(**self.kwargs), self.loop
-                )
-                _ = future.result()
-            except Exception as e:
-                logger.error(
-                    f"Error in thread {self.name}: {e}\n{traceback.format_exc()}"
-                )
-                if self.error_queue is not None:
-                    self.error_queue.put(e)
-
-        logger.info(f"Thread {self.name} stopped.")
-
-    def stop(self):
-        self.stop_event.set()
--- a/examples/tensorrt_llm/components/prefill_worker.py
+++ b/examples/tensorrt_llm/components/prefill_worker.py
@@ -12,15 +12,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import asyncio
 import logging

-from common.base_engine import BaseTensorrtLLMEngine
+from common.base_engine import BaseEngineConfig, BaseTensorrtLLMEngine
 from common.parser import parse_tensorrt_llm_args
 from common.protocol import TRTLLMWorkerRequest
-from common.utils import ServerType

-from dynamo.sdk import async_on_start, dynamo_context, endpoint, service
+from dynamo.sdk import async_on_start, dynamo_context, endpoint, on_shutdown, service
 from dynamo.sdk.lib.config import ServiceConfig

 logger = logging.getLogger(__name__)
@@ -39,34 +37,37 @@ class TensorRTLLMPrefillWorker(BaseTensorrtLLMEngine):
        class_name = self.__class__.__name__
        config = ServiceConfig.get_instance()
        config_args = config.as_args(class_name, prefix="")
-        args, engine_config = parse_tensorrt_llm_args(config_args)
-        worker_id = dynamo_context["endpoints"][0].lease_id()
-        super().__init__(
-            namespace_str="dynamo",
-            component_str=class_name,
-            worker_id=worker_id,
-            engine_config=engine_config,
-            remote_prefill=args.remote_prefill,
-            min_workers=args.min_workers,
-            disagg_config_file=args.llmapi_disaggregated_config,
-            block_size=args.block_size,
-            router=args.router,
-            server_type=ServerType.CTX,
+        args = parse_tensorrt_llm_args(config_args)
+        lease_id = dynamo_context["endpoints"][0].lease_id()
+        namespace, _ = TensorRTLLMPrefillWorker.dynamo_address()  # type: ignore
+
+        engine_config = BaseEngineConfig(
+            namespace=namespace,
+            component=class_name,
+            endpoint="generate",
+            model_path=args.model_path,
+            served_model_name=args.served_model_name,
+            kv_block_size=args.kv_block_size,
+            extra_engine_args=args.extra_engine_args,
+            publish_events_and_metrics=False,
+            disaggregation_mode="prefill",
+            remote_prefill_endpoint=None,
+            lease_id=lease_id,
        )

+        super().__init__(config=engine_config)
+
    @async_on_start
    async def async_init(self):
-        self._init_engine()
-        if self._kv_metrics_publisher is not None:
-            task = asyncio.create_task(self.create_metrics_publisher_endpoint())
-            task.add_done_callback(
-                lambda _: logger.info("metrics publisher endpoint created")
-            )
+        runtime = dynamo_context["runtime"]
+        await self.initialize(runtime)
        logger.info("TensorRT-LLM Prefill Worker initialized")

-    async def create_metrics_publisher_endpoint(self):
-        component = dynamo_context["component"]
-        await self.kv_metrics_publisher.create_endpoint(component)
+    @on_shutdown
+    async def async_cleanup(self):
+        logger.info("Cleaning up TensorRT-LLM Prefill Worker")
+        await self.cleanup()
+        logger.info("TensorRT-LLM Prefill Worker cleanup completed")

    @endpoint()
    async def generate(self, request: TRTLLMWorkerRequest):

--- a/examples/tensorrt_llm/components/worker.py
+++ b/examples/tensorrt_llm/components/worker.py
@@ -12,17 +12,22 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import asyncio
 import logging

-from common.base_engine import BaseTensorrtLLMEngine
+from common.base_engine import BaseEngineConfig, BaseTensorrtLLMEngine
 from common.parser import parse_tensorrt_llm_args
 from common.protocol import TRTLLMWorkerRequest
-from common.utils import ServerType
 from components.prefill_worker import TensorRTLLMPrefillWorker

 from dynamo.llm import ModelType, register_llm
-from dynamo.sdk import async_on_start, depends, dynamo_context, endpoint, service
+from dynamo.sdk import (
+    async_on_start,
+    depends,
+    dynamo_context,
+    endpoint,
+    on_shutdown,
+    service,
+)
 from dynamo.sdk.lib.config import ServiceConfig

 logger = logging.getLogger(__name__)
@@ -43,74 +48,66 @@ class TensorRTLLMWorker(BaseTensorrtLLMEngine):
        class_name = self.__class__.__name__
        config = ServiceConfig.get_instance()
        config_args = config.as_args(class_name, prefix="")
-        args, engine_config = parse_tensorrt_llm_args(config_args)
-        self.served_model_name = args.served_model_name
-        worker_id = dynamo_context["endpoints"][0].lease_id()
+        args = parse_tensorrt_llm_args(config_args)
+        lease_id = dynamo_context["endpoints"][0].lease_id()
        namespace, _ = TensorRTLLMWorker.dynamo_address()  # type: ignore
-        self._min_prefill_workers = args.min_prefill_workers
-        super().__init__(
-            namespace_str=namespace,
-            component_str=class_name,
-            worker_id=worker_id,
-            engine_config=engine_config,
-            remote_prefill=args.remote_prefill,
-            min_workers=args.min_workers,
-            disagg_config_file=args.llmapi_disaggregated_config,
-            block_size=args.block_size,
-            router=args.router,
-            server_type=ServerType.GEN,
+        endpoint_name = "generate"
+        publish_events_and_metrics = args.router == "kv"
+        prefill_class_name = "TensorRTLLMPrefillWorker"
+
+        if args.enable_disagg:
+            disaggregation_mode = "decode"
+        else:
+            disaggregation_mode = "prefill_and_decode"
+
+        engine_config = BaseEngineConfig(
+            namespace=namespace,
+            component=class_name,
+            endpoint=endpoint_name,
+            model_path=args.model_path,
+            served_model_name=args.served_model_name,
+            kv_block_size=args.kv_block_size,
+            extra_engine_args=args.extra_engine_args,
+            publish_events_and_metrics=publish_events_and_metrics,
+            disaggregation_mode=disaggregation_mode,
+            remote_prefill_endpoint=f"dyn://{namespace}.{prefill_class_name}.generate",
+            lease_id=lease_id,
        )

+        super().__init__(config=engine_config)
+
    @async_on_start
    async def async_init(self):
-        self._init_engine()
-
        runtime = dynamo_context["runtime"]
+        await self.initialize(runtime)
+
        logger.info("Registering LLM for discovery")
-        comp_ns, comp_name = TensorRTLLMWorker.dynamo_address()  # type: ignore
-        endpoint = runtime.namespace(comp_ns).component(comp_name).endpoint("generate")
+        endpoint = (
+            runtime.namespace(self._config.namespace)
+            .component(self._config.component)
+            .endpoint(self._config.endpoint)
+        )

        try:
            await register_llm(
                ModelType.Backend,
                endpoint,
-                self._engine_config.model_name,
-                self.served_model_name,
-                kv_cache_block_size=self._kv_block_size,
+                self._config.model_path,
+                self._config.served_model_name,
+                kv_cache_block_size=self._config.kv_block_size,
            )
            logger.info("Successfully registered LLM for discovery")
        except Exception as e:
            logger.error(f"Failed to register LLM for discovery: {e}")
            raise

-        if self._remote_prefill:
-            runtime = dynamo_context["runtime"]
-            comp_ns, comp_name = TensorRTLLMPrefillWorker.dynamo_address()  # type: ignore
-            self._prefill_client = (
-                await runtime.namespace(comp_ns)
-                .component(comp_name)
-                .endpoint("generate")
-                .client()
-            )
-            while len(self._prefill_client.instance_ids()) < self._min_prefill_workers:
-                logger.info(
-                    f"Waiting for prefill workers to be ready.\n"
-                    f" Current: {len(self._prefill_client.instance_ids())},"
-                    f" Required: {self._min_prefill_workers}"
-                )
-                await asyncio.sleep(30)
-
-        if self._kv_metrics_publisher is not None:
-            task = asyncio.create_task(self.create_metrics_publisher_endpoint())
-            task.add_done_callback(
-                lambda _: logger.info("metrics publisher endpoint created")
-            )
-
        logger.info("TensorRT-LLM Worker initialized")

-    async def create_metrics_publisher_endpoint(self):
-        component = dynamo_context["component"]
-        await self._kv_metrics_publisher.create_endpoint(component)
+    @on_shutdown
+    async def async_cleanup(self):
+        logger.info("Cleaning up TensorRT-LLM Worker")
+        await self.cleanup()
+        logger.info("TensorRT-LLM Worker cleanup completed")

    @endpoint()
    async def generate(self, request: TRTLLMWorkerRequest):

--- a/examples/tensorrt_llm/configs/agg.yaml
+++ b/examples/tensorrt_llm/configs/agg.yaml
@@ -20,8 +20,13 @@ Frontend:
  router: round-robin

 TensorRTLLMWorker:
+  # Path to disk model or HuggingFace model identifier to load
+  model-path: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  # Name to serve the model under
  served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
-  engine_args: "configs/llm_api_config.yaml"
+  # Path to a YAML file containing additional keyword arguments to pass to the TRTLLM engine.
+  # The fields in `extra-engine-args` take priority over the TRTLLM engine fields above.
+  extra-engine-args: "configs/engine_configs/agg_config.yaml"
  router: round-robin
  ServiceArgs:
    workers: 1

--- a/examples/tensorrt_llm/configs/agg_router.yaml
+++ b/examples/tensorrt_llm/configs/agg_router.yaml
@@ -20,9 +20,15 @@ Frontend:
  router: kv

 TensorRTLLMWorker:
-  engine_args: "configs/llm_api_config_router.yaml"
+  # Path to disk model or HuggingFace model identifier to load
+  model-path: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  # Name to serve the model under
+  served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  # Path to a YAML file containing additional keyword arguments to pass to the TRTLLM engine.
+  # The fields in `extra-engine-args` take priority over the TRTLLM engine fields above.
+  extra-engine-args: "configs/engine_configs/agg_config.yaml"
  router: kv
  ServiceArgs:
    workers: 1
    resources:
-      gpu: 1
+      gpu: 1
\ No newline at end of file
--- a/examples/tensorrt_llm/configs/deepseek_r1/agg.yaml
+++ b/examples/tensorrt_llm/configs/deepseek_r1/agg.yaml
@@ -22,7 +22,12 @@ Frontend:

 TensorRTLLMWorker:
  served_model_name: "nvidia/DeepSeek-R1-FP4"
-  engine_args: "configs/deepseek_r1/agg_llm_api_config.yaml"
+  # NOTE: FP4 only supported starting with Blackwell GPUs.
+  # https://huggingface.co/nvidia/DeepSeek-R1-FP4
+  # You can also specify the full path to locally downloaded weights
+  # instead of a HuggingFace ID here.
+  model-path: "nvidia/DeepSeek-R1-FP4"
+  extra-engine-args: "configs/deepseek_r1/engine_configs/agg_config.yaml"
  router: round-robin
  ServiceArgs:
    workers: 1

--- a/examples/tensorrt_llm/configs/deepseek_r1/disagg.yaml
+++ b/examples/tensorrt_llm/configs/deepseek_r1/disagg.yaml
@@ -22,14 +22,13 @@ Frontend:

 TensorRTLLMWorker:
  served_model_name: "nvidia/DeepSeek-R1-FP4"
-  engine_args: "configs/deepseek_r1/agg_llm_api_config.yaml"
-  llmapi-disaggregated-config: "configs/deepseek_r1/disagg_llm_api_config.yaml"
-  remote-prefill: true
-  # NOTE: When testing/benchmarking multiple prefill workers, you can set
-  # this number to the exact amount of prefill workers if you want Dynamo to
-  # wait until all the prefill workers are ready before marking the decode
-  # worker ready.
-  min-prefill-workers: 1
+  # NOTE: FP4 only supported starting with Blackwell GPUs.
+  # https://huggingface.co/nvidia/DeepSeek-R1-FP4
+  # You can also specify the full path to locally downloaded weights
+  # instead of a HuggingFace ID here.
+  model-path: "nvidia/DeepSeek-R1-FP4"
+  extra-engine-args: "configs/deepseek_r1/engine_configs/decode_config.yaml"
+  enable-disagg: true
  router: round-robin
  ServiceArgs:
    workers: 1
@@ -37,8 +36,12 @@ TensorRTLLMWorker:
      gpu: 4

 TensorRTLLMPrefillWorker:
-  engine_args: "configs/deepseek_r1/agg_llm_api_config.yaml"
-  llmapi-disaggregated-config: "configs/deepseek_r1/disagg_llm_api_config.yaml"
+  # NOTE: FP4 only supported starting with Blackwell GPUs.
+  # https://huggingface.co/nvidia/DeepSeek-R1-FP4
+  # You can also specify the full path to locally downloaded weights
+  # instead of a HuggingFace ID here.
+  model-path: "nvidia/DeepSeek-R1-FP4"
+  extra-engine-args: "configs/deepseek_r1/engine_configs/prefill_config.yaml"
  router: round-robin
  ServiceArgs:
    workers: 1

--- a/examples/tensorrt_llm/configs/deepseek_r1/disagg_llm_api_config.yaml
+++ b/examples/tensorrt_llm/configs/deepseek_r1/disagg_llm_api_config.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Example Configs for Context & Generation on GB200 nodes
-# - Context on 1xGB200 (4xB00)
-# - Generation on 1xGB200 (4xB200)
-
-# NOTE: Fields like hostname, ports, urls, num_instances, etc. only used by trtllm-serve, not by dynamo
-
-backend: pytorch
-
-context_servers:
-  # Context/prefill processes many tokens at once, so for a large ISL, a large
-  # batch size may not be needed to saturate GPU utilization.
-  max_batch_size: 1
-  max_num_tokens: 8192
-  max_seq_len: 8192
-
-  # TP/EP/PP/DP
-  tensor_parallel_size: 4
-  moe_expert_parallel_size: 4
-  pipeline_parallel_size: 1
-  enable_attention_dp: true
-
-  kv_cache_config:
-    free_gpu_memory_fraction: 0.75
-
-  # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
-  # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
-  # Overlap scheduler not currently supported in context-only
-  disable_overlap_scheduler: true
-  print_iter_log: true
-  # NOTE: This dtype must match in both context/generation configs
-  kv_cache_dtype: fp8
-
-generation_servers:
-  # Generation/decode processes one token per request at a time, so a larger
-  # batch size helps to saturate GPU utilization.
-  max_batch_size: 256
-  max_num_tokens: 256
-  # 8448 = 8192 ISL + 256 OSL
-  max_seq_len: 8448
-
-  # TP/EP/PP/DP
-  tensor_parallel_size: 4
-  moe_expert_parallel_size: 4
-  pipeline_parallel_size: 1
-  enable_attention_dp: false
-
-  kv_cache_config:
-    # With dp attention disabled: high free_gpu_memory_fraction is fine.
-    free_gpu_memory_fraction: 0.85
-    # With dp attention enabled: large ISL at high concurrency may need
-    # free_gpu_memory_fraction low to have enough available memory.
-    # free_gpu_memory_fraction: 0.30
-
-  # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
-  # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
-  disable_overlap_scheduler: false
-  use_cuda_graph: true
-  cuda_graph_padding_enabled: true
-  # NOTE: For larger max batch size, you may want to add larger cuda graph
-  # batch sizes below to match.
-  cuda_graph_batch_sizes:
-  - 1
-  - 2
-  - 4
-  - 8
-  - 16
-  - 32
-  - 64
-  - 128
-  - 256
-  print_iter_log: true
-  # NOTE: This dtype must match in both context/generation configs
-  kv_cache_dtype: fp8
--- a/examples/tensorrt_llm/configs/deepseek_r1/agg_llm_api_config.yaml
+++ b/examples/tensorrt_llm/configs/deepseek_r1/agg_llm_api_config.yaml
@@ -12,12 +12,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-# NOTE: FP4 only supported starting with Blackwell GPUs.
-# https://huggingface.co/nvidia/DeepSeek-R1-FP4
-# You can also specify the full path to locally downloaded weights
-# instead of a HuggingFace ID here.
-model_name: "nvidia/DeepSeek-R1-FP4"
 backend: pytorch

 # TP/EP/PP/DP

--- a/examples/tensorrt_llm/configs/llm_api_config_router.yaml
+++ b/examples/tensorrt_llm/configs/llm_api_config_router.yaml
@@ -12,32 +12,44 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+backend: pytorch

-
-# In the case of disaggregated deployment, this config will apply to each server
-# and will be overwritten by the disaggregated config file
-
-# TODO: figure out how to generate this from the service config or vice versa
-
-model_name: "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
-model_path: null
-tensor_parallel_size: 1
-moe_expert_parallel_size: 1
+# TP/EP/PP/DP
+tensor_parallel_size: 4
+moe_expert_parallel_size: 4
+pipeline_parallel_size: 1
 enable_attention_dp: false
-max_num_tokens: 8192
-max_batch_size: 16
-trust_remote_code: true
-backend: pytorch
-enable_chunked_prefill: true
+
+max_batch_size: 256
+max_num_tokens: 256
+# 8448 = 8192 ISL + 256 OSL
+max_seq_len: 8448

 kv_cache_config:
-  free_gpu_memory_fraction: 0.95
-  event_buffer_max_size: 1024
-  enable_block_reuse: true
+  # With dp attention disabled: high free_gpu_memory_fraction is fine.
+  free_gpu_memory_fraction: 0.85
+  # With dp attention enabled: large ISL at high concurrency may need
+  # free_gpu_memory_fraction low to have enough available memory.
+  # free_gpu_memory_fraction: 0.30

 # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
 # NOTE: overlap_scheduler enabled by default since this commit and changed
 # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
 # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
+disable_overlap_scheduler: false
 use_cuda_graph: true
-enable_iter_perf_stats: true
+cuda_graph_padding_enabled: true
+# NOTE: For larger max batch size, you may want to add larger cuda graph
+# batch sizes below to match.
+cuda_graph_batch_sizes:
+- 1
+- 2
+- 4
+- 8
+- 16
+- 32
+- 64
+- 128
+- 256
+print_iter_log: true
+kv_cache_dtype: fp8
--- a/examples/tensorrt_llm/configs/llm_api_config_disagg_router.yaml
+++ b/examples/tensorrt_llm/configs/llm_api_config_disagg_router.yaml
@@ -12,32 +12,30 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+backend: pytorch

+# TP/EP/PP/DP
+tensor_parallel_size: 4
+moe_expert_parallel_size: 4
+pipeline_parallel_size: 1
+enable_attention_dp: true

-# In the case of disaggregated deployment, this config will apply to each server
-# and will be overwritten by the disaggregated config file
-
-# TODO: figure out how to generate this from the service config or vice versa
-
-model_name: "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
-model_path: null
-tensor_parallel_size: 1
-moe_expert_parallel_size: 1
-enable_attention_dp: false
+max_batch_size: 1
 max_num_tokens: 8192
-max_batch_size: 16
-trust_remote_code: true
-backend: pytorch
-enable_chunked_prefill: true
+max_seq_len: 8192

 kv_cache_config:
-  free_gpu_memory_fraction: 0.95
-  event_buffer_max_size: 1024
-  enable_block_reuse: true
+  # With dp attention disabled: high free_gpu_memory_fraction is fine.
+  free_gpu_memory_fraction: 0.75
+  # With dp attention enabled: large ISL at high concurrency may need
+  # free_gpu_memory_fraction low to have enough available memory.
+  # free_gpu_memory_fraction: 0.30

 # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
 # NOTE: overlap_scheduler enabled by default since this commit and changed
 # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
 # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
-use_cuda_graph: true
-enable_iter_perf_stats: true
+disable_overlap_scheduler: true
+print_iter_log: true
+# NOTE: This dtype must match in both prefill/decode configs
+kv_cache_dtype: fp8
--- a/examples/tensorrt_llm/configs/deepseek_r1/mtp/mtp_agg_llm_api_config.yaml
+++ b/examples/tensorrt_llm/configs/deepseek_r1/mtp/mtp_agg_llm_api_config.yaml
@@ -18,7 +18,6 @@
 # You can also specify the full path to locally downloaded weights
 # instead of a HuggingFace ID here.

-model_name: "nvidia/DeepSeek-R1-FP4"
 backend: pytorch
 tensor_parallel_size: 4
 moe_expert_parallel_size: 4

--- a/examples/tensorrt_llm/configs/deepseek_r1/mtp/engine_configs/decode_config.yaml
+++ b/examples/tensorrt_llm/configs/deepseek_r1/mtp/engine_configs/decode_config.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# NOTE: FP4 only supported starting with Blackwell GPUs.
+# https://huggingface.co/nvidia/DeepSeek-R1-FP4
+# You can also specify the full path to locally downloaded weights
+# instead of a HuggingFace ID here.
+
+backend: pytorch
+tensor_parallel_size: 4
+moe_expert_parallel_size: 4
+enable_attention_dp: false
+max_batch_size: 256
+# Note: When MTP is enabled and `cuda_graph_batch_sizes` is specified, `max_num_tokens` must satisfy the following formula:
+# max_num_tokens >= max(cuda_graph_batch_sizes) * (num_nextn_predict_layers + 1)
+# This is a known issue in TensorRT-LLM and will be resolved in the next release.
+max_num_tokens: 512
+# 8704 = 8192 ISL + 512 OSL
+max_seq_len: 8704
+kv_cache_config:
+  free_gpu_memory_fraction: 0.85
+
+# Enable MTP (Multi-Token Prediction) in the decode model engine
+speculative_config:
+  decoding_type: MTP
+  num_nextn_predict_layers: 1
+
+use_cuda_graph: true
+cuda_graph_padding_enabled: true
+cuda_graph_batch_sizes:
+- 1
+- 2
+- 4
+- 8
+- 16
+- 32
+- 64
+- 128
+- 256
+print_iter_log: true
+kv_cache_dtype: fp8
--- a/examples/tensorrt_llm/configs/deepseek_r1/mtp/engine_configs/prefill_config.yaml
+++ b/examples/tensorrt_llm/configs/deepseek_r1/mtp/engine_configs/prefill_config.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# NOTE: FP4 only supported starting with Blackwell GPUs.
+# https://huggingface.co/nvidia/DeepSeek-R1-FP4
+# You can also specify the full path to locally downloaded weights
+# instead of a HuggingFace ID here.
+
+backend: pytorch
+tensor_parallel_size: 4
+moe_expert_parallel_size: 4
+enable_attention_dp: true
+max_batch_size: 1
+max_num_tokens: 8192
+max_seq_len: 8192
+kv_cache_config:
+  free_gpu_memory_fraction: 0.75
+print_iter_log: true
+kv_cache_dtype: fp8
+disable_overlap_scheduler: true
+
+# Enable MTP (Multi-Token Prediction) in the prefill model engine
+speculative_config:
+  decoding_type: MTP
+  num_nextn_predict_layers: 1
--- a/examples/tensorrt_llm/configs/deepseek_r1/mtp/mtp_agg.yaml
+++ b/examples/tensorrt_llm/configs/deepseek_r1/mtp/mtp_agg.yaml
@@ -21,7 +21,14 @@ Frontend:

 TensorRTLLMWorker:
  served_model_name: "nvidia/DeepSeek-R1-FP4"
-  engine_args: "configs/deepseek_r1/mtp/mtp_agg_llm_api_config.yaml"
+  # NOTE: FP4 only supported starting with Blackwell GPUs.
+  # https://huggingface.co/nvidia/DeepSeek-R1-FP4
+  # You can also specify the full path to locally downloaded weights
+  # instead of a HuggingFace ID here.
+  model-path: "nvidia/DeepSeek-R1-FP4"
+  # Path to a YAML file containing additional keyword arguments to pass to the TRTLLM engine.
+  # Fields in `extra-engine-args` take priority over the TRTLLM engine fields above.
+  extra-engine-args: "configs/deepseek_r1/mtp/engine_configs/agg_config.yaml"
  router: round-robin
  ServiceArgs:
    workers: 1

--- a/examples/tensorrt_llm/configs/deepseek_r1/mtp/mtp_disagg.yaml
+++ b/examples/tensorrt_llm/configs/deepseek_r1/mtp/mtp_disagg.yaml
@@ -21,19 +21,30 @@ Frontend:

 TensorRTLLMWorker:
  served_model_name: "nvidia/DeepSeek-R1-FP4"
-  engine_args: "configs/deepseek_r1/agg_llm_api_config.yaml"
-  llmapi-disaggregated-config: "configs/deepseek_r1/mtp/mtp_disagg_llm_api_config.yaml"
+  # NOTE: FP4 only supported starting with Blackwell GPUs.
+  # https://huggingface.co/nvidia/DeepSeek-R1-FP4
+  # You can also specify the full path to locally downloaded weights
+  # instead of a HuggingFace ID here.
+  model-path: "nvidia/DeepSeek-R1-FP4"
+  # Path to a YAML file containing additional keyword arguments to pass to the TRTLLM engine.
+  # Fields in `extra-engine-args` take priority over the TRTLLM engine fields above.
+  extra-engine-args: "configs/deepseek_r1/mtp/engine_configs/decode_config.yaml"
  router: round-robin
-  remote-prefill: true
-  min-prefill-workers: 1
+  enable-disagg: true
  ServiceArgs:
    workers: 1
    resources:
      gpu: 4

 TensorRTLLMPrefillWorker:
-  engine_args: "configs/deepseek_r1/agg_llm_api_config.yaml"
-  llmapi-disaggregated-config: "configs/deepseek_r1/mtp/mtp_disagg_llm_api_config.yaml"
+  # NOTE: FP4 only supported starting with Blackwell GPUs.
+  # https://huggingface.co/nvidia/DeepSeek-R1-FP4
+  # You can also specify the full path to locally downloaded weights
+  # instead of a HuggingFace ID here.
+  model-path: "nvidia/DeepSeek-R1-FP4"
+  # Path to a YAML file containing additional keyword arguments to pass to the TRTLLM engine.
+  # Fields in `extra-engine-args` take priority over the TRTLLM engine fields above.
+  extra-engine-args: "configs/deepseek_r1/mtp/engine_configs/prefill_config.yaml"
  router: round-robin
  ServiceArgs:
    workers: 1

--- a/examples/tensorrt_llm/configs/deepseek_r1/mtp/mtp_disagg_llm_api_config.yaml
+++ b/examples/tensorrt_llm/configs/deepseek_r1/mtp/mtp_disagg_llm_api_config.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# NOTE: FP4 only supported starting with Blackwell GPUs.
-# https://huggingface.co/nvidia/DeepSeek-R1-FP4
-# You can also specify the full path to locally downloaded weights
-# instead of a HuggingFace ID here.
-
-backend: pytorch
-
-context_servers:
-  num_instances: 1
-  tensor_parallel_size: 4
-  moe_expert_parallel_size: 4
-  enable_attention_dp: true
-  max_batch_size: 1
-  max_num_tokens: 8192
-  max_seq_len: 8192
-  kv_cache_config:
-    free_gpu_memory_fraction: 0.75
-  print_iter_log: true
-  kv_cache_dtype: fp8
-  disable_overlap_scheduler: true
-  # Enable the MTP(Multi-Token Prediction) in the prefill model engine
-  speculative_config:
-    decoding_type: MTP
-    num_nextn_predict_layers: 1
-
-generation_servers:
-  num_instances: 1
-  tensor_parallel_size: 4
-  moe_expert_parallel_size: 4
-  enable_attention_dp: false
-  max_batch_size: 256
-  # Note: When MPT is enabled and `cuda_graph_batch_sizes` is specified, `max_num_tokens` must satisfy the following formula:
-  # max_num_tokens >= max(cuda_graph_batch_sizes) * (num_nextn_predict_layers + 1)
-  # This is a known issue in TensorRT-LLM and will be resolved in the next release.
-  max_num_tokens: 512
-  # 8704 = 8192 ISL + 512 OSL
-  max_seq_len: 8704
-  kv_cache_config:
-    free_gpu_memory_fraction: 0.85
-  # Enable the MTP(Multi-Token Prediction) in the decode model engine
-  speculative_config:
-    decoding_type: MTP
-    num_nextn_predict_layers: 1
-  use_cuda_graph: true
-  cuda_graph_padding_enabled: true
-  cuda_graph_batch_sizes:
-  - 1
-  - 2
-  - 4
-  - 8
-  - 16
-  - 32
-  - 64
-  - 128
-  - 256
-  print_iter_log: true
-  kv_cache_dtype: fp8