Commit da38e96a authored by Tanmay Verma, committed by GitHub

feat: TRT-LLM disaggregated serving using UCX (#562)


Signed-off-by: Tanmay Verma <tanmay2592@gmail.com>
Signed-off-by: Tanmay Verma <tanmayv@nvidia.com>
Co-authored-by: Neelay Shah <neelays@nvidia.com>
parent 538b4630
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This will overwrite the llm_api_config.yaml
# TODO: Specifying the context and generation servers in the config file is
# a bit confusing. Investigate if we can clean this up.
hostname: localhost
port: 8080
trust_remote_code: true
backend: pytorch
context_servers:
  num_instances: 1
  tensor_parallel_size: 1
  max_num_tokens: 10240
  max_batch_size: 16
  enable_chunked_prefill: false
  kv_cache_config:
    free_gpu_memory_fraction: 0.40
  pytorch_backend_config:
    enable_overlap_scheduler: false
    use_cuda_graph: false
  urls:
    - "localhost:8001"
generation_servers:
  num_instances: 1
  tensor_parallel_size: 1
  max_num_tokens: 256
  max_batch_size: 256
  kv_cache_config:
    free_gpu_memory_fraction: 0.40
  pytorch_backend_config:
    enable_overlap_scheduler: true
    use_cuda_graph: false
  urls:
    - "localhost:8002"
\ No newline at end of file
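
For context, the config above drives both roles of the disaggregated deployment. A minimal sketch of how a launcher might read it, assuming only PyYAML; the helper name, filename, and printed summary are illustrative, not part of this commit:

# Sketch: load the disaggregated-serving YAML above and inspect the two
# server roles. Only PyYAML is assumed; load_disagg_config is a hypothetical
# helper, not something this commit adds.
import yaml


def load_disagg_config(path: str) -> dict:
    with open(path) as f:
        config = yaml.safe_load(f)
    # Each role carries its own engine settings plus the URLs it serves on.
    for role in ("context_servers", "generation_servers"):
        section = config[role]
        print(f"{role}: {section['num_instances']} instance(s) at {section['urls']}")
    return config


if __name__ == "__main__":
    load_disagg_config("disagg_config.yaml")  # hypothetical filename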
@@ -29,15 +29,13 @@ from tensorrt_llm.logger import logger
 from dynamo.runtime import dynamo_endpoint

 # Add the project root to the Python path
-project_root = str(Path(__file__).parents[1])  # Go up to trtllm directory
+project_root = str(Path(__file__).parents[1])  # Go up to llm directory
 if project_root not in sys.path:
     sys.path.append(project_root)

-from common.base_engine import (  # noqa: E402
-    BaseTensorrtLLMEngine,
-    TensorrtLLMEngineConfig,
-)
-from common.parser import parse_dynamo_run_args  # noqa: E402
+from common.base_engine import BaseTensorrtLLMEngine, get_sampling_params  # noqa: E402
+from common.chat_processor import ChatProcessorMixin  # noqa: E402
+from common.parser import LLMAPIConfig, parse_dynamo_run_args  # noqa: E402
 from common.protocol import (  # noqa: E402
     DynamoTRTLLMChatCompletionRequest,
     DynamoTRTLLMChatCompletionStreamResponse,
@@ -47,21 +45,31 @@ from common.utils import ServerType  # noqa: E402

 logger.set_level(os.getenv("DYN_TRTLLM_LOG_LEVEL", "info"))

+# TODO: support disaggregated as well
+class Processor(ChatProcessorMixin):
+    def __init__(self, engine_config: LLMAPIConfig):
+        super().__init__(engine_config, using_engine_generator=True)
+
+    def preprocess(self, request):
+        return super().preprocess(request)
+
+    def postprocess(self, engine_generator, request, conversation):
+        return super().postprocess(engine_generator, request, conversation)
+

 async def chat_generator(engine: BaseTensorrtLLMEngine, request):
     if engine._llm_engine is None:
         raise RuntimeError("Engine not initialized")

     logger.debug(f"Received chat request: {request}")
-    preprocessed_request = await engine.chat_processor.preprocess(request)
+    preprocessed_request = await engine.processor.chat_processor.preprocess(request)

     engine_generator = engine._llm_engine.generate_async(
         inputs=preprocessed_request.prompt,
-        sampling_params=preprocessed_request.to_sampling_params(),
+        sampling_params=get_sampling_params(preprocessed_request.sampling_params),
         disaggregated_params=None,
         streaming=True,
     )

-    async for raw_response in engine.chat_processor.postprocess(
-        engine_generator, request, preprocessed_request.conversation, ServerType.GEN
+    async for raw_response in engine.processor.chat_processor.postprocess(
+        engine_generator, request, preprocessed_request.conversation
     ):
         response = DynamoTRTLLMChatCompletionStreamResponse.model_validate_json(
             raw_response
@@ -74,9 +82,11 @@ class DynamoTRTLLMEngine(BaseTensorrtLLMEngine):
     Request handler for the generate endpoint
     """

-    def __init__(self, trt_llm_engine_config: TensorrtLLMEngineConfig):
-        super().__init__(trt_llm_engine_config)
-        self.chat_processor.using_engine_generator = True
+    def __init__(self, engine_config: LLMAPIConfig):
+        super().__init__(engine_config=engine_config, server_type=ServerType.DYN_RUN)
+        self.processor = Processor(engine_config)
+        # Initialize the engine
+        self._init_engine()


 engine = None  # Global variable to store the engine instance. This is initialized in the main function.
@@ -86,10 +96,7 @@ def init_global_engine(args, engine_config):
     global engine
     logger.debug(f"Received args: {args}")
     logger.info(f"Initializing global engine with engine config: {engine_config}")
-    trt_llm_engine_config = TensorrtLLMEngineConfig(
-        engine_config=engine_config,
-    )
-    engine = DynamoTRTLLMEngine(trt_llm_engine_config)
+    engine = DynamoTRTLLMEngine(engine_config)


 @dynamo_endpoint(
......
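
Note the API shift in the hunk above: sampling parameters now come from a shared get_sampling_params helper in common/base_engine.py instead of a to_sampling_params() method on the request. The helper's body is not shown in this diff; a rough sketch of the shape such a mapping could take, assuming TRT-LLM's LLM-API SamplingParams class and treating the request-side field names as illustrative:

# Hypothetical sketch only: the real get_sampling_params lives in
# common/base_engine.py and may map more fields. SamplingParams is the
# TRT-LLM LLM-API class; the attribute names read from request_params
# are assumptions based on the OpenAI-style schema used above.
from tensorrt_llm import SamplingParams


def get_sampling_params(request_params) -> SamplingParams:
    return SamplingParams(
        max_tokens=request_params.max_tokens,
        temperature=request_params.temperature,
        top_p=request_params.top_p,
    )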
@@ -13,8 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from components.agg_worker import TensorRTLLMWorker
 from components.frontend import Frontend
 from components.processor import Processor
-from components.worker import TensorRTLLMWorker

 Frontend.link(Processor).link(TensorRTLLMWorker)
@@ -13,9 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from components.agg_worker import TensorRTLLMWorker
 from components.frontend import Frontend
 from components.kv_router import Router
 from components.processor import Processor
-from components.worker import TensorRTLLMWorker

 Frontend.link(Processor).link(Router).link(TensorRTLLMWorker)
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from components.frontend import Frontend
from components.prefill_worker import TensorRTLLMPrefillWorker
from components.processor import Processor
from components.worker import TensorRTLLMWorker

Frontend.link(Processor).link(TensorRTLLMWorker).link(TensorRTLLMPrefillWorker)
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from components.frontend import Frontend
from components.kv_router import Router
from components.prefill_worker import TensorRTLLMPrefillWorker
from components.processor import Processor
from components.worker import TensorRTLLMWorker

Frontend.link(Processor).link(Router).link(TensorRTLLMWorker).link(
    TensorRTLLMPrefillWorker
)
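
The four graph modules above differ only in which components join the pipeline (aggregated vs. disaggregated, with or without the KV router). The link() chaining comes from the Dynamo SDK; purely to illustrate the fluent pattern, not the SDK's implementation, a toy version might look like:

# Toy illustration of the fluent link() chaining used by the graphs above.
# This is NOT the Dynamo SDK implementation, just the shape of the pattern:
# each link() records a downstream edge and returns the target so calls chain
# into a Frontend -> Processor -> ... pipeline.
class Component:
    def __init__(self, name: str):
        self.name = name
        self.downstream: list["Component"] = []

    def link(self, other: "Component") -> "Component":
        self.downstream.append(other)
        return other


Frontend = Component("Frontend")
Processor = Component("Processor")
Router = Component("Router")
Worker = Component("Worker")
PrefillWorker = Component("PrefillWorker")

# Mirrors disagg_router.py: Frontend -> Processor -> Router -> Worker -> PrefillWorker
Frontend.link(Processor).link(Router).link(Worker).link(PrefillWorker)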