Unverified Commit 960ee927 authored by Tanmay Verma's avatar Tanmay Verma Committed by GitHub
Browse files

feat: Update to support completion endpoint in TRTLLM (#837)

parent f0ac8e2b
......@@ -131,18 +131,12 @@ cd /workspace/examples/tensorrt_llm
dynamo serve graphs.disagg:Frontend -f ./configs/disagg.yaml
```
We set TRTLLM_USE_UCX_KVCACHE so that TRT-LLM uses UCX for transferring the KV
cache between the context and generation workers.
#### Disaggregated serving with KV Routing
```bash
cd /workspace/examples/tensorrt_llm
dynamo serve graphs.disagg_router:Frontend -f ./configs/disagg_router.yaml
```
We set TRTLLM_USE_UCX_KVCACHE so that TRT-LLM uses UCX for transferring the KV
cache between the context and generation workers.
#### Multi-Node Disaggregated Serving
In the following example, we will demonstrate how to run a Disaggregated Serving
......
......@@ -19,7 +19,6 @@ from typing import Any, Dict, List, Union
from common.parser import LLMAPIConfig
from common.protocol import (
DisaggregatedTypeConverter,
DynamoTRTLLMChatCompletionResponseStreamChoice,
DynamoTRTLLMChatCompletionStreamResponse,
DynamoTRTLLMCompletionResponseStreamChoice,
......@@ -190,7 +189,7 @@ class ChatProcessor(BaseChatProcessor):
)
if response.outputs[0].disaggregated_params is not None:
# Do not include the disaggregated params in response
# from Processor.
# from processor.
pass
chunk = DynamoTRTLLMChatCompletionStreamResponse(
......@@ -403,11 +402,9 @@ class CompletionsProcessor:
finish_reason=output.finish_reason,
)
if output.disaggregated_params is not None:
choice.disaggregated_params = (
DisaggregatedTypeConverter.to_oai_disaggregated_params(
output.disaggregated_params
)
)
# Block the disagg_params
pass
chunk = DynamoTRTLLMCompletionStreamResponse(
model=self.model,
choices=[choice],
......@@ -429,6 +426,7 @@ class CompletionsProcessor:
return TRTLLMWorkerRequest(
id=request.id,
model=request.model,
prompt=prompt,
sampling_params=asdict(sampling_params),
disaggregated_params=request.disaggregated_params,
......
......@@ -41,7 +41,8 @@ def get_http_binary_path():
class FrontendConfig(BaseModel):
    """Pydantic settings for the HTTP frontend service.

    Validated from the "Frontend" section of the service config (see the
    `ServiceConfig.get_instance()` / `FrontendConfig(**config.get("Frontend", {}))`
    usage later in this file).

    NOTE(review): this span comes from a diff view with +/- markers stripped;
    `endpoint` appears to be the field removed by this commit and
    `endpoint_chat` / `endpoint_completions` the added replacements --
    confirm against the applied commit before relying on all four fields
    coexisting.
    """

    # Model name registered with llmctl (used for both chat and completions).
    served_model_name: str
    endpoint: str
    # Dynamo endpoint serving OpenAI chat completions (e.g. "dynamo.Processor.chat/completions").
    endpoint_chat: str
    # Dynamo endpoint serving OpenAI (non-chat) completions (e.g. "dynamo.Processor.completions").
    endpoint_completions: str
    # HTTP listen port for the frontend.
    port: int = 8080
......@@ -64,6 +65,7 @@ class Frontend:
config = ServiceConfig.get_instance()
frontend_config = FrontendConfig(**config.get("Frontend", {}))
# Chat/completions Endpoint
subprocess.run(
[
"llmctl",
......@@ -80,7 +82,28 @@ class Frontend:
"add",
"chat-models",
frontend_config.served_model_name,
frontend_config.endpoint,
frontend_config.endpoint_chat,
]
)
# Completions Endpoint
subprocess.run(
[
"llmctl",
"http",
"remove",
"completions",
frontend_config.served_model_name,
]
)
subprocess.run(
[
"llmctl",
"http",
"add",
"completions",
frontend_config.served_model_name,
frontend_config.endpoint_completions,
]
)
......
......@@ -19,7 +19,10 @@ import logging
from common.chat_processor import ChatProcessorMixin
from common.parser import parse_tensorrt_llm_args
from common.protocol import DynamoTRTLLMChatCompletionRequest
from common.protocol import (
DynamoTRTLLMChatCompletionRequest,
DynamoTRTLLMCompletionRequest,
)
from common.utils import RequestType
from components.kv_router import Router
from components.worker import TensorRTLLMWorker
......@@ -156,7 +159,7 @@ class Processor(ChatProcessorMixin):
async for response in self._generate(raw_request, RequestType.CHAT):
yield response
# @dynamo_endpoint()
# async def completions(self, raw_request):
# async for response in self._generate(raw_request, RequestType.COMPLETION):
# yield response
@dynamo_endpoint(name="completions")
async def completions(self, raw_request: DynamoTRTLLMCompletionRequest):
    """Serve the OpenAI completions endpoint as an async response stream.

    Delegates to ``self._generate`` with ``RequestType.COMPLETION`` (the
    same pipeline the ``chat`` endpoint uses with ``RequestType.CHAT``) and
    yields each streamed response chunk to the caller as it is produced.
    """
    async for response in self._generate(raw_request, RequestType.COMPLETION):
        yield response
......@@ -15,7 +15,8 @@
Frontend:
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint: dynamo.Processor.chat/completions
endpoint_completions: dynamo.Processor.completions
endpoint_chat: dynamo.Processor.chat/completions
port: 8000
Processor:
......
......@@ -15,7 +15,8 @@
Frontend:
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint: dynamo.Processor.chat/completions
endpoint_completions: dynamo.Processor.completions
endpoint_chat: dynamo.Processor.chat/completions
port: 8000
Processor:
......
......@@ -15,7 +15,8 @@
Frontend:
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint: dynamo.Processor.chat/completions
endpoint_completions: dynamo.Processor.completions
endpoint_chat: dynamo.Processor.chat/completions
port: 8000
Processor:
......
......@@ -15,7 +15,8 @@
Frontend:
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint: dynamo.Processor.chat/completions
endpoint_completions: dynamo.Processor.completions
endpoint_chat: dynamo.Processor.chat/completions
port: 8000
Processor:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment