Unverified Commit 960ee927 authored by Tanmay Verma's avatar Tanmay Verma Committed by GitHub
Browse files

feat: Update to support completion endpoint in TRTLLM (#837)

parent f0ac8e2b
......@@ -131,18 +131,12 @@ cd /workspace/examples/tensorrt_llm
dynamo serve graphs.disagg:Frontend -f ./configs/disagg.yaml
```
We set TRTLLM_USE_UCX_KVCACHE so that TRT-LLM uses UCX for transferring the KV
cache between the context and generation workers.
#### Disaggregated serving with KV Routing
```bash
cd /workspace/examples/tensorrt_llm
dynamo serve graphs.disagg_router:Frontend -f ./configs/disagg_router.yaml
```
We set TRTLLM_USE_UCX_KVCACHE so that TRT-LLM uses UCX for transferring the KV
cache between the context and generation workers.
#### Multi-Node Disaggregated Serving
In the following example, we will demonstrate how to run a Disaggregated Serving
......
......@@ -19,7 +19,6 @@ from typing import Any, Dict, List, Union
from common.parser import LLMAPIConfig
from common.protocol import (
DisaggregatedTypeConverter,
DynamoTRTLLMChatCompletionResponseStreamChoice,
DynamoTRTLLMChatCompletionStreamResponse,
DynamoTRTLLMCompletionResponseStreamChoice,
......@@ -190,7 +189,7 @@ class ChatProcessor(BaseChatProcessor):
)
if response.outputs[0].disaggregated_params is not None:
# Do not include the disaggregated params in response
# from Processor.
# from processor.
pass
chunk = DynamoTRTLLMChatCompletionStreamResponse(
......@@ -403,11 +402,9 @@ class CompletionsProcessor:
finish_reason=output.finish_reason,
)
if output.disaggregated_params is not None:
choice.disaggregated_params = (
DisaggregatedTypeConverter.to_oai_disaggregated_params(
output.disaggregated_params
)
)
# Block the disagg_params
pass
chunk = DynamoTRTLLMCompletionStreamResponse(
model=self.model,
choices=[choice],
......@@ -429,6 +426,7 @@ class CompletionsProcessor:
return TRTLLMWorkerRequest(
id=request.id,
model=request.model,
prompt=prompt,
sampling_params=asdict(sampling_params),
disaggregated_params=request.disaggregated_params,
......
......@@ -41,7 +41,8 @@ def get_http_binary_path():
class FrontendConfig(BaseModel):
    """Pydantic settings for the HTTP frontend service.

    Validated from the "Frontend" section of the service config (see the
    `ServiceConfig.get_instance()` / `FrontendConfig(**config.get("Frontend", {}))`
    usage later in this file).

    NOTE(review): this span comes from a diff view with +/- markers stripped;
    `endpoint` appears to be the field removed by this commit and
    `endpoint_chat` / `endpoint_completions` the added replacements --
    confirm against the applied commit before relying on all four fields
    coexisting.
    """

    # Model name registered with llmctl (used for both chat and completions).
    served_model_name: str
    endpoint: str
    # Dynamo endpoint serving OpenAI chat completions (e.g. "dynamo.Processor.chat/completions").
    endpoint_chat: str
    # Dynamo endpoint serving OpenAI (non-chat) completions (e.g. "dynamo.Processor.completions").
    endpoint_completions: str
    # HTTP listen port for the frontend.
    port: int = 8080
......@@ -64,6 +65,7 @@ class Frontend:
config = ServiceConfig.get_instance()
frontend_config = FrontendConfig(**config.get("Frontend", {}))
# Chat/completions Endpoint
subprocess.run(
[
"llmctl",
......@@ -80,7 +82,28 @@ class Frontend:
"add",
"chat-models",
frontend_config.served_model_name,
frontend_config.endpoint,
frontend_config.endpoint_chat,
]
)
# Completions Endpoint
subprocess.run(
[
"llmctl",
"http",
"remove",
"completions",
frontend_config.served_model_name,
]
)
subprocess.run(
[
"llmctl",
"http",
"add",
"completions",
frontend_config.served_model_name,
frontend_config.endpoint_completions,
]
)
......
......@@ -19,7 +19,10 @@ import logging
from common.chat_processor import ChatProcessorMixin
from common.parser import parse_tensorrt_llm_args
from common.protocol import DynamoTRTLLMChatCompletionRequest
from common.protocol import (
DynamoTRTLLMChatCompletionRequest,
DynamoTRTLLMCompletionRequest,
)
from common.utils import RequestType
from components.kv_router import Router
from components.worker import TensorRTLLMWorker
......@@ -156,7 +159,7 @@ class Processor(ChatProcessorMixin):
async for response in self._generate(raw_request, RequestType.CHAT):
yield response
# @dynamo_endpoint()
# async def completions(self, raw_request):
# async for response in self._generate(raw_request, RequestType.COMPLETION):
# yield response
@dynamo_endpoint(name="completions")
async def completions(self, raw_request: DynamoTRTLLMCompletionRequest):
    """Serve the OpenAI completions endpoint as an async response stream.

    Delegates to ``self._generate`` with ``RequestType.COMPLETION`` (the
    same pipeline the ``chat`` endpoint uses with ``RequestType.CHAT``) and
    yields each streamed response chunk to the caller as it is produced.
    """
    async for response in self._generate(raw_request, RequestType.COMPLETION):
        yield response
......@@ -15,7 +15,8 @@
Frontend:
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint: dynamo.Processor.chat/completions
endpoint_completions: dynamo.Processor.completions
endpoint_chat: dynamo.Processor.chat/completions
port: 8000
Processor:
......
......@@ -15,7 +15,8 @@
Frontend:
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint: dynamo.Processor.chat/completions
endpoint_completions: dynamo.Processor.completions
endpoint_chat: dynamo.Processor.chat/completions
port: 8000
Processor:
......
......@@ -15,7 +15,8 @@
Frontend:
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint: dynamo.Processor.chat/completions
endpoint_completions: dynamo.Processor.completions
endpoint_chat: dynamo.Processor.chat/completions
port: 8000
Processor:
......
......@@ -15,7 +15,8 @@
Frontend:
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint: dynamo.Processor.chat/completions
endpoint_completions: dynamo.Processor.completions
endpoint_chat: dynamo.Processor.chat/completions
port: 8000
Processor:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment