"vscode:/vscode.git/clone" did not exist on "c871ccf3398ca7719c51db0819a831ec1063607e"
Unverified Commit 960ee927 authored by Tanmay Verma's avatar Tanmay Verma Committed by GitHub
Browse files

feat: Update to support completion endpoint in TRTLLM (#837)

parent f0ac8e2b
...@@ -131,18 +131,12 @@ cd /workspace/examples/tensorrt_llm ...@@ -131,18 +131,12 @@ cd /workspace/examples/tensorrt_llm
dynamo serve graphs.disagg:Frontend -f ./configs/disagg.yaml dynamo serve graphs.disagg:Frontend -f ./configs/disagg.yaml
``` ```
We are defining TRTLLM_USE_UCX_KVCACHE so that TRTLLM uses UCX for transferring the KV
cache between the context and generation workers.
#### Disaggregated serving with KV Routing #### Disaggregated serving with KV Routing
```bash ```bash
cd /workspace/examples/tensorrt_llm cd /workspace/examples/tensorrt_llm
dynamo serve graphs.disagg_router:Frontend -f ./configs/disagg_router.yaml dynamo serve graphs.disagg_router:Frontend -f ./configs/disagg_router.yaml
``` ```
We are defining TRTLLM_USE_UCX_KVCACHE so that TRTLLM uses UCX for transferring the KV
cache between the context and generation workers.
#### Multi-Node Disaggregated Serving #### Multi-Node Disaggregated Serving
In the following example, we will demonstrate how to run a Disaggregated Serving In the following example, we will demonstrate how to run a Disaggregated Serving
......
...@@ -19,7 +19,6 @@ from typing import Any, Dict, List, Union ...@@ -19,7 +19,6 @@ from typing import Any, Dict, List, Union
from common.parser import LLMAPIConfig from common.parser import LLMAPIConfig
from common.protocol import ( from common.protocol import (
DisaggregatedTypeConverter,
DynamoTRTLLMChatCompletionResponseStreamChoice, DynamoTRTLLMChatCompletionResponseStreamChoice,
DynamoTRTLLMChatCompletionStreamResponse, DynamoTRTLLMChatCompletionStreamResponse,
DynamoTRTLLMCompletionResponseStreamChoice, DynamoTRTLLMCompletionResponseStreamChoice,
...@@ -190,7 +189,7 @@ class ChatProcessor(BaseChatProcessor): ...@@ -190,7 +189,7 @@ class ChatProcessor(BaseChatProcessor):
) )
if response.outputs[0].disaggregated_params is not None: if response.outputs[0].disaggregated_params is not None:
# Do not include the disaggregated params in response # Do not include the disaggregated params in response
# from Processor. # from processor.
pass pass
chunk = DynamoTRTLLMChatCompletionStreamResponse( chunk = DynamoTRTLLMChatCompletionStreamResponse(
...@@ -403,11 +402,9 @@ class CompletionsProcessor: ...@@ -403,11 +402,9 @@ class CompletionsProcessor:
finish_reason=output.finish_reason, finish_reason=output.finish_reason,
) )
if output.disaggregated_params is not None: if output.disaggregated_params is not None:
choice.disaggregated_params = ( # Block the disagg_params
DisaggregatedTypeConverter.to_oai_disaggregated_params( pass
output.disaggregated_params
)
)
chunk = DynamoTRTLLMCompletionStreamResponse( chunk = DynamoTRTLLMCompletionStreamResponse(
model=self.model, model=self.model,
choices=[choice], choices=[choice],
...@@ -429,6 +426,7 @@ class CompletionsProcessor: ...@@ -429,6 +426,7 @@ class CompletionsProcessor:
return TRTLLMWorkerRequest( return TRTLLMWorkerRequest(
id=request.id, id=request.id,
model=request.model,
prompt=prompt, prompt=prompt,
sampling_params=asdict(sampling_params), sampling_params=asdict(sampling_params),
disaggregated_params=request.disaggregated_params, disaggregated_params=request.disaggregated_params,
......
...@@ -41,7 +41,8 @@ def get_http_binary_path(): ...@@ -41,7 +41,8 @@ def get_http_binary_path():
class FrontendConfig(BaseModel): class FrontendConfig(BaseModel):
served_model_name: str served_model_name: str
endpoint: str endpoint_chat: str
endpoint_completions: str
port: int = 8080 port: int = 8080
...@@ -64,6 +65,7 @@ class Frontend: ...@@ -64,6 +65,7 @@ class Frontend:
config = ServiceConfig.get_instance() config = ServiceConfig.get_instance()
frontend_config = FrontendConfig(**config.get("Frontend", {})) frontend_config = FrontendConfig(**config.get("Frontend", {}))
# Chat/completions Endpoint
subprocess.run( subprocess.run(
[ [
"llmctl", "llmctl",
...@@ -80,7 +82,28 @@ class Frontend: ...@@ -80,7 +82,28 @@ class Frontend:
"add", "add",
"chat-models", "chat-models",
frontend_config.served_model_name, frontend_config.served_model_name,
frontend_config.endpoint, frontend_config.endpoint_chat,
]
)
# Completions Endpoint
subprocess.run(
[
"llmctl",
"http",
"remove",
"completions",
frontend_config.served_model_name,
]
)
subprocess.run(
[
"llmctl",
"http",
"add",
"completions",
frontend_config.served_model_name,
frontend_config.endpoint_completions,
] ]
) )
......
...@@ -19,7 +19,10 @@ import logging ...@@ -19,7 +19,10 @@ import logging
from common.chat_processor import ChatProcessorMixin from common.chat_processor import ChatProcessorMixin
from common.parser import parse_tensorrt_llm_args from common.parser import parse_tensorrt_llm_args
from common.protocol import DynamoTRTLLMChatCompletionRequest from common.protocol import (
DynamoTRTLLMChatCompletionRequest,
DynamoTRTLLMCompletionRequest,
)
from common.utils import RequestType from common.utils import RequestType
from components.kv_router import Router from components.kv_router import Router
from components.worker import TensorRTLLMWorker from components.worker import TensorRTLLMWorker
...@@ -156,7 +159,7 @@ class Processor(ChatProcessorMixin): ...@@ -156,7 +159,7 @@ class Processor(ChatProcessorMixin):
async for response in self._generate(raw_request, RequestType.CHAT): async for response in self._generate(raw_request, RequestType.CHAT):
yield response yield response
# @dynamo_endpoint() @dynamo_endpoint(name="completions")
# async def completions(self, raw_request): async def completions(self, raw_request: DynamoTRTLLMCompletionRequest):
# async for response in self._generate(raw_request, RequestType.COMPLETION): async for response in self._generate(raw_request, RequestType.COMPLETION):
# yield response yield response
...@@ -15,7 +15,8 @@ ...@@ -15,7 +15,8 @@
Frontend: Frontend:
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint: dynamo.Processor.chat/completions endpoint_completions: dynamo.Processor.completions
endpoint_chat: dynamo.Processor.chat/completions
port: 8000 port: 8000
Processor: Processor:
......
...@@ -15,7 +15,8 @@ ...@@ -15,7 +15,8 @@
Frontend: Frontend:
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint: dynamo.Processor.chat/completions endpoint_completions: dynamo.Processor.completions
endpoint_chat: dynamo.Processor.chat/completions
port: 8000 port: 8000
Processor: Processor:
......
...@@ -15,7 +15,8 @@ ...@@ -15,7 +15,8 @@
Frontend: Frontend:
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint: dynamo.Processor.chat/completions endpoint_completions: dynamo.Processor.completions
endpoint_chat: dynamo.Processor.chat/completions
port: 8000 port: 8000
Processor: Processor:
......
...@@ -15,7 +15,8 @@ ...@@ -15,7 +15,8 @@
Frontend: Frontend:
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint: dynamo.Processor.chat/completions endpoint_completions: dynamo.Processor.completions
endpoint_chat: dynamo.Processor.chat/completions
port: 8000 port: 8000
Processor: Processor:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment