Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
960ee927
Unverified
Commit
960ee927
authored
May 02, 2025
by
Tanmay Verma
Committed by
GitHub
May 02, 2025
Browse files
feat: Update to support completion endpoint in TRTLLM (#837)
parent
f0ac8e2b
Changes
8
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
46 additions
and
24 deletions
+46
-24
examples/tensorrt_llm/README.md
examples/tensorrt_llm/README.md
+0
-6
examples/tensorrt_llm/common/chat_processor.py
examples/tensorrt_llm/common/chat_processor.py
+5
-7
examples/tensorrt_llm/components/frontend.py
examples/tensorrt_llm/components/frontend.py
+25
-2
examples/tensorrt_llm/components/processor.py
examples/tensorrt_llm/components/processor.py
+8
-5
examples/tensorrt_llm/configs/agg.yaml
examples/tensorrt_llm/configs/agg.yaml
+2
-1
examples/tensorrt_llm/configs/agg_router.yaml
examples/tensorrt_llm/configs/agg_router.yaml
+2
-1
examples/tensorrt_llm/configs/disagg.yaml
examples/tensorrt_llm/configs/disagg.yaml
+2
-1
examples/tensorrt_llm/configs/disagg_router.yaml
examples/tensorrt_llm/configs/disagg_router.yaml
+2
-1
No files found.
examples/tensorrt_llm/README.md
View file @
960ee927
...
@@ -131,18 +131,12 @@ cd /workspace/examples/tensorrt_llm
...
@@ -131,18 +131,12 @@ cd /workspace/examples/tensorrt_llm
dynamo serve graphs.disagg:Frontend
-f
./configs/disagg.yaml
dynamo serve graphs.disagg:Frontend
-f
./configs/disagg.yaml
```
```
We are defining TRTLLM_USE_UCX_KVCACHE so that TRTLLM uses UCX for transfering the KV
cache between the context and generation workers.
#### Disaggregated serving with KV Routing
#### Disaggregated serving with KV Routing
```
bash
```
bash
cd
/workspace/examples/tensorrt_llm
cd
/workspace/examples/tensorrt_llm
dynamo serve graphs.disagg_router:Frontend
-f
./configs/disagg_router.yaml
dynamo serve graphs.disagg_router:Frontend
-f
./configs/disagg_router.yaml
```
```
We are defining TRTLLM_USE_UCX_KVCACHE so that TRTLLM uses UCX for transfering the KV
cache between the context and generation workers.
#### Multi-Node Disaggregated Serving
#### Multi-Node Disaggregated Serving
In the following example, we will demonstrate how to run a Disaggregated Serving
In the following example, we will demonstrate how to run a Disaggregated Serving
...
...
examples/tensorrt_llm/common/chat_processor.py
View file @
960ee927
...
@@ -19,7 +19,6 @@ from typing import Any, Dict, List, Union
...
@@ -19,7 +19,6 @@ from typing import Any, Dict, List, Union
from
common.parser
import
LLMAPIConfig
from
common.parser
import
LLMAPIConfig
from
common.protocol
import
(
from
common.protocol
import
(
DisaggregatedTypeConverter
,
DynamoTRTLLMChatCompletionResponseStreamChoice
,
DynamoTRTLLMChatCompletionResponseStreamChoice
,
DynamoTRTLLMChatCompletionStreamResponse
,
DynamoTRTLLMChatCompletionStreamResponse
,
DynamoTRTLLMCompletionResponseStreamChoice
,
DynamoTRTLLMCompletionResponseStreamChoice
,
...
@@ -190,7 +189,7 @@ class ChatProcessor(BaseChatProcessor):
...
@@ -190,7 +189,7 @@ class ChatProcessor(BaseChatProcessor):
)
)
if
response
.
outputs
[
0
].
disaggregated_params
is
not
None
:
if
response
.
outputs
[
0
].
disaggregated_params
is
not
None
:
# Do not include the disaggregated params in response
# Do not include the disaggregated params in response
# from
P
rocessor.
# from
p
rocessor.
pass
pass
chunk
=
DynamoTRTLLMChatCompletionStreamResponse
(
chunk
=
DynamoTRTLLMChatCompletionStreamResponse
(
...
@@ -403,11 +402,9 @@ class CompletionsProcessor:
...
@@ -403,11 +402,9 @@ class CompletionsProcessor:
finish_reason
=
output
.
finish_reason
,
finish_reason
=
output
.
finish_reason
,
)
)
if
output
.
disaggregated_params
is
not
None
:
if
output
.
disaggregated_params
is
not
None
:
choice
.
disaggregated_params
=
(
# Block the disagg_params
DisaggregatedTypeConverter
.
to_oai_disaggregated_params
(
pass
output
.
disaggregated_params
)
)
chunk
=
DynamoTRTLLMCompletionStreamResponse
(
chunk
=
DynamoTRTLLMCompletionStreamResponse
(
model
=
self
.
model
,
model
=
self
.
model
,
choices
=
[
choice
],
choices
=
[
choice
],
...
@@ -429,6 +426,7 @@ class CompletionsProcessor:
...
@@ -429,6 +426,7 @@ class CompletionsProcessor:
return
TRTLLMWorkerRequest
(
return
TRTLLMWorkerRequest
(
id
=
request
.
id
,
id
=
request
.
id
,
model
=
request
.
model
,
prompt
=
prompt
,
prompt
=
prompt
,
sampling_params
=
asdict
(
sampling_params
),
sampling_params
=
asdict
(
sampling_params
),
disaggregated_params
=
request
.
disaggregated_params
,
disaggregated_params
=
request
.
disaggregated_params
,
...
...
examples/tensorrt_llm/components/frontend.py
View file @
960ee927
...
@@ -41,7 +41,8 @@ def get_http_binary_path():
...
@@ -41,7 +41,8 @@ def get_http_binary_path():
class
FrontendConfig
(
BaseModel
):
class
FrontendConfig
(
BaseModel
):
served_model_name
:
str
served_model_name
:
str
endpoint
:
str
endpoint_chat
:
str
endpoint_completions
:
str
port
:
int
=
8080
port
:
int
=
8080
...
@@ -64,6 +65,7 @@ class Frontend:
...
@@ -64,6 +65,7 @@ class Frontend:
config
=
ServiceConfig
.
get_instance
()
config
=
ServiceConfig
.
get_instance
()
frontend_config
=
FrontendConfig
(
**
config
.
get
(
"Frontend"
,
{}))
frontend_config
=
FrontendConfig
(
**
config
.
get
(
"Frontend"
,
{}))
# Chat/completions Endpoint
subprocess
.
run
(
subprocess
.
run
(
[
[
"llmctl"
,
"llmctl"
,
...
@@ -80,7 +82,28 @@ class Frontend:
...
@@ -80,7 +82,28 @@ class Frontend:
"add"
,
"add"
,
"chat-models"
,
"chat-models"
,
frontend_config
.
served_model_name
,
frontend_config
.
served_model_name
,
frontend_config
.
endpoint
,
frontend_config
.
endpoint_chat
,
]
)
# Completions Endpoint
subprocess
.
run
(
[
"llmctl"
,
"http"
,
"remove"
,
"completions"
,
frontend_config
.
served_model_name
,
]
)
subprocess
.
run
(
[
"llmctl"
,
"http"
,
"add"
,
"completions"
,
frontend_config
.
served_model_name
,
frontend_config
.
endpoint_completions
,
]
]
)
)
...
...
examples/tensorrt_llm/components/processor.py
View file @
960ee927
...
@@ -19,7 +19,10 @@ import logging
...
@@ -19,7 +19,10 @@ import logging
from
common.chat_processor
import
ChatProcessorMixin
from
common.chat_processor
import
ChatProcessorMixin
from
common.parser
import
parse_tensorrt_llm_args
from
common.parser
import
parse_tensorrt_llm_args
from
common.protocol
import
DynamoTRTLLMChatCompletionRequest
from
common.protocol
import
(
DynamoTRTLLMChatCompletionRequest
,
DynamoTRTLLMCompletionRequest
,
)
from
common.utils
import
RequestType
from
common.utils
import
RequestType
from
components.kv_router
import
Router
from
components.kv_router
import
Router
from
components.worker
import
TensorRTLLMWorker
from
components.worker
import
TensorRTLLMWorker
...
@@ -156,7 +159,7 @@ class Processor(ChatProcessorMixin):
...
@@ -156,7 +159,7 @@ class Processor(ChatProcessorMixin):
async
for
response
in
self
.
_generate
(
raw_request
,
RequestType
.
CHAT
):
async
for
response
in
self
.
_generate
(
raw_request
,
RequestType
.
CHAT
):
yield
response
yield
response
#
@dynamo_endpoint()
@
dynamo_endpoint
(
name
=
"completions"
)
#
async def completions(self, raw_request):
async
def
completions
(
self
,
raw_request
:
DynamoTRTLLMCompletionRequest
):
#
async for response in self._generate(raw_request, RequestType.COMPLETION):
async
for
response
in
self
.
_generate
(
raw_request
,
RequestType
.
COMPLETION
):
#
yield response
yield
response
examples/tensorrt_llm/configs/agg.yaml
View file @
960ee927
...
@@ -15,7 +15,8 @@
...
@@ -15,7 +15,8 @@
Frontend
:
Frontend
:
served_model_name
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
served_model_name
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint
:
dynamo.Processor.chat/completions
endpoint_completions
:
dynamo.Processor.completions
endpoint_chat
:
dynamo.Processor.chat/completions
port
:
8000
port
:
8000
Processor
:
Processor
:
...
...
examples/tensorrt_llm/configs/agg_router.yaml
View file @
960ee927
...
@@ -15,7 +15,8 @@
...
@@ -15,7 +15,8 @@
Frontend
:
Frontend
:
served_model_name
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
served_model_name
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint
:
dynamo.Processor.chat/completions
endpoint_completions
:
dynamo.Processor.completions
endpoint_chat
:
dynamo.Processor.chat/completions
port
:
8000
port
:
8000
Processor
:
Processor
:
...
...
examples/tensorrt_llm/configs/disagg.yaml
View file @
960ee927
...
@@ -15,7 +15,8 @@
...
@@ -15,7 +15,8 @@
Frontend
:
Frontend
:
served_model_name
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
served_model_name
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint
:
dynamo.Processor.chat/completions
endpoint_completions
:
dynamo.Processor.completions
endpoint_chat
:
dynamo.Processor.chat/completions
port
:
8000
port
:
8000
Processor
:
Processor
:
...
...
examples/tensorrt_llm/configs/disagg_router.yaml
View file @
960ee927
...
@@ -15,7 +15,8 @@
...
@@ -15,7 +15,8 @@
Frontend
:
Frontend
:
served_model_name
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
served_model_name
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint
:
dynamo.Processor.chat/completions
endpoint_completions
:
dynamo.Processor.completions
endpoint_chat
:
dynamo.Processor.chat/completions
port
:
8000
port
:
8000
Processor
:
Processor
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment