Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
cdddaeda
Unverified
Commit
cdddaeda
authored
Jun 06, 2025
by
Tanmay Verma
Committed by
GitHub
Jun 07, 2025
Browse files
test: Add dynamo serve TRTLLM example to pytest (#1417)
parent
4de7f44c
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
241 additions
and
77 deletions
+241
-77
tests/conftest.py
tests/conftest.py
+16
-0
tests/serve/test_dynamo_serve.py
tests/serve/test_dynamo_serve.py
+203
-73
tests/utils/deployment_graph.py
tests/utils/deployment_graph.py
+20
-4
tests/utils/managed_process.py
tests/utils/managed_process.py
+2
-0
No files found.
tests/conftest.py
View file @
cdddaeda
...
...
@@ -32,6 +32,22 @@ logging.basicConfig(
)
def pytest_collection_modifyitems(config, items):
    """
    This function is called to modify the list of tests to run.
    It is used to skip tests that are not supported on all environments.

    Args:
        config: Object whose ``getoption("-m")`` returns the marker
            expression given on the command line (falsy when none was given).
        items: Collected test items. Each item is checked for a
            ``tensorrtllm`` keyword and, if present, receives a skip marker.

    Returns:
        None. Items are modified in place via ``item.add_marker``.
    """
    # Tests marked with tensorrtllm require a specific environment with
    # tensorrtllm installed. Hence, we skip them unless the user explicitly
    # asked for them.
    mark_expression = config.getoption("-m")

    # NOTE: this is a plain containment check, so any -m value that contains
    # the text "tensorrtllm" turns the automatic skip off.
    if mark_expression and "tensorrtllm" in mark_expression:
        return

    skip_tensorrtllm = pytest.mark.skip(reason="need -m tensorrtllm to run")
    for item in items:
        if "tensorrtllm" in item.keywords:
            item.add_marker(skip_tensorrtllm)
class
EtcdServer
(
ManagedProcess
):
def
__init__
(
self
,
request
,
port
=
2379
,
timeout
=
300
):
port_string
=
str
(
port
)
...
...
tests/serve/test_dynamo_serve.py
View file @
cdddaeda
...
...
@@ -24,6 +24,7 @@ import requests
from
tests.utils.deployment_graph
import
(
DeploymentGraph
,
Payload
,
chat_completions_response_handler
,
completions_response_handler
,
)
from
tests.utils.managed_process
import
ManagedProcess
...
...
@@ -31,7 +32,7 @@ from tests.utils.managed_process import ManagedProcess
text_prompt
=
"Tell me a short joke about AI."
multimodal_payload
=
Payload
(
payload
=
{
payload
_chat
=
{
"model"
:
"llava-hf/llava-1.5-7b-hf"
,
"messages"
:
[
{
...
...
@@ -50,12 +51,13 @@ multimodal_payload = Payload(
"max_tokens"
:
300
,
# Reduced from 500
"stream"
:
False
,
},
repeat_count
=
1
,
expected_log
=
[],
expected_response
=
[
"bus"
],
)
text_payload
=
Payload
(
payload
=
{
payload
_chat
=
{
"model"
:
"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
,
"messages"
:
[
{
...
...
@@ -67,6 +69,14 @@ text_payload = Payload(
"temperature"
:
0.1
,
"seed"
:
0
,
},
payload_completions
=
{
"model"
:
"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
,
"prompt"
:
text_prompt
,
"max_tokens"
:
150
,
"temperature"
:
0.1
,
"seed"
:
0
,
},
repeat_count
=
10
,
expected_log
=
[],
expected_response
=
[
"AI"
],
)
...
...
@@ -77,8 +87,11 @@ deployment_graphs = {
module
=
"graphs.agg:Frontend"
,
config
=
"configs/agg.yaml"
,
directory
=
"/workspace/examples/llm"
,
endpoint
=
"v1/chat/completions"
,
response_handler
=
completions_response_handler
,
endpoints
=
[
"v1/chat/completions"
,
"v1/completions"
],
response_handlers
=
[
chat_completions_response_handler
,
completions_response_handler
,
],
marks
=
[
pytest
.
mark
.
gpu_1
,
pytest
.
mark
.
vllm
],
),
text_payload
,
...
...
@@ -88,8 +101,11 @@ deployment_graphs = {
module
=
"graphs.agg:Frontend"
,
config
=
"configs/agg.yaml"
,
directory
=
"/workspace/examples/sglang"
,
endpoint
=
"v1/chat/completions"
,
response_handler
=
completions_response_handler
,
endpoints
=
[
"v1/chat/completions"
,
"v1/completions"
],
response_handlers
=
[
chat_completions_response_handler
,
completions_response_handler
,
],
marks
=
[
pytest
.
mark
.
gpu_1
,
pytest
.
mark
.
sglang
],
),
text_payload
,
...
...
@@ -99,8 +115,11 @@ deployment_graphs = {
module
=
"graphs.disagg:Frontend"
,
config
=
"configs/disagg.yaml"
,
directory
=
"/workspace/examples/llm"
,
endpoint
=
"v1/chat/completions"
,
response_handler
=
completions_response_handler
,
endpoints
=
[
"v1/chat/completions"
,
"v1/completions"
],
response_handlers
=
[
chat_completions_response_handler
,
completions_response_handler
,
],
marks
=
[
pytest
.
mark
.
gpu_2
,
pytest
.
mark
.
vllm
],
),
text_payload
,
...
...
@@ -110,8 +129,11 @@ deployment_graphs = {
module
=
"graphs.agg_router:Frontend"
,
config
=
"configs/agg_router.yaml"
,
directory
=
"/workspace/examples/llm"
,
endpoint
=
"v1/chat/completions"
,
response_handler
=
completions_response_handler
,
endpoints
=
[
"v1/chat/completions"
,
"v1/completions"
],
response_handlers
=
[
chat_completions_response_handler
,
completions_response_handler
,
],
marks
=
[
pytest
.
mark
.
gpu_1
,
pytest
.
mark
.
vllm
],
),
text_payload
,
...
...
@@ -121,8 +143,11 @@ deployment_graphs = {
module
=
"graphs.disagg_router:Frontend"
,
config
=
"configs/disagg_router.yaml"
,
directory
=
"/workspace/examples/llm"
,
endpoint
=
"v1/chat/completions"
,
response_handler
=
completions_response_handler
,
endpoints
=
[
"v1/chat/completions"
,
"v1/completions"
],
response_handlers
=
[
chat_completions_response_handler
,
completions_response_handler
,
],
marks
=
[
pytest
.
mark
.
gpu_2
,
pytest
.
mark
.
vllm
],
),
text_payload
,
...
...
@@ -132,8 +157,11 @@ deployment_graphs = {
module
=
"graphs.agg:Frontend"
,
config
=
"configs/agg.yaml"
,
directory
=
"/workspace/examples/multimodal"
,
endpoint
=
"v1/chat/completions"
,
response_handler
=
completions_response_handler
,
endpoints
=
[
"v1/chat/completions"
,
"v1/completions"
],
response_handlers
=
[
chat_completions_response_handler
,
completions_response_handler
,
],
marks
=
[
pytest
.
mark
.
gpu_2
,
pytest
.
mark
.
vllm
],
),
multimodal_payload
,
...
...
@@ -143,12 +171,79 @@ deployment_graphs = {
module
=
"graphs.agg:Frontend"
,
config
=
"configs/agg.yaml"
,
directory
=
"/workspace/examples/vllm_v1"
,
endpoint
=
"v1/chat/completions"
,
response_handler
=
completions_response_handler
,
endpoints
=
[
"v1/chat/completions"
,
"v1/completions"
],
response_handlers
=
[
chat_completions_response_handler
,
completions_response_handler
,
],
marks
=
[
pytest
.
mark
.
gpu_1
,
pytest
.
mark
.
vllm
],
),
text_payload
,
),
"trtllm_agg"
:
(
DeploymentGraph
(
module
=
"graphs.agg:Frontend"
,
config
=
"configs/agg.yaml"
,
directory
=
"/workspace/examples/tensorrt_llm"
,
endpoints
=
[
"v1/chat/completions"
,
"v1/completions"
],
response_handlers
=
[
chat_completions_response_handler
,
completions_response_handler
,
],
marks
=
[
pytest
.
mark
.
gpu_1
,
pytest
.
mark
.
tensorrtllm
],
),
text_payload
,
),
"trtllm_agg_router"
:
(
DeploymentGraph
(
module
=
"graphs.agg_router:Frontend"
,
config
=
"configs/agg_router.yaml"
,
directory
=
"/workspace/examples/tensorrt_llm"
,
endpoints
=
[
"v1/chat/completions"
,
"v1/completions"
],
response_handlers
=
[
chat_completions_response_handler
,
completions_response_handler
,
],
marks
=
[
pytest
.
mark
.
gpu_1
,
pytest
.
mark
.
tensorrtllm
],
# FIXME: This is a hack to give the deployment time to start before any requests are sent.
# When using the KV-router, if any endpoint is not yet registered, the service
# enters a non-recoverable state.
delayed_start
=
60
,
),
text_payload
,
),
"trtllm_disagg"
:
(
DeploymentGraph
(
module
=
"graphs.disagg:Frontend"
,
config
=
"configs/disagg.yaml"
,
directory
=
"/workspace/examples/tensorrt_llm"
,
endpoints
=
[
"v1/chat/completions"
,
"v1/completions"
],
response_handlers
=
[
chat_completions_response_handler
,
completions_response_handler
,
],
marks
=
[
pytest
.
mark
.
gpu_2
,
pytest
.
mark
.
tensorrtllm
],
),
text_payload
,
),
"trtllm_disagg_router"
:
(
DeploymentGraph
(
module
=
"graphs.disagg_router:Frontend"
,
config
=
"configs/disagg_router.yaml"
,
directory
=
"/workspace/examples/tensorrt_llm"
,
endpoints
=
[
"v1/chat/completions"
,
"v1/completions"
],
response_handlers
=
[
chat_completions_response_handler
,
completions_response_handler
,
],
marks
=
[
pytest
.
mark
.
gpu_2
,
pytest
.
mark
.
tensorrtllm
],
# FIXME: This is a hack to give the deployment time to start before any requests are sent.
# When using the KV-router, if any endpoint is not yet registered, the service
# enters a non-recoverable state.
delayed_start
=
120
,
),
text_payload
,
),
}
...
...
@@ -175,6 +270,7 @@ class DynamoServeProcess(ManagedProcess):
working_dir
=
graph
.
directory
,
health_check_ports
=
[
port
],
health_check_urls
=
health_check_urls
,
delayed_start
=
graph
.
delayed_start
,
stragglers
=
[
"http"
],
log_dir
=
request
.
node
.
name
,
)
...
...
@@ -196,6 +292,16 @@ class DynamoServeProcess(ManagedProcess):
pytest
.
param
(
"disagg"
,
marks
=
[
pytest
.
mark
.
vllm
,
pytest
.
mark
.
gpu_2
]),
pytest
.
param
(
"disagg_router"
,
marks
=
[
pytest
.
mark
.
vllm
,
pytest
.
mark
.
gpu_2
]),
pytest
.
param
(
"multimodal_agg"
,
marks
=
[
pytest
.
mark
.
vllm
,
pytest
.
mark
.
gpu_2
]),
pytest
.
param
(
"trtllm_agg"
,
marks
=
[
pytest
.
mark
.
tensorrtllm
,
pytest
.
mark
.
gpu_1
]),
pytest
.
param
(
"trtllm_agg_router"
,
marks
=
[
pytest
.
mark
.
tensorrtllm
,
pytest
.
mark
.
gpu_1
]
),
pytest
.
param
(
"trtllm_disagg"
,
marks
=
[
pytest
.
mark
.
tensorrtllm
,
pytest
.
mark
.
gpu_2
]
),
pytest
.
param
(
"trtllm_disagg_router"
,
marks
=
[
pytest
.
mark
.
tensorrtllm
,
pytest
.
mark
.
gpu_2
]
),
# pytest.param("sglang", marks=[pytest.mark.sglang, pytest.mark.gpu_2]),
]
)
...
...
@@ -220,17 +326,40 @@ def test_serve_deployment(deployment_graph_test, request, runtime_services):
deployment_graph
,
payload
=
deployment_graph_test
def check_response(response, response_handler):
    """
    Validate a single HTTP response from the deployment under test.

    Args:
        response: Response object exposing ``status_code``; it is passed
            unchanged to ``response_handler``.
        response_handler: Callable that extracts the text content from
            ``response``.

    Raises:
        AssertionError: If the status code is not 200, the extracted content
            is empty, or any entry of ``payload.expected_response`` is not
            contained in the content.
    """
    assert response.status_code == 200, "Server is not healthy"

    content = response_handler(response)
    logger.info("Received Content: %s", content)

    # Check for expected responses
    assert content, "Empty response content"
    for expected in payload.expected_response:
        assert expected in content, "Expected '%s' not found in response" % expected
with
DynamoServeProcess
(
deployment_graph
,
request
)
as
server_process
:
url
=
f
"http://localhost:
{
server_process
.
port
}
/
{
deployment_graph
.
endpoint
}
"
first_success_pending
=
True
for
endpoint
,
response_handler
in
zip
(
deployment_graph
.
endpoints
,
deployment_graph
.
response_handlers
):
url
=
f
"http://localhost:
{
server_process
.
port
}
/
{
endpoint
}
"
start_time
=
time
.
time
()
retry_delay
=
5
elapsed
=
0.0
while
time
.
time
()
-
start_time
<
deployment_graph
.
timeout
:
request_body
=
(
payload
.
payload_chat
if
endpoint
==
"v1/chat/completions"
else
payload
.
payload_completions
)
# We can skip this
while
(
time
.
time
()
-
start_time
<
deployment_graph
.
timeout
and
first_success_pending
):
elapsed
=
time
.
time
()
-
start_time
try
:
response
=
requests
.
post
(
url
,
json
=
payload
.
payload
,
json
=
request_body
,
timeout
=
deployment_graph
.
timeout
-
elapsed
,
)
except
(
requests
.
RequestException
,
requests
.
Timeout
)
as
e
:
...
...
@@ -262,8 +391,11 @@ def test_serve_deployment(deployment_graph_test, request, runtime_services):
%
(
response
.
status_code
,
response
.
text
)
)
else
:
check_response
(
response
,
response_handler
)
first_success_pending
=
False
break
else
:
if
first_success_pending
:
logger
.
error
(
"Service did not return a successful response within %s s"
,
deployment_graph
.
timeout
,
...
...
@@ -273,12 +405,10 @@ def test_serve_deployment(deployment_graph_test, request, runtime_services):
%
deployment_graph
.
timeout
)
content
=
deployment_graph
.
response_handler
(
response
)
logger
.
info
(
"Received Content: %s"
,
content
)
# Check for expected responses
assert
content
,
"Empty response content"
for
expected
in
payload
.
expected_response
:
assert
expected
in
content
,
"Expected '%s' not found in response"
%
expected
for
_
in
range
(
payload
.
repeat_count
):
response
=
requests
.
post
(
url
,
json
=
request_body
,
timeout
=
deployment_graph
.
timeout
-
elapsed
,
)
check_response
(
response
,
response_handler
)
tests/utils/deployment_graph.py
View file @
cdddaeda
...
...
@@ -26,9 +26,10 @@ class DeploymentGraph:
module
:
str
config
:
str
directory
:
str
endpoint
:
str
response_handler
:
Callable
[[
Any
],
str
]
endpoint
s
:
List
[
str
]
response_handler
s
:
List
[
Callable
[[
Any
],
str
]
]
timeout
:
int
=
900
delayed_start
:
int
=
0
marks
:
Optional
[
List
[
Any
]]
=
field
(
default_factory
=
list
)
...
...
@@ -38,12 +39,14 @@ class Payload:
Represents a test payload with expected response and log patterns.
"""
payload
:
Dict
[
str
,
Any
]
payload
_chat
:
Dict
[
str
,
Any
]
expected_response
:
List
[
str
]
expected_log
:
List
[
str
]
repeat_count
:
int
=
1
payload_completions
:
Optional
[
Dict
[
str
,
Any
]]
=
None
def
completions_response_handler
(
response
):
def
chat_
completions_response_handler
(
response
):
"""
Process chat completions API responses.
"""
...
...
@@ -55,3 +58,16 @@ def completions_response_handler(response):
assert
"message"
in
result
[
"choices"
][
0
],
"Missing 'message' in first choice"
assert
"content"
in
result
[
"choices"
][
0
][
"message"
],
"Missing 'content' in message"
return
result
[
"choices"
][
0
][
"message"
][
"content"
]
def completions_response_handler(response):
    """
    Process completions API responses.

    Args:
        response: Response object exposing ``status_code`` and ``json()``.

    Returns:
        The ``text`` field of the first entry in ``choices``, or an empty
        string when the status code is not 200.

    Raises:
        AssertionError: If the decoded body has no ``choices`` key, the
            choices are empty, or the first choice has no ``text`` key.
    """
    # Non-200 responses yield an empty string rather than raising here; the
    # body is not decoded in that case.
    if response.status_code != 200:
        return ""

    result = response.json()
    assert "choices" in result, "Missing 'choices' in response"
    assert len(result["choices"]) > 0, "Empty choices in response"

    first_choice = result["choices"][0]
    assert "text" in first_choice, "Missing 'text' in first choice"
    return first_choice["text"]
tests/utils/managed_process.py
View file @
cdddaeda
...
...
@@ -32,6 +32,7 @@ class ManagedProcess:
env
:
Optional
[
dict
]
=
None
health_check_ports
:
List
[
int
]
=
field
(
default_factory
=
list
)
health_check_urls
:
List
[
Any
]
=
field
(
default_factory
=
list
)
delayed_start
:
int
=
0
timeout
:
int
=
300
working_dir
:
Optional
[
str
]
=
None
display_output
:
bool
=
False
...
...
@@ -59,6 +60,7 @@ class ManagedProcess:
self
.
_terminate_existing
()
self
.
_start_process
()
time
.
sleep
(
self
.
delayed_start
)
elapsed
=
self
.
_check_ports
(
self
.
timeout
)
self
.
_check_urls
(
self
.
timeout
-
elapsed
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment