Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
316b1bf7
Unverified
Commit
316b1bf7
authored
Jul 23, 2025
by
Nick Hill
Committed by
GitHub
Jul 23, 2025
Browse files
[Tests] Add tests for headless internal DP LB (#21450)
Signed-off-by:
Nick Hill
<
nhill@redhat.com
>
parent
7c734ee0
Changes
4
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
768 additions
and
120 deletions
+768
-120
.buildkite/test-pipeline.yaml
.buildkite/test-pipeline.yaml
+2
-0
tests/v1/entrypoints/openai/test_multi_api_servers.py
tests/v1/entrypoints/openai/test_multi_api_servers.py
+3
-120
tests/v1/test_internal_lb_dp.py
tests/v1/test_internal_lb_dp.py
+639
-0
tests/v1/test_utils.py
tests/v1/test_utils.py
+124
-0
No files found.
.buildkite/test-pipeline.yaml
View file @
316b1bf7
...
@@ -165,6 +165,7 @@ steps:
...
@@ -165,6 +165,7 @@ steps:
-
tests/examples/offline_inference/data_parallel.py
-
tests/examples/offline_inference/data_parallel.py
-
tests/v1/test_async_llm_dp.py
-
tests/v1/test_async_llm_dp.py
-
tests/v1/test_external_lb_dp.py
-
tests/v1/test_external_lb_dp.py
-
tests/v1/test_internal_lb_dp.py
-
tests/v1/engine/test_engine_core_client.py
-
tests/v1/engine/test_engine_core_client.py
commands
:
commands
:
# test with tp=2 and external_dp=2
# test with tp=2 and external_dp=2
...
@@ -176,6 +177,7 @@ steps:
...
@@ -176,6 +177,7 @@ steps:
-
python3 ../examples/offline_inference/data_parallel.py --enforce-eager
-
python3 ../examples/offline_inference/data_parallel.py --enforce-eager
-
TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
-
TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
-
TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
-
TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
-
TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_internal_lb_dp.py
-
pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
-
pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
-
pytest -v -s distributed/test_utils.py
-
pytest -v -s distributed/test_utils.py
-
pytest -v -s compile/test_basic_correctness.py
-
pytest -v -s compile/test_basic_correctness.py
...
...
tests/v1/entrypoints/openai/test_multi_api_servers.py
View file @
316b1bf7
...
@@ -2,136 +2,19 @@
...
@@ -2,136 +2,19 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
asyncio
import
asyncio
import
os
import
os
import
re
import
openai
# use the official client for correctness check
import
openai
# use the official client for correctness check
import
pytest
import
pytest
import
pytest_asyncio
import
pytest_asyncio
import
requests
from
tests.utils
import
RemoteOpenAIServer
from
tests.utils
import
RemoteOpenAIServer
from
tests.v1.test_utils
import
check_request_balancing
MODEL_NAME
=
"ibm-research/PowerMoE-3b"
MODEL_NAME
=
"ibm-research/PowerMoE-3b"
DP_SIZE
=
os
.
getenv
(
"DP_SIZE"
,
"1"
)
DP_SIZE
=
os
.
getenv
(
"DP_SIZE"
,
"1"
)
# One sample line of the Prometheus text exposition format:
#   <name>[{<labels>}] <value> [<timestamp>]
# The value is captured loosely (\S+) and validated with float(), so the
# special values NaN / +Inf / -Inf and upper-case exponents are accepted.
# The label group is greedy so a '}' inside a quoted label value does not
# truncate the label set.
_PROMETHEUS_SAMPLE_RE = re.compile(
    r'^([a-zA-Z_:][a-zA-Z0-9_:]*)(?:\{(.*)\})?\s+(\S+)(?:\s+-?\d+)?$')


def _parse_prometheus_text(text: str) -> dict[str, dict[str, float]]:
    """Parse Prometheus text-format output into {metric: {labels: value}}."""
    metrics: dict[str, dict[str, float]] = {}
    for raw_line in text.splitlines():
        line = raw_line.strip()
        # Skip comments (# HELP / # TYPE) and empty lines
        if not line or line.startswith('#'):
            continue
        match = _PROMETHEUS_SAMPLE_RE.match(line)
        if match is None:
            continue
        metric_name, labels_part, value_str = match.groups()
        try:
            value = float(value_str)
        except ValueError:
            # Not a numeric sample; ignore the line.
            continue
        # Labelled samples are keyed by the brace-wrapped raw label string,
        # unlabelled samples by the empty string.
        label_key = '' if labels_part is None else f'{{{labels_part}}}'
        metrics.setdefault(metric_name, {})[label_key] = value
    return metrics


def get_prometheus_metrics(
        server: RemoteOpenAIServer) -> dict[str, dict[str, float]]:
    """Fetch and parse Prometheus metrics from the /metrics endpoint.

    Fails the current test (via ``pytest.fail``) if the HTTP request
    errors out or returns a non-success status.

    Args:
        server: Server whose ``url_for("metrics")`` endpoint is queried.

    Returns:
        Dict mapping metric names to their values grouped by label string.
        Label strings are the raw, brace-wrapped label set of the sample;
        samples without labels use the empty string as key.
        For example: {"vllm:request_success_total": {
            '{engine="0"}': 5.0, '{engine="1"}': 3.0}
        }
    """
    try:
        response = requests.get(server.url_for("metrics"), timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        # pytest.fail raises, so nothing after it in this branch would run.
        pytest.fail(f"Failed to fetch Prometheus metrics: {e}")
    return _parse_prometheus_text(response.text)
def get_engine_request_counts(
        metrics: dict[str, dict[str, float]],
        *,
        metric_name: str = "vllm:request_success_total") -> dict[str, float]:
    """Extract request counts per engine from Prometheus metrics.

    Args:
        metrics: Mapping of metric name to {label string: value}, where each
            label string contains the raw sample labels
            (e.g. '{engine="0",finished_reason="stop"}').
        metric_name: Name of the counter to aggregate. Defaults to the
            request-success counter.

    Returns:
        Dict mapping engine indices to request counts, summed over every
        label set that carries the same ``engine`` label. Samples without
        an ``engine`` label are ignored.
        For example: {"0": 15.0, "1": 12.0}
    """
    engine_pattern = re.compile(r'engine="([^"]*)"')
    engine_counts: dict[str, float] = {}

    for labels, count in metrics.get(metric_name, {}).items():
        # Extract engine ID from labels using regex
        match = engine_pattern.search(labels)
        if match is None:
            continue
        engine_id = match.group(1)
        engine_counts[engine_id] = engine_counts.get(engine_id, 0.0) + count

    return engine_counts
def check_request_balancing(server: RemoteOpenAIServer):
    """Assert that requests were spread across all data-parallel engines.

    The data-parallel size is read from the module-level ``DP_SIZE`` value;
    the check is a no-op when it is 1 or less.

    Args:
        server: The RemoteOpenAIServer instance whose metrics are inspected
    """
    dp_size = int(DP_SIZE)
    if dp_size <= 1:
        return

    # Metrics are read once all requests have completed.
    per_engine = get_engine_request_counts(get_prometheus_metrics(server))

    # Every engine must have served at least one request.
    engines_with_requests = [
        engine_id for engine_id, served in per_engine.items() if served > 0
    ]
    assert len(engines_with_requests) == dp_size, (
        f"Expected requests to be distributed across multiple engines,"
        f" but only engine(s) {engines_with_requests} received "
        f"requests. Engine counts: {per_engine}")

    # Each engine must have served strictly more than
    # total // (dp_size + 1) requests.
    min_share = sum(per_engine.values()) // (dp_size + 1)
    for served in per_engine.values():
        assert served > min_share, (
            f"requests are imbalanced: {per_engine}")
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
default_server_args
():
def
default_server_args
():
return
[
return
[
...
@@ -217,7 +100,7 @@ async def test_single_completion(client: openai.AsyncOpenAI,
...
@@ -217,7 +100,7 @@ async def test_single_completion(client: openai.AsyncOpenAI,
assert
all
(
completion
is
not
None
for
completion
in
results
)
assert
all
(
completion
is
not
None
for
completion
in
results
)
# Check request balancing via Prometheus metrics if DP_SIZE > 1
# Check request balancing via Prometheus metrics if DP_SIZE > 1
check_request_balancing
(
server
)
check_request_balancing
(
server
,
int
(
DP_SIZE
)
)
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
...
@@ -295,4 +178,4 @@ async def test_completion_streaming(client: openai.AsyncOpenAI,
...
@@ -295,4 +178,4 @@ async def test_completion_streaming(client: openai.AsyncOpenAI,
assert
all
(
results
),
"Not all streaming requests completed successfully."
assert
all
(
results
),
"Not all streaming requests completed successfully."
# Check request balancing via Prometheus metrics if DP_SIZE > 1
# Check request balancing via Prometheus metrics if DP_SIZE > 1
check_request_balancing
(
server
)
check_request_balancing
(
server
,
int
(
DP_SIZE
)
)
tests/v1/test_internal_lb_dp.py
0 → 100644
View file @
316b1bf7
This diff is collapsed.
Click to expand it.
tests/v1/test_utils.py
View file @
316b1bf7
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
re
import
pytest
import
requests
import
torch
import
torch
from
tests.utils
import
RemoteOpenAIServer
from
vllm.v1.worker.utils
import
bind_kv_cache
from
vllm.v1.worker.utils
import
bind_kv_cache
...
@@ -61,3 +66,122 @@ def test_bind_kv_cache_non_attention():
...
@@ -61,3 +66,122 @@ def test_bind_kv_cache_non_attention():
assert
runner_kv_caches
[
0
]
is
kv_cache
[
'model.layers.20.attn'
]
assert
runner_kv_caches
[
0
]
is
kv_cache
[
'model.layers.20.attn'
]
assert
runner_kv_caches
[
1
]
is
kv_cache
[
'model.layers.28.attn'
]
assert
runner_kv_caches
[
1
]
is
kv_cache
[
'model.layers.28.attn'
]
# Prometheus metrics utilities for testing
# One sample line of the Prometheus text exposition format:
#   <name>[{<labels>}] <value> [<timestamp>]
# The value is captured loosely (\S+) and validated with float(), so the
# special values NaN / +Inf / -Inf and upper-case exponents are accepted.
# The label group is greedy so a '}' inside a quoted label value does not
# truncate the label set.
_PROMETHEUS_SAMPLE_RE = re.compile(
    r'^([a-zA-Z_:][a-zA-Z0-9_:]*)(?:\{(.*)\})?\s+(\S+)(?:\s+-?\d+)?$')


def _parse_prometheus_text(text: str) -> dict[str, dict[str, float]]:
    """Parse Prometheus text-format output into {metric: {labels: value}}."""
    metrics: dict[str, dict[str, float]] = {}
    for raw_line in text.splitlines():
        line = raw_line.strip()
        # Skip comments (# HELP / # TYPE) and empty lines
        if not line or line.startswith('#'):
            continue
        match = _PROMETHEUS_SAMPLE_RE.match(line)
        if match is None:
            continue
        metric_name, labels_part, value_str = match.groups()
        try:
            value = float(value_str)
        except ValueError:
            # Not a numeric sample; ignore the line.
            continue
        # Labelled samples are keyed by the brace-wrapped raw label string,
        # unlabelled samples by the empty string.
        label_key = '' if labels_part is None else f'{{{labels_part}}}'
        metrics.setdefault(metric_name, {})[label_key] = value
    return metrics


def get_prometheus_metrics(
        server: RemoteOpenAIServer) -> dict[str, dict[str, float]]:
    """Fetch and parse Prometheus metrics from the /metrics endpoint.

    Fails the current test (via ``pytest.fail``) if the HTTP request
    errors out or returns a non-success status.

    Args:
        server: Server whose ``url_for("metrics")`` endpoint is queried.

    Returns:
        Dict mapping metric names to their values grouped by label string.
        Label strings are the raw, brace-wrapped label set of the sample;
        samples without labels use the empty string as key.
        For example: {"vllm:request_success_total": {
            '{engine="0"}': 5.0, '{engine="1"}': 3.0}
        }
    """
    try:
        response = requests.get(server.url_for("metrics"), timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        # pytest.fail raises, so nothing after it in this branch would run.
        pytest.fail(f"Failed to fetch Prometheus metrics: {e}")
    return _parse_prometheus_text(response.text)
def get_engine_request_counts(
        metrics: dict[str, dict[str, float]],
        *,
        metric_name: str = "vllm:request_success_total") -> dict[str, float]:
    """Extract request counts per engine from Prometheus metrics.

    Args:
        metrics: Mapping of metric name to {label string: value}, where each
            label string contains the raw sample labels
            (e.g. '{engine="0",finished_reason="stop"}').
        metric_name: Name of the counter to aggregate. Defaults to the
            request-success counter.

    Returns:
        Dict mapping engine indices to request counts, summed over every
        label set that carries the same ``engine`` label. Samples without
        an ``engine`` label are ignored.
        For example: {"0": 15.0, "1": 12.0}
    """
    engine_pattern = re.compile(r'engine="([^"]*)"')
    engine_counts: dict[str, float] = {}

    for labels, count in metrics.get(metric_name, {}).items():
        # Extract engine ID from labels using regex
        match = engine_pattern.search(labels)
        if match is None:
            continue
        engine_id = match.group(1)
        engine_counts[engine_id] = engine_counts.get(engine_id, 0.0) + count

    return engine_counts
def check_request_balancing(server: RemoteOpenAIServer, dp_size: int):
    """Assert that requests were spread across all data-parallel engines.

    The check is a no-op when dp_size is 1 or less.

    Args:
        server: The RemoteOpenAIServer instance whose metrics are inspected
        dp_size: Number of data parallel ranks
    """
    if dp_size <= 1:
        return

    # Metrics are read once all requests have completed.
    per_engine = get_engine_request_counts(get_prometheus_metrics(server))

    # Every engine must have served at least one request.
    engines_with_requests = [
        engine_id for engine_id, served in per_engine.items() if served > 0
    ]
    assert len(engines_with_requests) == dp_size, (
        f"Expected requests to be distributed across multiple engines,"
        f" but only engine(s) {engines_with_requests} received "
        f"requests. Engine counts: {per_engine}")

    # Each engine must have served strictly more than
    # total // (dp_size + 1) requests.
    min_share = sum(per_engine.values()) // (dp_size + 1)
    for served in per_engine.values():
        assert served > min_share, (
            f"requests are imbalanced: {per_engine}")
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment