Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
4f9b14c2
Unverified
Commit
4f9b14c2
authored
Mar 16, 2026
by
Andreas Karatzas
Committed by
GitHub
Mar 16, 2026
Browse files
[CI] Stabilize multinode DP internal LB completion tests (#36356)
Signed-off-by:
Andreas Karatzas
<
akaratza@amd.com
>
parent
31a458c0
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
89 additions
and
94 deletions
+89
-94
tests/v1/distributed/test_internal_lb_dp.py
tests/v1/distributed/test_internal_lb_dp.py
+89
-94
No files found.
tests/v1/distributed/test_internal_lb_dp.py
View file @
4f9b14c2
...
...
@@ -12,7 +12,7 @@ import pytest
import
pytest_asyncio
import
requests
from
tests.utils
import
RemoteOpenAIServer
from
tests.utils
import
ROCM_ENV_OVERRIDES
,
RemoteOpenAIServer
from
tests.v1.utils
import
check_request_balancing
from
vllm.platforms
import
current_platform
...
...
@@ -27,6 +27,84 @@ TP_SIZE = int(os.getenv("TP_SIZE", "1"))
NUM_NODES
=
2
async def _make_completion_request(
    client: openai.AsyncOpenAI,
    model_name: str,
) -> openai.types.Completion:
    """Make a single completion request and validate the response.

    Uses temperature=1.0 to ensure diverse outputs across concurrent
    requests for realistic load balancer testing.

    Args:
        client: Async OpenAI client used to issue the completion request.
        model_name: Model identifier passed as ``model`` to the request.

    Returns:
        The completion object returned by the client, after validation.

    Raises:
        AssertionError: If the response has no id, does not contain exactly
            one choice, has an unexpected ``finish_reason``, has empty text
            with ``finish_reason='length'``, or lacks positive usage counts.
    """
    completion = await client.completions.create(
        model=model_name,
        prompt="Hello, my name is",
        max_tokens=5,
        temperature=1.0,
    )

    assert completion.id is not None, (
        f"Expected non-None completion id. usage={completion.usage!r}"
    )

    choices = completion.choices
    # Distinguish "missing" from "empty" in the message: an empty list must
    # be reported as 0 choices, not as 'None'.
    assert choices is not None and len(choices) == 1, (
        f"Expected 1 choice, got "
        f"{len(choices) if choices is not None else 'None'}"
    )

    choice = choices[0]
    # With temperature=1.0, the model may emit a stop token immediately,
    # producing empty text with finish_reason='stop'. This is valid
    # model behavior - the test's purpose is load balancing, not output
    # quality.
    assert choice.finish_reason in ("length", "stop"), (
        f"Expected finish_reason 'length' or 'stop', "
        f"got {choice.finish_reason!r}. text={choice.text!r}"
    )
    if choice.finish_reason == "length":
        assert len(choice.text) >= 1, (
            f"Expected non-empty text with finish_reason='length', got {choice.text!r}"
        )

    usage = completion.usage
    # `usage` is optional on the response type; fail with a readable
    # assertion instead of an AttributeError on the attribute reads below.
    assert usage is not None, (
        f"Expected usage to be populated, got None. id={completion.id!r}"
    )
    assert usage.prompt_tokens > 0, (
        f"Expected positive prompt_tokens, got {usage.prompt_tokens}"
    )
    assert usage.total_tokens > 0, (
        f"Expected positive total_tokens, got {usage.total_tokens}"
    )

    return completion
async def _run_request_bursts(
    client: openai.AsyncOpenAI,
    model_name: str,
    num_requests: int = 200,
    num_bursts: int = 2,
):
    """Fire several bursts of completion requests and require all to succeed.

    Each burst schedules ``num_requests`` tasks, pausing 10 ms between task
    creations, then waits for the whole burst to finish. The first exception
    found among the gathered results (in submission order) is re-raised.
    A 0.5 s pause follows every burst.

    Args:
        client: Async OpenAI client forwarded to each request.
        model_name: Model identifier forwarded to each request.
        num_requests: Number of requests scheduled per burst.
        num_bursts: Number of bursts to run back to back.

    Raises:
        AssertionError: If the result count is wrong or a result is None.
        BaseException: The first failure captured from a request task.
    """
    for burst in range(num_bursts):
        pending = []
        for _ in range(num_requests):
            task = asyncio.create_task(_make_completion_request(client, model_name))
            pending.append(task)
            # Stagger submissions slightly rather than sending all at once.
            await asyncio.sleep(0.01)

        # Collect failures as values so every task is awaited before any
        # error is surfaced.
        outcomes = await asyncio.gather(*pending, return_exceptions=True)
        assert len(outcomes) == num_requests, (
            f"Burst {burst}: expected {num_requests} results, got {len(outcomes)}"
        )

        first_failure = next(
            (item for item in outcomes if isinstance(item, BaseException)),
            None,
        )
        if first_failure is not None:
            raise first_failure

        assert not any(item is None for item in outcomes), (
            f"Burst {burst}: some completions were None"
        )

        await asyncio.sleep(0.5)
class
MultinodeInternalLBServerManager
:
"""Manages multi-node data parallel vLLM server instances for internal
load balancer testing using --headless mode."""
...
...
@@ -108,6 +186,7 @@ class MultinodeInternalLBServerManager:
auto_port
=
False
,
env_dict
=
{
"VLLM_SERVER_DEV_MODE"
:
"1"
,
**
ROCM_ENV_OVERRIDES
,
current_platform
.
device_control_env_var
:
","
.
join
(
str
(
current_platform
.
device_id_to_physical_device_id
(
i
))
for
i
in
range
(
r
,
r
+
gpus_per_node
)
...
...
@@ -229,6 +308,7 @@ class APIOnlyServerManager:
auto_port
=
False
,
env_dict
=
{
"VLLM_SERVER_DEV_MODE"
:
"1"
,
**
ROCM_ENV_OVERRIDES
,
# No GPUs needed for API-only server
},
)
...
...
@@ -249,10 +329,11 @@ class APIOnlyServerManager:
engines_server_args
,
auto_port
=
False
,
env_dict
=
{
**
ROCM_ENV_OVERRIDES
,
current_platform
.
device_control_env_var
:
","
.
join
(
str
(
current_platform
.
device_id_to_physical_device_id
(
i
))
for
i
in
range
(
self
.
dp_size
*
self
.
tp_size
)
)
)
,
},
)
server
.
__enter__
()
...
...
@@ -395,58 +476,15 @@ async def test_multinode_dp_completion(
servers
:
list
[
tuple
[
RemoteOpenAIServer
,
list
[
str
]]],
model_name
:
str
,
)
->
None
:
async
def
make_request
():
completion
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
"Hello, my name is"
,
max_tokens
=
5
,
temperature
=
1.0
)
assert
completion
.
id
is
not
None
assert
completion
.
choices
is
not
None
and
len
(
completion
.
choices
)
==
1
choice
=
completion
.
choices
[
0
]
# The exact number of tokens can vary slightly with temperature=1.0,
# so we check for a reasonable minimum length.
assert
len
(
choice
.
text
)
>=
1
# Finish reason might not always be 'length' if the model finishes early
# or due to other reasons, especially with high temperature.
# So, we'll accept 'length' or 'stop'.
assert
choice
.
finish_reason
in
(
"length"
,
"stop"
)
# Token counts can also vary, so we check they are positive.
assert
completion
.
usage
.
completion_tokens
>
0
assert
completion
.
usage
.
prompt_tokens
>
0
assert
completion
.
usage
.
total_tokens
>
0
return
completion
# Test single request
result
=
await
make_
request
(
)
result
=
await
_
make_
completion_request
(
client
,
model_name
)
assert
result
is
not
None
print
(
"Multi-node internal LB handled single completion request successfully"
)
await
asyncio
.
sleep
(
0.5
)
# Send multiple requests - internal LB should distribute across DP ranks
num_requests
=
200
all_tasks
=
[]
for
_
in
range
(
num_requests
):
all_tasks
.
append
(
asyncio
.
create_task
(
make_request
()))
await
asyncio
.
sleep
(
0.01
)
results
=
await
asyncio
.
gather
(
*
all_tasks
)
assert
len
(
results
)
==
num_requests
assert
all
(
completion
is
not
None
for
completion
in
results
)
await
asyncio
.
sleep
(
0.5
)
# Second burst of requests
all_tasks
=
[]
for
_
in
range
(
num_requests
):
all_tasks
.
append
(
asyncio
.
create_task
(
make_request
()))
await
asyncio
.
sleep
(
0.01
)
results
=
await
asyncio
.
gather
(
*
all_tasks
)
assert
len
(
results
)
==
num_requests
assert
all
(
completion
is
not
None
for
completion
in
results
)
# Send multiple bursts - internal LB should distribute across DP ranks
await
_run_request_bursts
(
client
,
model_name
)
_
,
server_args
=
servers
[
0
]
api_server_count
=
(
...
...
@@ -570,59 +608,16 @@ async def test_api_only_multinode_dp_completion(
)
->
None
:
"""Test API-only server with all engines on separate headless server."""
async
def
make_request
():
completion
=
await
api_only_client
.
completions
.
create
(
model
=
model_name
,
prompt
=
"Hello, my name is"
,
max_tokens
=
5
,
temperature
=
1.0
)
assert
completion
.
id
is
not
None
assert
completion
.
choices
is
not
None
and
len
(
completion
.
choices
)
==
1
choice
=
completion
.
choices
[
0
]
# The exact number of tokens can vary slightly with temperature=1.0,
# so we check for a reasonable minimum length.
assert
len
(
choice
.
text
)
>=
1
# Finish reason might not always be 'length' if the model finishes
# early or due to other reasons, especially with high temperature.
# So, we'll accept 'length' or 'stop'.
assert
choice
.
finish_reason
in
(
"length"
,
"stop"
)
# Token counts can also vary, so we check they are positive.
assert
completion
.
usage
.
completion_tokens
>
0
assert
completion
.
usage
.
prompt_tokens
>
0
assert
completion
.
usage
.
total_tokens
>
0
return
completion
# Test single request
result
=
await
make_
request
(
)
result
=
await
_
make_
completion_request
(
api_only_client
,
model_name
)
assert
result
is
not
None
print
(
"API-only server handled single completion request successfully"
)
await
asyncio
.
sleep
(
0.5
)
# Send multiple requests - should be distributed across engines on
# Send multiple bursts - should be distributed across engines on
# headless server
num_requests
=
200
all_tasks
=
[]
for
_
in
range
(
num_requests
):
all_tasks
.
append
(
asyncio
.
create_task
(
make_request
()))
await
asyncio
.
sleep
(
0.01
)
results
=
await
asyncio
.
gather
(
*
all_tasks
)
assert
len
(
results
)
==
num_requests
assert
all
(
completion
is
not
None
for
completion
in
results
)
await
asyncio
.
sleep
(
0.5
)
# Second burst of requests
all_tasks
=
[]
for
_
in
range
(
num_requests
):
all_tasks
.
append
(
asyncio
.
create_task
(
make_request
()))
await
asyncio
.
sleep
(
0.01
)
results
=
await
asyncio
.
gather
(
*
all_tasks
)
assert
len
(
results
)
==
num_requests
assert
all
(
completion
is
not
None
for
completion
in
results
)
await
_run_request_bursts
(
api_only_client
,
model_name
)
api_server
,
api_server_args
=
api_only_servers
[
0
]
api_server_count
=
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment