Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
e38e96a3
Unverified
Commit
e38e96a3
authored
Jul 25, 2025
by
Nick Hill
Committed by
GitHub
Jul 25, 2025
Browse files
[Tests] Harden DP tests (#21508)
Signed-off-by:
Nick Hill
<
nhill@redhat.com
>
parent
40d86ee4
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
82 additions
and
55 deletions
+82
-55
tests/v1/test_external_lb_dp.py
tests/v1/test_external_lb_dp.py
+5
-4
tests/v1/test_hybrid_lb_dp.py
tests/v1/test_hybrid_lb_dp.py
+29
-31
tests/v1/test_internal_lb_dp.py
tests/v1/test_internal_lb_dp.py
+48
-20
No files found.
tests/v1/test_external_lb_dp.py
View file @
e38e96a3
...
@@ -11,7 +11,7 @@ import pytest
...
@@ -11,7 +11,7 @@ import pytest
import
pytest_asyncio
import
pytest_asyncio
from
tests.utils
import
RemoteOpenAIServer
from
tests.utils
import
RemoteOpenAIServer
from
vllm.platforms
import
Platform
from
vllm.platforms
import
current_platform
MODEL_NAME
=
"ibm-research/PowerMoE-3b"
MODEL_NAME
=
"ibm-research/PowerMoE-3b"
...
@@ -70,10 +70,11 @@ class ExternalLBServerManager:
...
@@ -70,10 +70,11 @@ class ExternalLBServerManager:
sargs
,
sargs
,
auto_port
=
False
,
auto_port
=
False
,
env_dict
=
{
env_dict
=
{
"CUDA_VISIBLE_DEVICES"
:
current_platform
.
device_control_env_var
:
","
.
join
(
","
.
join
(
str
(
Platform
.
device_id_to_physical_device_id
(
str
(
i
))
current_platform
.
device_id_to_physical_device_id
(
i
))
for
i
in
range
(
r
*
TP_SIZE
,
(
r
+
1
)
*
TP_SIZE
))
for
i
in
range
(
r
*
TP_SIZE
,
(
r
+
1
)
*
TP_SIZE
))
})
})
server
.
__enter__
()
server
.
__enter__
()
...
...
tests/v1/test_hybrid_lb_dp.py
View file @
e38e96a3
...
@@ -12,7 +12,7 @@ import pytest_asyncio
...
@@ -12,7 +12,7 @@ import pytest_asyncio
from
tests.utils
import
RemoteOpenAIServer
from
tests.utils
import
RemoteOpenAIServer
from
tests.v1.test_utils
import
check_request_balancing
from
tests.v1.test_utils
import
check_request_balancing
from
vllm.platforms
import
Platform
from
vllm.platforms
import
current_platform
MODEL_NAME
=
"ibm-research/PowerMoE-3b"
MODEL_NAME
=
"ibm-research/PowerMoE-3b"
...
@@ -92,10 +92,12 @@ class HybridLBServerManager:
...
@@ -92,10 +92,12 @@ class HybridLBServerManager:
sargs
,
sargs
,
auto_port
=
False
,
auto_port
=
False
,
env_dict
=
{
env_dict
=
{
"CUDA_VISIBLE_DEVICES"
:
current_platform
.
device_control_env_var
:
","
.
join
(
","
.
join
(
str
(
Platform
.
device_id_to_physical_device_id
(
str
(
i
))
for
i
in
range
(
gpu_start
,
gpu_end
))
current_platform
.
device_id_to_physical_device_id
(
i
))
for
i
in
range
(
gpu_start
,
gpu_end
))
})
})
server
.
__enter__
()
server
.
__enter__
()
print
(
f
"Hybrid LB node
{
node
}
started successfully with "
print
(
f
"Hybrid LB node
{
node
}
started successfully with "
...
@@ -180,7 +182,7 @@ async def test_hybrid_lb_completion(clients: list[openai.AsyncOpenAI],
...
@@ -180,7 +182,7 @@ async def test_hybrid_lb_completion(clients: list[openai.AsyncOpenAI],
completion
=
await
client
.
completions
.
create
(
completion
=
await
client
.
completions
.
create
(
model
=
model_name
,
model
=
model_name
,
prompt
=
"Hello, my name is"
,
prompt
=
"Hello, my name is"
,
max_tokens
=
10
,
max_tokens
=
5
,
temperature
=
1.0
)
temperature
=
1.0
)
assert
completion
.
id
is
not
None
assert
completion
.
id
is
not
None
...
@@ -212,27 +214,28 @@ async def test_hybrid_lb_completion(clients: list[openai.AsyncOpenAI],
...
@@ -212,27 +214,28 @@ async def test_hybrid_lb_completion(clients: list[openai.AsyncOpenAI],
await
asyncio
.
sleep
(
0.5
)
await
asyncio
.
sleep
(
0.5
)
# Send requests to all nodes - each should balance within its local DP ranks
# Send requests to all nodes - each should balance within its local DP ranks
num_requests_per_node
=
25
# Total 50 requests across 2 nodes
num_requests
=
200
# Total 200 requests across 2 nodes
all_tasks
=
[]
all_tasks
=
[]
for
i
in
range
(
num_requests
):
for
i
,
client
in
enumerate
(
clients
)
:
client
=
clients
[
i
%
len
(
clients
)
]
tasks
=
[
make_request
(
client
)
for
_
in
range
(
num_requests_per_node
)]
all_tasks
.
append
(
asyncio
.
create_task
(
make_request
(
client
)))
all_tasks
.
extend
(
tasks
)
await
asyncio
.
sleep
(
0.01
)
results
=
await
asyncio
.
gather
(
*
all_tasks
)
results
=
await
asyncio
.
gather
(
*
all_tasks
)
assert
len
(
results
)
==
num_requests_per_node
*
len
(
clients
)
assert
len
(
results
)
==
num_requests
assert
all
(
completion
is
not
None
for
completion
in
results
)
assert
all
(
completion
is
not
None
for
completion
in
results
)
await
asyncio
.
sleep
(
0.5
)
await
asyncio
.
sleep
(
0.5
)
# Second burst of requests
# Second burst of requests
all_tasks
=
[]
all_tasks
=
[]
for
i
,
client
in
enumerate
(
clients
):
for
i
in
range
(
num_requests
):
tasks
=
[
make_request
(
client
)
for
_
in
range
(
num_requests_per_node
)]
client
=
clients
[
i
%
len
(
clients
)]
all_tasks
.
extend
(
tasks
)
all_tasks
.
append
(
asyncio
.
create_task
(
make_request
(
client
)))
await
asyncio
.
sleep
(
0.01
)
results
=
await
asyncio
.
gather
(
*
all_tasks
)
results
=
await
asyncio
.
gather
(
*
all_tasks
)
assert
len
(
results
)
==
num_requests_per_node
*
len
(
clients
)
assert
len
(
results
)
==
num_requests
assert
all
(
completion
is
not
None
for
completion
in
results
)
assert
all
(
completion
is
not
None
for
completion
in
results
)
_
,
server_args
=
servers
[
0
]
_
,
server_args
=
servers
[
0
]
...
@@ -309,33 +312,28 @@ async def test_hybrid_lb_completion_streaming(clients: list[
...
@@ -309,33 +312,28 @@ async def test_hybrid_lb_completion_streaming(clients: list[
await
asyncio
.
sleep
(
0.5
)
await
asyncio
.
sleep
(
0.5
)
# Send streaming requests to all nodes
# Send streaming requests to all nodes
num_requests_per_node
=
25
# Total 50 requests across 2 nodes
num_requests
=
200
# Total 200 requests across 2 nodes
all_tasks
=
[]
all_tasks
=
[]
for
i
in
range
(
num_requests
):
for
i
,
client
in
enumerate
(
clients
):
client
=
clients
[
i
%
len
(
clients
)]
tasks
=
[
all_tasks
.
append
(
asyncio
.
create_task
(
make_streaming_request
(
client
)))
make_streaming_request
(
client
)
await
asyncio
.
sleep
(
0.01
)
for
_
in
range
(
num_requests_per_node
)
]
all_tasks
.
extend
(
tasks
)
results
=
await
asyncio
.
gather
(
*
all_tasks
)
results
=
await
asyncio
.
gather
(
*
all_tasks
)
assert
len
(
results
)
==
num_requests_per_node
*
len
(
clients
)
assert
len
(
results
)
==
num_requests
assert
all
(
results
),
"Not all streaming requests completed successfully."
assert
all
(
results
),
"Not all streaming requests completed successfully."
await
asyncio
.
sleep
(
0.5
)
await
asyncio
.
sleep
(
0.5
)
# Second burst of streaming requests
# Second burst of streaming requests
all_tasks
=
[]
all_tasks
=
[]
for
i
,
client
in
enumerate
(
clients
):
for
i
in
range
(
num_requests
):
tasks
=
[
client
=
clients
[
i
%
len
(
clients
)]
make_streaming_request
(
client
)
all_tasks
.
append
(
asyncio
.
create_task
(
make_streaming_request
(
client
)))
for
_
in
range
(
num_requests_per_node
)
await
asyncio
.
sleep
(
0.01
)
]
all_tasks
.
extend
(
tasks
)
results
=
await
asyncio
.
gather
(
*
all_tasks
)
results
=
await
asyncio
.
gather
(
*
all_tasks
)
assert
len
(
results
)
==
num_requests_per_node
*
len
(
clients
)
assert
len
(
results
)
==
num_requests
assert
all
(
results
),
"Not all streaming requests completed successfully."
assert
all
(
results
),
"Not all streaming requests completed successfully."
_
,
server_args
=
servers
[
0
]
_
,
server_args
=
servers
[
0
]
...
...
tests/v1/test_internal_lb_dp.py
View file @
e38e96a3
...
@@ -11,7 +11,7 @@ import pytest_asyncio
...
@@ -11,7 +11,7 @@ import pytest_asyncio
from
tests.utils
import
RemoteOpenAIServer
from
tests.utils
import
RemoteOpenAIServer
from
tests.v1.test_utils
import
check_request_balancing
from
tests.v1.test_utils
import
check_request_balancing
from
vllm.platforms
import
Platform
from
vllm.platforms
import
current_platform
MODEL_NAME
=
"ibm-research/PowerMoE-3b"
MODEL_NAME
=
"ibm-research/PowerMoE-3b"
...
@@ -96,10 +96,12 @@ class MultinodeInternalLBServerManager:
...
@@ -96,10 +96,12 @@ class MultinodeInternalLBServerManager:
sargs
,
sargs
,
auto_port
=
False
,
auto_port
=
False
,
env_dict
=
{
env_dict
=
{
"CUDA_VISIBLE_DEVICES"
:
current_platform
.
device_control_env_var
:
","
.
join
(
","
.
join
(
str
(
Platform
.
device_id_to_physical_device_id
(
str
(
i
))
for
i
in
range
(
r
,
r
+
gpus_per_node
))
current_platform
.
device_id_to_physical_device_id
(
i
))
for
i
in
range
(
r
,
r
+
gpus_per_node
))
})
})
server
.
__enter__
()
server
.
__enter__
()
if
r
==
0
:
if
r
==
0
:
...
@@ -219,9 +221,11 @@ class APIOnlyServerManager:
...
@@ -219,9 +221,11 @@ class APIOnlyServerManager:
engines_server_args
,
engines_server_args
,
auto_port
=
False
,
auto_port
=
False
,
env_dict
=
{
env_dict
=
{
"CUDA_VISIBLE_DEVICES"
:
current_platform
.
device_control_env_var
:
","
.
join
(
","
.
join
(
str
(
Platform
.
device_id_to_physical_device_id
(
i
))
str
(
current_platform
.
device_id_to_physical_device_id
(
i
))
for
i
in
range
(
self
.
dp_size
*
self
.
tp_size
))
for
i
in
range
(
self
.
dp_size
*
self
.
tp_size
))
})
})
server
.
__enter__
()
server
.
__enter__
()
...
@@ -330,7 +334,7 @@ async def test_multinode_dp_completion(client: openai.AsyncOpenAI,
...
@@ -330,7 +334,7 @@ async def test_multinode_dp_completion(client: openai.AsyncOpenAI,
completion
=
await
client
.
completions
.
create
(
completion
=
await
client
.
completions
.
create
(
model
=
model_name
,
model
=
model_name
,
prompt
=
"Hello, my name is"
,
prompt
=
"Hello, my name is"
,
max_tokens
=
10
,
max_tokens
=
5
,
temperature
=
1.0
)
temperature
=
1.0
)
assert
completion
.
id
is
not
None
assert
completion
.
id
is
not
None
...
@@ -361,8 +365,11 @@ async def test_multinode_dp_completion(client: openai.AsyncOpenAI,
...
@@ -361,8 +365,11 @@ async def test_multinode_dp_completion(client: openai.AsyncOpenAI,
await
asyncio
.
sleep
(
0.5
)
await
asyncio
.
sleep
(
0.5
)
# Send multiple requests - internal LB should distribute across DP ranks
# Send multiple requests - internal LB should distribute across DP ranks
num_requests
=
50
num_requests
=
200
all_tasks
=
[
make_request
()
for
_
in
range
(
num_requests
)]
all_tasks
=
[]
for
_
in
range
(
num_requests
):
all_tasks
.
append
(
asyncio
.
create_task
(
make_request
()))
await
asyncio
.
sleep
(
0.01
)
results
=
await
asyncio
.
gather
(
*
all_tasks
)
results
=
await
asyncio
.
gather
(
*
all_tasks
)
assert
len
(
results
)
==
num_requests
assert
len
(
results
)
==
num_requests
...
@@ -371,7 +378,10 @@ async def test_multinode_dp_completion(client: openai.AsyncOpenAI,
...
@@ -371,7 +378,10 @@ async def test_multinode_dp_completion(client: openai.AsyncOpenAI,
await
asyncio
.
sleep
(
0.5
)
await
asyncio
.
sleep
(
0.5
)
# Second burst of requests
# Second burst of requests
all_tasks
=
[
make_request
()
for
_
in
range
(
num_requests
)]
all_tasks
=
[]
for
_
in
range
(
num_requests
):
all_tasks
.
append
(
asyncio
.
create_task
(
make_request
()))
await
asyncio
.
sleep
(
0.01
)
results
=
await
asyncio
.
gather
(
*
all_tasks
)
results
=
await
asyncio
.
gather
(
*
all_tasks
)
assert
len
(
results
)
==
num_requests
assert
len
(
results
)
==
num_requests
...
@@ -449,8 +459,11 @@ async def test_multinode_dp_completion_streaming(client: openai.AsyncOpenAI,
...
@@ -449,8 +459,11 @@ async def test_multinode_dp_completion_streaming(client: openai.AsyncOpenAI,
# Send multiple streaming requests - internal LB should distribute across
# Send multiple streaming requests - internal LB should distribute across
# DP ranks
# DP ranks
num_requests
=
50
num_requests
=
200
all_tasks
=
[
make_streaming_request
()
for
_
in
range
(
num_requests
)]
all_tasks
=
[]
for
_
in
range
(
num_requests
):
all_tasks
.
append
(
asyncio
.
create_task
(
make_streaming_request
()))
await
asyncio
.
sleep
(
0.01
)
results
=
await
asyncio
.
gather
(
*
all_tasks
)
results
=
await
asyncio
.
gather
(
*
all_tasks
)
assert
len
(
results
)
==
num_requests
assert
len
(
results
)
==
num_requests
...
@@ -459,7 +472,10 @@ async def test_multinode_dp_completion_streaming(client: openai.AsyncOpenAI,
...
@@ -459,7 +472,10 @@ async def test_multinode_dp_completion_streaming(client: openai.AsyncOpenAI,
await
asyncio
.
sleep
(
0.5
)
await
asyncio
.
sleep
(
0.5
)
# Second burst of streaming requests
# Second burst of streaming requests
all_tasks
=
[
make_streaming_request
()
for
_
in
range
(
num_requests
)]
all_tasks
=
[]
for
_
in
range
(
num_requests
):
all_tasks
.
append
(
asyncio
.
create_task
(
make_streaming_request
()))
await
asyncio
.
sleep
(
0.01
)
results
=
await
asyncio
.
gather
(
*
all_tasks
)
results
=
await
asyncio
.
gather
(
*
all_tasks
)
assert
len
(
results
)
==
num_requests
assert
len
(
results
)
==
num_requests
...
@@ -492,7 +508,7 @@ async def test_api_only_multinode_dp_completion(
...
@@ -492,7 +508,7 @@ async def test_api_only_multinode_dp_completion(
completion
=
await
api_only_client
.
completions
.
create
(
completion
=
await
api_only_client
.
completions
.
create
(
model
=
model_name
,
model
=
model_name
,
prompt
=
"Hello, my name is"
,
prompt
=
"Hello, my name is"
,
max_tokens
=
10
,
max_tokens
=
5
,
temperature
=
1.0
)
temperature
=
1.0
)
assert
completion
.
id
is
not
None
assert
completion
.
id
is
not
None
...
@@ -522,8 +538,11 @@ async def test_api_only_multinode_dp_completion(
...
@@ -522,8 +538,11 @@ async def test_api_only_multinode_dp_completion(
# Send multiple requests - should be distributed across engines on
# Send multiple requests - should be distributed across engines on
# headless server
# headless server
num_requests
=
50
num_requests
=
200
all_tasks
=
[
make_request
()
for
_
in
range
(
num_requests
)]
all_tasks
=
[]
for
_
in
range
(
num_requests
):
all_tasks
.
append
(
asyncio
.
create_task
(
make_request
()))
await
asyncio
.
sleep
(
0.01
)
results
=
await
asyncio
.
gather
(
*
all_tasks
)
results
=
await
asyncio
.
gather
(
*
all_tasks
)
assert
len
(
results
)
==
num_requests
assert
len
(
results
)
==
num_requests
...
@@ -532,7 +551,10 @@ async def test_api_only_multinode_dp_completion(
...
@@ -532,7 +551,10 @@ async def test_api_only_multinode_dp_completion(
await
asyncio
.
sleep
(
0.5
)
await
asyncio
.
sleep
(
0.5
)
# Second burst of requests
# Second burst of requests
all_tasks
=
[
make_request
()
for
_
in
range
(
num_requests
)]
all_tasks
=
[]
for
_
in
range
(
num_requests
):
all_tasks
.
append
(
asyncio
.
create_task
(
make_request
()))
await
asyncio
.
sleep
(
0.01
)
results
=
await
asyncio
.
gather
(
*
all_tasks
)
results
=
await
asyncio
.
gather
(
*
all_tasks
)
assert
len
(
results
)
==
num_requests
assert
len
(
results
)
==
num_requests
...
@@ -610,8 +632,11 @@ async def test_api_only_multinode_dp_completion_streaming(
...
@@ -610,8 +632,11 @@ async def test_api_only_multinode_dp_completion_streaming(
await
asyncio
.
sleep
(
0.5
)
await
asyncio
.
sleep
(
0.5
)
# Send multiple streaming requests - should be distributed across engines
# Send multiple streaming requests - should be distributed across engines
num_requests
=
50
num_requests
=
200
all_tasks
=
[
make_streaming_request
()
for
_
in
range
(
num_requests
)]
all_tasks
=
[]
for
_
in
range
(
num_requests
):
all_tasks
.
append
(
asyncio
.
create_task
(
make_streaming_request
()))
await
asyncio
.
sleep
(
0.01
)
results
=
await
asyncio
.
gather
(
*
all_tasks
)
results
=
await
asyncio
.
gather
(
*
all_tasks
)
assert
len
(
results
)
==
num_requests
assert
len
(
results
)
==
num_requests
...
@@ -620,7 +645,10 @@ async def test_api_only_multinode_dp_completion_streaming(
...
@@ -620,7 +645,10 @@ async def test_api_only_multinode_dp_completion_streaming(
await
asyncio
.
sleep
(
0.5
)
await
asyncio
.
sleep
(
0.5
)
# Second burst of streaming requests
# Second burst of streaming requests
all_tasks
=
[
make_streaming_request
()
for
_
in
range
(
num_requests
)]
all_tasks
=
[]
for
_
in
range
(
num_requests
):
all_tasks
.
append
(
asyncio
.
create_task
(
make_streaming_request
()))
await
asyncio
.
sleep
(
0.01
)
results
=
await
asyncio
.
gather
(
*
all_tasks
)
results
=
await
asyncio
.
gather
(
*
all_tasks
)
assert
len
(
results
)
==
num_requests
assert
len
(
results
)
==
num_requests
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment