Commit e38e96a3 (unverified), authored by Nick Hill, committed by GitHub
Browse files

[Tests] Harden DP tests (#21508)


Signed-off-by: Nick Hill <nhill@redhat.com>
parent 40d86ee4
...@@ -11,7 +11,7 @@ import pytest ...@@ -11,7 +11,7 @@ import pytest
import pytest_asyncio import pytest_asyncio
from tests.utils import RemoteOpenAIServer from tests.utils import RemoteOpenAIServer
from vllm.platforms import Platform from vllm.platforms import current_platform
MODEL_NAME = "ibm-research/PowerMoE-3b" MODEL_NAME = "ibm-research/PowerMoE-3b"
...@@ -70,10 +70,11 @@ class ExternalLBServerManager: ...@@ -70,10 +70,11 @@ class ExternalLBServerManager:
sargs, sargs,
auto_port=False, auto_port=False,
env_dict={ env_dict={
"CUDA_VISIBLE_DEVICES": current_platform.device_control_env_var:
",".join( ",".join(
str(Platform.device_id_to_physical_device_id( str(
i)) current_platform.
device_id_to_physical_device_id(i))
for i in range(r * TP_SIZE, (r + 1) * TP_SIZE)) for i in range(r * TP_SIZE, (r + 1) * TP_SIZE))
}) })
server.__enter__() server.__enter__()
......
...@@ -12,7 +12,7 @@ import pytest_asyncio ...@@ -12,7 +12,7 @@ import pytest_asyncio
from tests.utils import RemoteOpenAIServer from tests.utils import RemoteOpenAIServer
from tests.v1.test_utils import check_request_balancing from tests.v1.test_utils import check_request_balancing
from vllm.platforms import Platform from vllm.platforms import current_platform
MODEL_NAME = "ibm-research/PowerMoE-3b" MODEL_NAME = "ibm-research/PowerMoE-3b"
...@@ -92,10 +92,12 @@ class HybridLBServerManager: ...@@ -92,10 +92,12 @@ class HybridLBServerManager:
sargs, sargs,
auto_port=False, auto_port=False,
env_dict={ env_dict={
"CUDA_VISIBLE_DEVICES": current_platform.device_control_env_var:
",".join( ",".join(
str(Platform.device_id_to_physical_device_id( str(
i)) for i in range(gpu_start, gpu_end)) current_platform.
device_id_to_physical_device_id(i))
for i in range(gpu_start, gpu_end))
}) })
server.__enter__() server.__enter__()
print(f"Hybrid LB node {node} started successfully with " print(f"Hybrid LB node {node} started successfully with "
...@@ -180,7 +182,7 @@ async def test_hybrid_lb_completion(clients: list[openai.AsyncOpenAI], ...@@ -180,7 +182,7 @@ async def test_hybrid_lb_completion(clients: list[openai.AsyncOpenAI],
completion = await client.completions.create( completion = await client.completions.create(
model=model_name, model=model_name,
prompt="Hello, my name is", prompt="Hello, my name is",
max_tokens=10, max_tokens=5,
temperature=1.0) temperature=1.0)
assert completion.id is not None assert completion.id is not None
...@@ -212,27 +214,28 @@ async def test_hybrid_lb_completion(clients: list[openai.AsyncOpenAI], ...@@ -212,27 +214,28 @@ async def test_hybrid_lb_completion(clients: list[openai.AsyncOpenAI],
await asyncio.sleep(0.5) await asyncio.sleep(0.5)
# Send requests to all nodes - each should balance within its local DP ranks # Send requests to all nodes - each should balance within its local DP ranks
num_requests_per_node = 25 # Total 50 requests across 2 nodes num_requests = 200 # Total 200 requests across 2 nodes
all_tasks = [] all_tasks = []
for i in range(num_requests):
for i, client in enumerate(clients): client = clients[i % len(clients)]
tasks = [make_request(client) for _ in range(num_requests_per_node)] all_tasks.append(asyncio.create_task(make_request(client)))
all_tasks.extend(tasks) await asyncio.sleep(0.01)
results = await asyncio.gather(*all_tasks) results = await asyncio.gather(*all_tasks)
assert len(results) == num_requests_per_node * len(clients) assert len(results) == num_requests
assert all(completion is not None for completion in results) assert all(completion is not None for completion in results)
await asyncio.sleep(0.5) await asyncio.sleep(0.5)
# Second burst of requests # Second burst of requests
all_tasks = [] all_tasks = []
for i, client in enumerate(clients): for i in range(num_requests):
tasks = [make_request(client) for _ in range(num_requests_per_node)] client = clients[i % len(clients)]
all_tasks.extend(tasks) all_tasks.append(asyncio.create_task(make_request(client)))
await asyncio.sleep(0.01)
results = await asyncio.gather(*all_tasks) results = await asyncio.gather(*all_tasks)
assert len(results) == num_requests_per_node * len(clients) assert len(results) == num_requests
assert all(completion is not None for completion in results) assert all(completion is not None for completion in results)
_, server_args = servers[0] _, server_args = servers[0]
...@@ -309,33 +312,28 @@ async def test_hybrid_lb_completion_streaming(clients: list[ ...@@ -309,33 +312,28 @@ async def test_hybrid_lb_completion_streaming(clients: list[
await asyncio.sleep(0.5) await asyncio.sleep(0.5)
# Send streaming requests to all nodes # Send streaming requests to all nodes
num_requests_per_node = 25 # Total 50 requests across 2 nodes num_requests = 200 # Total 200 requests across 2 nodes
all_tasks = [] all_tasks = []
for i in range(num_requests):
for i, client in enumerate(clients): client = clients[i % len(clients)]
tasks = [ all_tasks.append(asyncio.create_task(make_streaming_request(client)))
make_streaming_request(client) await asyncio.sleep(0.01)
for _ in range(num_requests_per_node)
]
all_tasks.extend(tasks)
results = await asyncio.gather(*all_tasks) results = await asyncio.gather(*all_tasks)
assert len(results) == num_requests_per_node * len(clients) assert len(results) == num_requests
assert all(results), "Not all streaming requests completed successfully." assert all(results), "Not all streaming requests completed successfully."
await asyncio.sleep(0.5) await asyncio.sleep(0.5)
# Second burst of streaming requests # Second burst of streaming requests
all_tasks = [] all_tasks = []
for i, client in enumerate(clients): for i in range(num_requests):
tasks = [ client = clients[i % len(clients)]
make_streaming_request(client) all_tasks.append(asyncio.create_task(make_streaming_request(client)))
for _ in range(num_requests_per_node) await asyncio.sleep(0.01)
]
all_tasks.extend(tasks)
results = await asyncio.gather(*all_tasks) results = await asyncio.gather(*all_tasks)
assert len(results) == num_requests_per_node * len(clients) assert len(results) == num_requests
assert all(results), "Not all streaming requests completed successfully." assert all(results), "Not all streaming requests completed successfully."
_, server_args = servers[0] _, server_args = servers[0]
......
...@@ -11,7 +11,7 @@ import pytest_asyncio ...@@ -11,7 +11,7 @@ import pytest_asyncio
from tests.utils import RemoteOpenAIServer from tests.utils import RemoteOpenAIServer
from tests.v1.test_utils import check_request_balancing from tests.v1.test_utils import check_request_balancing
from vllm.platforms import Platform from vllm.platforms import current_platform
MODEL_NAME = "ibm-research/PowerMoE-3b" MODEL_NAME = "ibm-research/PowerMoE-3b"
...@@ -96,10 +96,12 @@ class MultinodeInternalLBServerManager: ...@@ -96,10 +96,12 @@ class MultinodeInternalLBServerManager:
sargs, sargs,
auto_port=False, auto_port=False,
env_dict={ env_dict={
"CUDA_VISIBLE_DEVICES": current_platform.device_control_env_var:
",".join( ",".join(
str(Platform.device_id_to_physical_device_id( str(
i)) for i in range(r, r + gpus_per_node)) current_platform.
device_id_to_physical_device_id(i))
for i in range(r, r + gpus_per_node))
}) })
server.__enter__() server.__enter__()
if r == 0: if r == 0:
...@@ -219,9 +221,11 @@ class APIOnlyServerManager: ...@@ -219,9 +221,11 @@ class APIOnlyServerManager:
engines_server_args, engines_server_args,
auto_port=False, auto_port=False,
env_dict={ env_dict={
"CUDA_VISIBLE_DEVICES": current_platform.device_control_env_var:
",".join( ",".join(
str(Platform.device_id_to_physical_device_id(i)) str(
current_platform.
device_id_to_physical_device_id(i))
for i in range(self.dp_size * self.tp_size)) for i in range(self.dp_size * self.tp_size))
}) })
server.__enter__() server.__enter__()
...@@ -330,7 +334,7 @@ async def test_multinode_dp_completion(client: openai.AsyncOpenAI, ...@@ -330,7 +334,7 @@ async def test_multinode_dp_completion(client: openai.AsyncOpenAI,
completion = await client.completions.create( completion = await client.completions.create(
model=model_name, model=model_name,
prompt="Hello, my name is", prompt="Hello, my name is",
max_tokens=10, max_tokens=5,
temperature=1.0) temperature=1.0)
assert completion.id is not None assert completion.id is not None
...@@ -361,8 +365,11 @@ async def test_multinode_dp_completion(client: openai.AsyncOpenAI, ...@@ -361,8 +365,11 @@ async def test_multinode_dp_completion(client: openai.AsyncOpenAI,
await asyncio.sleep(0.5) await asyncio.sleep(0.5)
# Send multiple requests - internal LB should distribute across DP ranks # Send multiple requests - internal LB should distribute across DP ranks
num_requests = 50 num_requests = 200
all_tasks = [make_request() for _ in range(num_requests)] all_tasks = []
for _ in range(num_requests):
all_tasks.append(asyncio.create_task(make_request()))
await asyncio.sleep(0.01)
results = await asyncio.gather(*all_tasks) results = await asyncio.gather(*all_tasks)
assert len(results) == num_requests assert len(results) == num_requests
...@@ -371,7 +378,10 @@ async def test_multinode_dp_completion(client: openai.AsyncOpenAI, ...@@ -371,7 +378,10 @@ async def test_multinode_dp_completion(client: openai.AsyncOpenAI,
await asyncio.sleep(0.5) await asyncio.sleep(0.5)
# Second burst of requests # Second burst of requests
all_tasks = [make_request() for _ in range(num_requests)] all_tasks = []
for _ in range(num_requests):
all_tasks.append(asyncio.create_task(make_request()))
await asyncio.sleep(0.01)
results = await asyncio.gather(*all_tasks) results = await asyncio.gather(*all_tasks)
assert len(results) == num_requests assert len(results) == num_requests
...@@ -449,8 +459,11 @@ async def test_multinode_dp_completion_streaming(client: openai.AsyncOpenAI, ...@@ -449,8 +459,11 @@ async def test_multinode_dp_completion_streaming(client: openai.AsyncOpenAI,
# Send multiple streaming requests - internal LB should distribute across # Send multiple streaming requests - internal LB should distribute across
# DP ranks # DP ranks
num_requests = 50 num_requests = 200
all_tasks = [make_streaming_request() for _ in range(num_requests)] all_tasks = []
for _ in range(num_requests):
all_tasks.append(asyncio.create_task(make_streaming_request()))
await asyncio.sleep(0.01)
results = await asyncio.gather(*all_tasks) results = await asyncio.gather(*all_tasks)
assert len(results) == num_requests assert len(results) == num_requests
...@@ -459,7 +472,10 @@ async def test_multinode_dp_completion_streaming(client: openai.AsyncOpenAI, ...@@ -459,7 +472,10 @@ async def test_multinode_dp_completion_streaming(client: openai.AsyncOpenAI,
await asyncio.sleep(0.5) await asyncio.sleep(0.5)
# Second burst of streaming requests # Second burst of streaming requests
all_tasks = [make_streaming_request() for _ in range(num_requests)] all_tasks = []
for _ in range(num_requests):
all_tasks.append(asyncio.create_task(make_streaming_request()))
await asyncio.sleep(0.01)
results = await asyncio.gather(*all_tasks) results = await asyncio.gather(*all_tasks)
assert len(results) == num_requests assert len(results) == num_requests
...@@ -492,7 +508,7 @@ async def test_api_only_multinode_dp_completion( ...@@ -492,7 +508,7 @@ async def test_api_only_multinode_dp_completion(
completion = await api_only_client.completions.create( completion = await api_only_client.completions.create(
model=model_name, model=model_name,
prompt="Hello, my name is", prompt="Hello, my name is",
max_tokens=10, max_tokens=5,
temperature=1.0) temperature=1.0)
assert completion.id is not None assert completion.id is not None
...@@ -522,8 +538,11 @@ async def test_api_only_multinode_dp_completion( ...@@ -522,8 +538,11 @@ async def test_api_only_multinode_dp_completion(
# Send multiple requests - should be distributed across engines on # Send multiple requests - should be distributed across engines on
# headless server # headless server
num_requests = 50 num_requests = 200
all_tasks = [make_request() for _ in range(num_requests)] all_tasks = []
for _ in range(num_requests):
all_tasks.append(asyncio.create_task(make_request()))
await asyncio.sleep(0.01)
results = await asyncio.gather(*all_tasks) results = await asyncio.gather(*all_tasks)
assert len(results) == num_requests assert len(results) == num_requests
...@@ -532,7 +551,10 @@ async def test_api_only_multinode_dp_completion( ...@@ -532,7 +551,10 @@ async def test_api_only_multinode_dp_completion(
await asyncio.sleep(0.5) await asyncio.sleep(0.5)
# Second burst of requests # Second burst of requests
all_tasks = [make_request() for _ in range(num_requests)] all_tasks = []
for _ in range(num_requests):
all_tasks.append(asyncio.create_task(make_request()))
await asyncio.sleep(0.01)
results = await asyncio.gather(*all_tasks) results = await asyncio.gather(*all_tasks)
assert len(results) == num_requests assert len(results) == num_requests
...@@ -610,8 +632,11 @@ async def test_api_only_multinode_dp_completion_streaming( ...@@ -610,8 +632,11 @@ async def test_api_only_multinode_dp_completion_streaming(
await asyncio.sleep(0.5) await asyncio.sleep(0.5)
# Send multiple streaming requests - should be distributed across engines # Send multiple streaming requests - should be distributed across engines
num_requests = 50 num_requests = 200
all_tasks = [make_streaming_request() for _ in range(num_requests)] all_tasks = []
for _ in range(num_requests):
all_tasks.append(asyncio.create_task(make_streaming_request()))
await asyncio.sleep(0.01)
results = await asyncio.gather(*all_tasks) results = await asyncio.gather(*all_tasks)
assert len(results) == num_requests assert len(results) == num_requests
...@@ -620,7 +645,10 @@ async def test_api_only_multinode_dp_completion_streaming( ...@@ -620,7 +645,10 @@ async def test_api_only_multinode_dp_completion_streaming(
await asyncio.sleep(0.5) await asyncio.sleep(0.5)
# Second burst of streaming requests # Second burst of streaming requests
all_tasks = [make_streaming_request() for _ in range(num_requests)] all_tasks = []
for _ in range(num_requests):
all_tasks.append(asyncio.create_task(make_streaming_request()))
await asyncio.sleep(0.01)
results = await asyncio.gather(*all_tasks) results = await asyncio.gather(*all_tasks)
assert len(results) == num_requests assert len(results) == num_requests
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment