"examples/vscode:/vscode.git/clone" did not exist on "0f2fa9282858d7cc422a0f1bdd08684e5e703d6a"
Commit 711aa9d5 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.10.0' into v0.10.0-dev

parents 751c492c 6d8d0a24
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import pytest_asyncio
from tests.utils import RemoteOpenAIServer
# Use a small reasoning model to test the responses API.
MODEL_NAME = "Qwen/Qwen3-0.6B"
@pytest.fixture(scope="module")
def default_server_args():
    """Return the CLI flags used to launch the module-scoped test server."""
    context_flags = ["--max-model-len", "8192"]
    # Enabled for faster startup.
    startup_flags = ["--enforce-eager"]
    reasoning_flags = ["--reasoning-parser", "deepseek_r1"]
    return [*context_flags, *startup_flags, *reasoning_flags]
@pytest.fixture(scope="module")
def server(default_server_args):
    """Yield a RemoteOpenAIServer for MODEL_NAME, shared across the module."""
    server_context = RemoteOpenAIServer(MODEL_NAME, default_server_args)
    with server_context as running_server:
        yield running_server
@pytest_asyncio.fixture
async def client(server):
    """Yield an async client obtained from the test server."""
    client_context = server.get_async_client()
    async with client_context as api_client:
        yield api_client
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import openai # use the official client for correctness check
import pytest
@pytest.mark.asyncio
async def test_simple_input(client: openai.AsyncOpenAI):
    """A plain string input yields a final message and leading reasoning."""
    response = await client.responses.create(input="What is 13 * 24?")
    print(response)

    items = response.output
    final_item = items[-1]
    # The last output item must be the message carrying the answer.
    assert final_item.type == "message"
    assert "312" in final_item.content[0].text

    leading_item = items[0]
    # The first output item must be non-empty reasoning.
    assert leading_item.type == "reasoning"
    assert leading_item.text != ""
@pytest.mark.asyncio
async def test_instructions(client: openai.AsyncOpenAI):
    """The `instructions` field influences the final answer text."""
    request = {
        "instructions": "Finish the answer with QED.",
        "input": "What is 13 * 24?",
    }
    response = await client.responses.create(**request)
    print(response)

    answer = response.output[-1].content[0].text
    assert "312" in answer
    assert "QED" in answer
@pytest.mark.asyncio
async def test_chat(client: openai.AsyncOpenAI):
    """A multi-turn message list is accepted as `input`."""
    turns = [
        ("system", "Finish the answer with QED."),
        ("user", "What is 5 * 3?"),
        ("assistant", "15. QED."),
        ("user", "Multiply the result by 2."),
    ]
    conversation = [{"role": role, "content": text} for role, text in turns]

    response = await client.responses.create(input=conversation)
    print(response)

    answer = response.output[-1].content[0].text
    assert "30" in answer
    assert "QED" in answer
@pytest.mark.asyncio
async def test_chat_with_input_type(client: openai.AsyncOpenAI):
    """Message content given as typed `input_text` parts is accepted."""
    text_part = {"type": "input_text", "text": "Hello!"}
    user_message = {"role": "user", "content": [text_part]}

    response = await client.responses.create(input=[user_message])
    print(response)
    assert response.status == "completed"
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import openai
import pytest
import pytest_asyncio
from tests.utils import RemoteOpenAIServer
from vllm.multimodal.utils import encode_image_base64, fetch_image
# Use a small vision model for testing
MODEL_NAME = "Qwen/Qwen2.5-VL-3B-Instruct"
MAXIMUM_IMAGES = 2
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
TEST_IMAGE_URLS = [
"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
"https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
"https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
"https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
]
@pytest.fixture(scope="module")
def default_image_server_args():
    """Return the CLI flags used to launch the vision-model test server."""
    # Per-prompt image cap, serialized as JSON for the CLI flag.
    image_limit = json.dumps({"image": MAXIMUM_IMAGES})
    flags = ["--enforce-eager"]
    flags += ["--max-model-len", "6000"]
    flags += ["--max-num-seqs", "128"]
    flags += ["--limit-mm-per-prompt", image_limit]
    return flags
@pytest.fixture(scope="module")
def image_server(default_image_server_args):
    """Yield a RemoteOpenAIServer for MODEL_NAME, shared across the module."""
    server_context = RemoteOpenAIServer(MODEL_NAME, default_image_server_args)
    with server_context as running_server:
        yield running_server
@pytest_asyncio.fixture
async def client(image_server):
    """Yield an async client obtained from the image test server."""
    client_context = image_server.get_async_client()
    async with client_context as api_client:
        yield api_client
@pytest.fixture(scope="session")
def base64_encoded_image() -> dict[str, str]:
    """Map every URL in TEST_IMAGE_URLS to its fetched, base64-encoded image."""
    encoded_by_url: dict[str, str] = {}
    for url in TEST_IMAGE_URLS:
        encoded_by_url[url] = encode_image_base64(fetch_image(url))
    return encoded_by_url
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_single_chat_session_image(client: openai.AsyncOpenAI,
                                         model_name: str, image_url: str):
    """An image passed by URL plus a text question yields non-empty output."""
    image_part = {
        "type": "input_image",
        "image_url": image_url,
        "detail": "auto",
    }
    question_part = {"type": "input_text", "text": "What's in this image?"}
    messages = [{"role": "user", "content": [image_part, question_part]}]

    # The image is referenced by its URL.
    response = await client.responses.create(model=model_name, input=messages)
    assert len(response.output_text) > 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_single_chat_session_image_base64encoded(
    client: openai.AsyncOpenAI,
    model_name: str,
    image_url: str,
    base64_encoded_image: dict[str, str],
):
    """An image passed as a base64 data URL yields non-empty output."""
    # The image is inlined as a data URL built from the session fixture.
    data_url = f"data:image/jpeg;base64,{base64_encoded_image[image_url]}"
    image_part = {
        "type": "input_image",
        "image_url": data_url,
        "detail": "auto",
    }
    question_part = {"type": "input_text", "text": "What's in this image?"}
    messages = [{"role": "user", "content": [image_part, question_part]}]

    response = await client.responses.create(model=model_name, input=messages)
    assert len(response.output_text) > 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize(
    "image_urls",
    [TEST_IMAGE_URLS[:i] for i in range(2, len(TEST_IMAGE_URLS))])
async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
                                 image_urls: list[str]):
    """Several images work up to MAXIMUM_IMAGES and are rejected beyond it."""
    image_parts = [{
        "type": "input_image",
        "image_url": url,
        "detail": "auto",
    } for url in image_urls]
    question_part = {"type": "input_text", "text": "What's in this image?"}
    messages = [{"role": "user", "content": [*image_parts, question_part]}]

    if len(image_urls) <= MAXIMUM_IMAGES:
        # Within the limit: the request must simply succeed.
        response = await client.responses.create(model=model_name,
                                                 input=messages)
        assert len(response.output_text) > 0
        return

    # Over the limit: the multi-image request must be rejected.
    with pytest.raises(openai.BadRequestError):
        await client.responses.create(model=model_name, input=messages)

    # The server should still work afterwards.
    follow_up = [{
        "role": "user",
        "content": "What's the weather like in Paris today?",
    }]
    response = await client.responses.create(model=model_name,
                                             input=follow_up)
    assert len(response.output_text) > 0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import openai
import pytest
@pytest.mark.asyncio
async def test_store(client: openai.AsyncOpenAI):
    """Stored responses can be retrieved; `store=False` responses cannot."""
    # `store` is not passed here; by default it is True.
    stored = await client.responses.create(input="Hello!")
    assert stored.status == "completed"

    # The stored response can be fetched back by id.
    fetched = await client.responses.retrieve(stored.id)
    assert fetched.status == "completed"

    # With store=False the request itself still completes...
    unstored = await client.responses.create(input="Hello!", store=False)
    assert unstored.status == "completed"

    # ...but looking it up afterwards must fail.
    not_found = pytest.raises(openai.NotFoundError,
                              match="Response with id .* not found.")
    with not_found:
        await client.responses.retrieve(unstored.id)
@pytest.mark.asyncio
async def test_background(client: openai.AsyncOpenAI):
    """A background request starts queued and completes within the poll budget.

    The response is polled once per second, up to ``max_retries`` times,
    until it leaves the non-terminal states.
    """
    # NOTE: This query should be easy enough for the model to answer
    # within 10 seconds.
    response = await client.responses.create(
        input="Hello!",
        background=True,
    )
    assert response.status == "queued"

    # Keep polling while the response is still pending. "in_progress" is
    # also a non-terminal status; stopping on it would make the final
    # assertion fail even though the request is about to complete.
    pending_statuses = ("queued", "in_progress")
    max_retries = 10
    for _ in range(max_retries):
        await asyncio.sleep(1)
        response = await client.responses.retrieve(response.id)
        if response.status not in pending_statuses:
            break
    print(response)

    assert response.status == "completed"
@pytest.mark.asyncio
async def test_background_error(client: openai.AsyncOpenAI):
    """Combining `background=True` with `store=False` is a bad request."""
    rejected = pytest.raises(
        openai.BadRequestError,
        match="background can only be used when `store` is true")
    with rejected:
        await client.responses.create(
            input="What is 13 * 24?",
            background=True,
            store=False,
        )
@pytest.mark.asyncio
async def test_background_cancel(client: openai.AsyncOpenAI):
    """A queued background response can be cancelled and stays cancelled."""
    queued = await client.responses.create(
        input="Write a long story about a cat.",
        background=True,
    )
    assert queued.status == "queued"
    response_id = queued.id

    # Cancel the response before it is completed.
    # FIXME: This test can be flaky.
    await asyncio.sleep(0.5)
    cancelled = await client.responses.cancel(response_id)
    assert cancelled.status == "cancelled"

    # Make sure the response status remains unchanged.
    await asyncio.sleep(5)
    rechecked = await client.responses.retrieve(cancelled.id)
    assert rechecked.status == "cancelled"
@pytest.mark.asyncio
async def test_cancel_completed(client: openai.AsyncOpenAI):
    """Cancelling a synchronous (non-background) response is rejected."""
    finished = await client.responses.create(input="Hello")
    assert finished.status == "completed"

    rejected = pytest.raises(openai.BadRequestError,
                             match="Cannot cancel a synchronous response.")
    with rejected:
        await client.responses.cancel(finished.id)
@pytest.mark.asyncio
async def test_previous_response_id(client: openai.AsyncOpenAI):
    """A chain of `previous_response_id` links carries context forward."""
    introduction = await client.responses.create(
        instructions="You are tested on your ability to retrieve the correct "
        "information from the previous response.",
        input="Hello, my name is John.")

    correction = await client.responses.create(
        input="Actually, my name is not John. My real name is Mark.",
        previous_response_id=introduction.id,
    )

    question = await client.responses.create(
        input="What is my real name again? Answer in one word.",
        previous_response_id=correction.id,
    )
    print(question)

    # Only the corrected name may appear in the final answer.
    answer = question.output[-1].content[0].text
    assert "Mark" in answer
    assert "John" not in answer
@pytest.mark.asyncio
async def test_two_responses_with_same_prev_id(client: openai.AsyncOpenAI):
    """Two responses branching from one parent do not see each other."""
    parent = await client.responses.create(
        instructions="You are tested on your ability to retrieve the correct "
        "information from the previous response.",
        input="Hello, my name is John.")

    # Both follow-ups use the parent as the previous response. The calls
    # below only create coroutines; they are awaited further down.
    correction_call = client.responses.create(
        input="Actually, my name is not John. My name is Mark.",
        previous_response_id=parent.id,
    )
    question_call = client.responses.create(
        input="What is my name again? Answer in one word.",
        previous_response_id=parent.id,
    )

    await correction_call
    question = await question_call
    print(question)

    # The question branch never saw the correction, so "John" is expected.
    answer = question.output[-1].content[0].text
    assert "John" in answer
    assert "Mark" not in answer
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import openai
import pytest
from pydantic import BaseModel
@pytest.mark.asyncio
async def test_structured_output(client: openai.AsyncOpenAI):
    """A `json_schema` text format produces JSON matching the schema."""
    string_type = {"type": "string"}
    event_schema = {
        "type": "object",
        "properties": {
            "event_name": dict(string_type),
            "date": dict(string_type),
            "participants": {
                "type": "array",
                "items": dict(string_type),
            },
        },
        "required": ["event_name", "date", "participants"],
        "additionalProperties": False,
    }
    text_format = {
        "type": "json_schema",
        "name": "calendar_event",
        "schema": event_schema,
        "description": "A calendar event.",
        "strict": True,
    }
    messages = [
        {
            "role": "system",
            "content": "Extract the event information."
        },
        {
            "role": "user",
            "content": "Alice and Bob are going to a science fair on Friday.",
        },
    ]

    response = await client.responses.create(input=messages,
                                             text={"format": text_format})
    print(response)

    # NOTE: The JSON schema is applied to the output text, not reasoning.
    event = json.loads(response.output[-1].content[0].text)
    assert event["event_name"].lower() == "science fair"
    assert event["date"] == "Friday"
    assert event["participants"] == ["Alice", "Bob"]
@pytest.mark.asyncio
async def test_structured_output_with_parse(client: openai.AsyncOpenAI):
    """`responses.parse` returns the output parsed into a pydantic model."""

    class CalendarEvent(BaseModel):
        event_name: str
        date: str
        participants: list[str]

    response = await client.responses.parse(
        model=None,
        instructions="Extract the event information.",
        input="Alice and Bob are going to a science fair on Friday.",
        text_format=CalendarEvent,
    )
    print(response)

    # Parsing must have succeeded.
    parsed = response.output_parsed
    assert parsed is not None

    # The parsed fields must carry the expected values.
    assert parsed.event_name.lower() == "science fair"
    assert parsed.date == "Friday"
    assert parsed.participants == ["Alice", "Bob"]
...@@ -7,6 +7,7 @@ import openai # use the official client for correctness check ...@@ -7,6 +7,7 @@ import openai # use the official client for correctness check
import pytest import pytest
import pytest_asyncio import pytest_asyncio
import regex as re import regex as re
import requests
from openai import BadRequestError from openai import BadRequestError
from tests.utils import RemoteOpenAIServer from tests.utils import RemoteOpenAIServer
...@@ -26,7 +27,8 @@ def default_server_args(): ...@@ -26,7 +27,8 @@ def default_server_args():
"2048", "2048",
"--max-num-seqs", "--max-num-seqs",
"128", "128",
"--enforce-eager" "--enforce-eager",
"--enable-prompt-tokens-details",
] ]
...@@ -679,3 +681,17 @@ async def test_invalid_grammar(client: openai.AsyncOpenAI, model_name: str): ...@@ -679,3 +681,17 @@ async def test_invalid_grammar(client: openai.AsyncOpenAI, model_name: str):
prompt=prompt, prompt=prompt,
extra_body={"guided_grammar": invalid_simplified_sql_grammar}, extra_body={"guided_grammar": invalid_simplified_sql_grammar},
) )
@pytest.mark.asyncio
async def test_completion_with_empty_prompt_embeds(
        client: openai.AsyncOpenAI) -> None:
    """Test completion with empty prompt embeds."""
    # NOTE(review): assumes client.base_url ends with a slash (e.g.
    # http://localhost:8000/v1/), since the path is appended directly.
    endpoint = f"{client.base_url}completions"
    body: dict[str, list] = {"prompt_embeds": []}
    request_headers: dict[str, str] = {"Content-Type": "application/json"}

    response = requests.post(endpoint, headers=request_headers, json=body)
    status = response.status_code
    assert status == 200, (f"Expected status code 200, got {status}. ")
...@@ -2,136 +2,19 @@ ...@@ -2,136 +2,19 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio import asyncio
import os import os
import re
import openai # use the official client for correctness check import openai # use the official client for correctness check
import pytest import pytest
import pytest_asyncio import pytest_asyncio
import requests
from tests.utils import RemoteOpenAIServer from tests.utils import RemoteOpenAIServer
from tests.v1.test_utils import check_request_balancing
MODEL_NAME = "ibm-research/PowerMoE-3b" MODEL_NAME = "ibm-research/PowerMoE-3b"
DP_SIZE = os.getenv("DP_SIZE", "1") DP_SIZE = os.getenv("DP_SIZE", "1")
def get_prometheus_metrics(
        server: RemoteOpenAIServer) -> dict[str, dict[str, float]]:
    """Fetch and parse Prometheus metrics from the /metrics endpoint.

    A failure to fetch the endpoint is reported through ``pytest.fail``.
    Lines that are not a ``name[{labels}] value`` sample, or whose value
    cannot be parsed by ``float``, are skipped.

    Args:
        server: Server whose ``url_for("metrics")`` URL is scraped.

    Returns:
        Dict mapping metric names to their values, keyed by the raw label
        block including its braces ('' for samples without labels).
        For example: {"vllm:request_success_total": {
            '{engine="0"}': 5.0, '{engine="1"}': 3.0}
        }
    """
    # One pattern for both sample forms: the label block is optional. The
    # value is any non-space token so that NaN, +Inf, -Inf and upper-case
    # exponents (1E+06) are left to float() instead of being dropped.
    sample_pattern = re.compile(
        r'^([a-zA-Z_:][a-zA-Z0-9_:]*)(?:\{([^}]*)\})?\s+(\S+)$')

    try:
        response = requests.get(server.url_for("metrics"), timeout=10)
        response.raise_for_status()
    except Exception as e:
        # pytest.fail raises, so execution never continues past this point.
        pytest.fail(f"Failed to fetch Prometheus metrics: {e}")

    metrics: dict[str, dict[str, float]] = {}
    for line in response.text.split('\n'):
        line = line.strip()

        # Skip comments and empty lines
        if not line or line.startswith('#'):
            continue

        match = sample_pattern.match(line)
        if match is None:
            continue

        metric_name, labels_part, value_str = match.groups()
        try:
            value = float(value_str)
        except ValueError:
            continue

        # labels_part is None when the sample had no `{...}` block at all.
        label_key = '' if labels_part is None else f'{{{labels_part}}}'
        metrics.setdefault(metric_name, {})[label_key] = value

    return metrics
def get_engine_request_counts(
        metrics: dict[str, dict[str, float]]) -> dict[str, float]:
    """Sum `vllm:request_success_total` samples per `engine` label value.

    Samples whose label block has no `engine="..."` label are ignored.

    Returns:
        Dict mapping engine indices to request counts.
        For example: {"0": 15.0, "1": 12.0}
    """
    find_engine = re.compile(r'engine="([^"]*)"').search
    success_samples = metrics.get("vllm:request_success_total", {})

    per_engine: dict[str, float] = {}
    for label_block, sample_value in success_samples.items():
        hit = find_engine(label_block)
        if hit is None:
            continue
        engine = hit.group(1)
        # Several label sets may share one engine; accumulate them.
        per_engine[engine] = per_engine.get(engine, 0.0) + sample_value
    return per_engine
def check_request_balancing(server: RemoteOpenAIServer):
    """Check request balancing via Prometheus metrics if DP_SIZE > 1.

    Does nothing when DP_SIZE is 1 or less. Otherwise asserts that exactly
    DP_SIZE engines report a positive request count and that each engine's
    count exceeds ``total // (DP_SIZE + 1)``.

    Args:
        server: The RemoteOpenAIServer instance
    """
    dp_size = int(DP_SIZE)
    if dp_size <= 1:
        return

    # Metrics are read after all requests are completed.
    engine_counts = get_engine_request_counts(get_prometheus_metrics(server))

    # Every engine is expected to have served at least one request.
    engines_with_requests = []
    for engine, served in engine_counts.items():
        if served > 0:
            engines_with_requests.append(engine)
    assert len(engines_with_requests) == dp_size, (
        f"Expected requests to be distributed across multiple engines,"
        f" but only engine(s) {engines_with_requests} received "
        f"requests. Engine counts: {engine_counts}")

    # The load must be reasonably balanced: each engine has to exceed this
    # floor (no engine should handle all requests).
    floor = sum(engine_counts.values()) // (dp_size + 1)
    for served in engine_counts.values():
        assert served > floor, (f"requests are imbalanced: {engine_counts}")
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def default_server_args(): def default_server_args():
return [ return [
...@@ -217,7 +100,7 @@ async def test_single_completion(client: openai.AsyncOpenAI, ...@@ -217,7 +100,7 @@ async def test_single_completion(client: openai.AsyncOpenAI,
assert all(completion is not None for completion in results) assert all(completion is not None for completion in results)
# Check request balancing via Prometheus metrics if DP_SIZE > 1 # Check request balancing via Prometheus metrics if DP_SIZE > 1
check_request_balancing(server) check_request_balancing(server, int(DP_SIZE))
@pytest.mark.asyncio @pytest.mark.asyncio
...@@ -295,4 +178,4 @@ async def test_completion_streaming(client: openai.AsyncOpenAI, ...@@ -295,4 +178,4 @@ async def test_completion_streaming(client: openai.AsyncOpenAI,
assert all(results), "Not all streaming requests completed successfully." assert all(results), "Not all streaming requests completed successfully."
# Check request balancing via Prometheus metrics if DP_SIZE > 1 # Check request balancing via Prometheus metrics if DP_SIZE > 1
check_request_balancing(server) check_request_balancing(server, int(DP_SIZE))
...@@ -3,16 +3,10 @@ ...@@ -3,16 +3,10 @@
import filecmp import filecmp
import shutil import shutil
import tempfile import tempfile
from collections import defaultdict
from pathlib import Path from pathlib import Path
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig, VllmConfig from vllm.config import KVTransferConfig
from vllm.distributed.kv_transfer.kv_connector.factory import (
KVConnectorFactory)
from vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector import ( # noqa
SharedStorageConnector)
from vllm.v1.core.kv_cache_manager import KVCacheBlocks
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct" MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
...@@ -25,62 +19,6 @@ PROMPTS = [ ...@@ -25,62 +19,6 @@ PROMPTS = [
SAMPLING_PARAMS = SamplingParams(temperature=0, max_tokens=20) SAMPLING_PARAMS = SamplingParams(temperature=0, max_tokens=20)
class TestSharedStorageConnector(SharedStorageConnector):
    """SharedStorageConnector wrapper that records calls to an event file.

    Attribute access is forwarded to an inner SharedStorageConnector.
    Callable attributes are wrapped so that each call is counted in
    `call_record` and appended as one line to a per-connector log file.
    """

    def __init__(self, config: VllmConfig, role):
        extra_config = config.kv_transfer_config.kv_connector_extra_config
        self.name = extra_config["name"]
        self._connector = SharedStorageConnector(config, role)
        self.call_record: dict[str, int] = defaultdict(int)
        # Use a unique temp file per connector
        log_name = f"/connector_{self.name}-{self.role.name}_events.log"
        self._event_file = tempfile.gettempdir() + log_name
        # Start with an empty file
        with open(self._event_file, "w"):
            pass

    def __getattribute__(self, name):
        # These names are resolved on this object directly to avoid
        # recursion through this method.
        own_names = ("_connector", "call_record", "name", "_event_file",
                     "__class__", "__dict__", "__getattribute__", "__init__")
        if name in own_names:
            return object.__getattribute__(self, name)

        inner = self._connector
        if not hasattr(inner, name):
            return object.__getattribute__(self, name)

        attr = getattr(inner, name)
        if not callable(attr):
            return attr

        # Intercept calls to the connector interface and write an event
        # for each one to a file, which can be read back in the main test proc.
        def wrapper(*args, **kwargs):
            self.call_record[name] += 1

            # Include args that we're interested in
            fields = [name]
            for arg in args:
                if isinstance(arg, int):
                    fields.append(str(arg))
                elif isinstance(arg, KVCacheBlocks):
                    fields.append(
                        f"num_blocks={[len(b) for b in arg.blocks]}")

            # Log the event as a line to the file; a logging failure is
            # printed and does not prevent the forwarded call.
            try:
                with open(self._event_file, "a") as event_log:
                    event_log.write(' '.join(fields) + "\n")
            except Exception as e:
                print(f"[ERROR] Could not log event {name} "
                      f"for {self.name}: {e}")
            return attr(*args, **kwargs)

        return wrapper
KVConnectorFactory.register_connector("TestSharedStorageConnector",
TestSharedStorageConnector.__module__,
TestSharedStorageConnector.__name__)
# Helper function to compare directories recursively # Helper function to compare directories recursively
def _compare_directories(dir1: Path, dir2: Path) -> bool: def _compare_directories(dir1: Path, dir2: Path) -> bool:
"""Compares two directories recursively for identical content.""" """Compares two directories recursively for identical content."""
...@@ -115,19 +53,27 @@ def test_multi_shared_storage_connector_consistency(): ...@@ -115,19 +53,27 @@ def test_multi_shared_storage_connector_consistency():
kv_role="kv_both", kv_role="kv_both",
kv_connector_extra_config={ kv_connector_extra_config={
"connectors": [{ "connectors": [{
"kv_connector": "TestSharedStorageConnector", "kv_connector":
"kv_role": "kv_both", "TestSharedStorageConnector",
"kv_role":
"kv_both",
"kv_connector_extra_config": { "kv_connector_extra_config": {
"shared_storage_path": str(storage_1_path), "shared_storage_path": str(storage_1_path),
"name": "storage1", "name": "storage1",
} },
"kv_connector_module_path":
"tests.v1.kv_connector.unit.utils",
}, { }, {
"kv_connector": "TestSharedStorageConnector", "kv_connector":
"kv_role": "kv_both", "TestSharedStorageConnector",
"kv_role":
"kv_both",
"kv_connector_extra_config": { "kv_connector_extra_config": {
"shared_storage_path": str(storage_2_path), "shared_storage_path": str(storage_2_path),
"name": "storage2", "name": "storage2",
} },
"kv_connector_module_path":
"tests.v1.kv_connector.unit.utils",
}] }]
}, },
) )
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import tempfile
import textwrap
import time import time
import uuid
from collections import defaultdict
from typing import Optional
from unittest.mock import patch from unittest.mock import patch
import pytest import pytest
import ray
from vllm import LLM
from vllm.config import KVTransferConfig
from vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector import ( from vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector import (
KVConnectorRole, NixlAgentMetadata, NixlConnector, NixlConnectorMetadata, KVConnectorRole, NixlAgentMetadata, NixlConnector, NixlConnectorMetadata,
NixlConnectorWorker) NixlConnectorWorker)
from vllm.forward_context import ForwardContext from vllm.forward_context import ForwardContext
from vllm.mocks.mock_nixl_connector import FakeNixlWrapper
from vllm.sampling_params import SamplingParams
from .utils import create_request, create_scheduler, create_vllm_config from .utils import create_request, create_scheduler, create_vllm_config
def _make_stub_pkg() -> str:
    """Create a temporary directory holding a stub `nixl._api` package.

    The stub makes `from nixl._api import nixl_agent` resolve to
    FakeNixlWrapper. The directory is not removed by this function.

    Returns:
        Path of the temporary directory that contains the `nixl` package.
    """
    root = tempfile.mkdtemp()
    nixl_dir = os.path.join(root, "nixl")
    api_dir = os.path.join(nixl_dir, "_api")
    os.makedirs(api_dir, exist_ok=True)

    # The parent package only needs an empty __init__.py.
    with open(os.path.join(nixl_dir, "__init__.py"), "w"):
        pass

    stub_source = textwrap.dedent("""\
        # Forward the real FakeNixlWrapper that the driver already defined.
        print("In fake package")
        from vllm.mocks.mock_nixl_connector import FakeNixlWrapper as nixl_agent
    """)
    with open(os.path.join(api_dir, "__init__.py"), "w") as stub_file:
        stub_file.write(stub_source)
    return root
def test_basic_interface(): def test_basic_interface():
"""Unit test for basic NixlConnector interface functionality.""" """Unit test for basic NixlConnector interface functionality."""
...@@ -41,9 +66,9 @@ def test_basic_interface(): ...@@ -41,9 +66,9 @@ def test_basic_interface():
assert kv_connector_metadata is not None assert kv_connector_metadata is not None
assert isinstance(kv_connector_metadata, NixlConnectorMetadata) assert isinstance(kv_connector_metadata, NixlConnectorMetadata)
assert len(kv_connector_metadata.requests) == 1 assert len(kv_connector_metadata.reqs_to_recv) == 1
assert request_id in kv_connector_metadata.requests assert request_id in kv_connector_metadata.reqs_to_recv
req_meta = kv_connector_metadata.requests[request_id] req_meta = kv_connector_metadata.reqs_to_recv[request_id]
for block_id, block in zip( for block_id, block in zip(
req_meta.local_block_ids, scheduler.kv_cache_manager.coordinator. req_meta.local_block_ids, scheduler.kv_cache_manager.coordinator.
...@@ -78,83 +103,12 @@ def test_prompt_less_than_block_size(): ...@@ -78,83 +103,12 @@ def test_prompt_less_than_block_size():
kv_connector_metadata = scheduler_output.kv_connector_metadata kv_connector_metadata = scheduler_output.kv_connector_metadata
assert kv_connector_metadata is not None assert kv_connector_metadata is not None
assert isinstance(kv_connector_metadata, NixlConnectorMetadata) assert isinstance(kv_connector_metadata, NixlConnectorMetadata)
assert len(kv_connector_metadata.requests) == 0 assert len(kv_connector_metadata.reqs_to_recv) == 0
# This request should be scheduled regularly. # This request should be scheduled regularly.
assert len(scheduler_output.scheduled_new_reqs) == 1 assert len(scheduler_output.scheduled_new_reqs) == 1
class FakeNixlWrapper:
    """Mock implementation of NixlWrapper for testing.

    We don't inherit from nixl._api.nixl_agent because nixl may not be
    installed.
    """

    AGENT_METADATA = b"fake_agent_metadata"
    REMOTE_AGENT_NAME = "remote_agent"

    def __init__(self, agent_name: str, *args, **kwargs):
        # Number of "PROC" polls returned per handle before "DONE".
        self._cycles_before_xfer_done = 0
        # Per-handle count of "PROC" polls returned so far.
        self._check_xfer_state_cycles: defaultdict[int, int] = defaultdict(int)

    @staticmethod
    def _new_descs(items) -> list:
        """Return one fresh random UUID string per element of `items`."""
        return [str(uuid.uuid4()) for _ in items]

    def get_reg_descs(self, caches_data, memory_type: str) -> list:
        return self._new_descs(caches_data)

    def register_memory(self, descs) -> None:
        return None

    def get_xfer_descs(self, blocks_data, memory_type: str) -> list:
        return self._new_descs(blocks_data)

    def prep_xfer_dlist(self, agent_name: str, descs: list) -> int:
        return uuid.uuid4().int

    def get_agent_metadata(self) -> bytes:
        return self.AGENT_METADATA

    def add_remote_agent(self, agent_metadata: bytes) -> str:
        return self.REMOTE_AGENT_NAME

    def get_new_notifs(self) -> dict[str, list[bytes]]:
        # Used to collect done_sending, which we don't test yet.
        return {}

    def check_xfer_state(self, handle: int) -> str:
        """Return "PROC" for the configured number of polls, then "DONE"."""
        polls_so_far = self._check_xfer_state_cycles[handle]
        if polls_so_far < self._cycles_before_xfer_done:
            self._check_xfer_state_cycles[handle] = polls_so_far + 1
            return "PROC"
        return "DONE"

    def release_xfer_handle(self, handle: int) -> None:
        return None

    def send_notif(self, agent_name: str, notif_msg: bytes) -> None:
        return None

    def make_prepped_xfer(self,
                          xfer_type: str,
                          local_xfer_side_handle: int,
                          local_block_descs_ids: list[int],
                          remote_xfer_side_handle: int,
                          remote_block_descs_ids: list[int],
                          notif_msg: Optional[bytes] = None) -> int:
        return uuid.uuid4().int

    def transfer(self, handle: int) -> str:
        return "PROC"

    ############################################################
    # The following are for changing the behavior during testing.
    ############################################################

    def set_cycles_before_xfer_done(self, cycles: int):
        """Set the number of cycles before a transfer is considered done."""
        self._cycles_before_xfer_done = cycles
class FakeNixlConnectorWorker(NixlConnectorWorker): class FakeNixlConnectorWorker(NixlConnectorWorker):
REMOTE_ENGINE_ID = "remote_engine" REMOTE_ENGINE_ID = "remote_engine"
...@@ -163,8 +117,8 @@ class FakeNixlConnectorWorker(NixlConnectorWorker): ...@@ -163,8 +117,8 @@ class FakeNixlConnectorWorker(NixlConnectorWorker):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
self._hand_shake_latency = hand_shake_latency self._hand_shake_latency = hand_shake_latency
def _nixl_handshake(self, host: str, port: int, def _nixl_handshake(self, host: str, port: int, remote_tp_size: int,
remote_tp_size: int) -> dict[int, str]: expected_engine_id: str) -> dict[int, str]:
# Mimic slow _nixl_handshake, as well as bypass zmq communication. # Mimic slow _nixl_handshake, as well as bypass zmq communication.
time.sleep(self._hand_shake_latency) time.sleep(self._hand_shake_latency)
# These should've been done in register_kv_caches(), called by # These should've been done in register_kv_caches(), called by
...@@ -174,6 +128,8 @@ class FakeNixlConnectorWorker(NixlConnectorWorker): ...@@ -174,6 +128,8 @@ class FakeNixlConnectorWorker(NixlConnectorWorker):
self.num_blocks = 1 self.num_blocks = 1
self.dst_num_blocks[self.engine_id] = self.num_blocks self.dst_num_blocks[self.engine_id] = self.num_blocks
assert expected_engine_id == self.REMOTE_ENGINE_ID
remote_agent_name = self.add_remote_agent( remote_agent_name = self.add_remote_agent(
NixlAgentMetadata( NixlAgentMetadata(
engine_id=self.REMOTE_ENGINE_ID, engine_id=self.REMOTE_ENGINE_ID,
...@@ -371,3 +327,86 @@ class TestNixlHandshake: ...@@ -371,3 +327,86 @@ class TestNixlHandshake:
if cnt_finished_reqs == total_reqs: if cnt_finished_reqs == total_reqs:
return return
raise TimeoutError("Took too long to complete async handshake.") raise TimeoutError("Took too long to complete async handshake.")
# NOTE: resource cleanup in the mp backend is a bit finicky, so the order of
# the parametrized cases below is important: run the ray case first (it cleans
# up its resources), then the rest of the cases.
@pytest.mark.parametrize("distributed_executor_backend", ["ray", None])
@patch(
    "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper",
    FakeNixlWrapper)
def test_abort_timeout_on_prefiller(monkeypatch, distributed_executor_backend):
    """
    Test lifecycle of an aborted Remote Prefill request hitting the timeout.
    -----> P
            |  {process request}
    <-/--- |  {result is NOT delivered, eg proxy is down}
            |
            |
            |  {eventually free blocks}
    """
    model_name = "Qwen/Qwen3-0.6B"
    kv_transfer_config = KVTransferConfig(
        kv_connector="NixlConnector",
        kv_role="kv_both",
    )
    timeout = 6
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
    monkeypatch.setenv("VLLM_NIXL_ABORT_REQUEST_TIMEOUT", str(timeout))

    # Build runtime_env only if we're using Ray
    if distributed_executor_backend == "ray":
        runtime_env = {
            "working_dir": _make_stub_pkg(),  # ship stub package
            "env_vars": {
                "VLLM_NIXL_ABORT_REQUEST_TIMEOUT": str(timeout),
            },
        }
        ray.init(runtime_env=runtime_env)

    llm = LLM(
        model=model_name,
        enforce_eager=True,
        gpu_memory_utilization=0.5,
        kv_transfer_config=kv_transfer_config,
        distributed_executor_backend=distributed_executor_backend,
    )
    remote_prefill_opts = {
        "do_remote_decode": True,
        "do_remote_prefill": False,
        "remote_engine_id": None,
        "remote_block_ids": None,
        "remote_host": None,
        "remote_port": None,
    }
    # Simulate sidecar request
    sampling_params = SamplingParams(
        temperature=0.0,
        max_tokens=1,
        extra_args={"kv_transfer_params": remote_prefill_opts})
    scheduler = llm.llm_engine.engine_core.engine_core.scheduler
    req_to_blocks = scheduler.kv_cache_manager.coordinator.single_type_managers[
        0].req_to_blocks

    # The parentheses are required: adjacent string literals are only joined
    # within a single expression. Without them `padding` would hold just the
    # first fragment and the remaining lines would be no-op statements.
    padding = (
        "Just making this request a little longer so that we're sure "
        "we're not hitting the small-request lower bound beneath which we don't "
        "actually trigger the whole kv transfer, but rather just recompute the "
        "blocks on D.")
    _ = llm.generate([f"What is the capital of Japan? {padding}"],
                     sampling_params)

    # Request finished but not freed
    assert '0' in scheduler.finished_req_ids and '0' in req_to_blocks
    # Some other request, 0 still not freed
    _ = llm.generate([f"What is the capital of Italy? {padding}"],
                     sampling_params)
    assert '0' in req_to_blocks
    assert '1' in scheduler.finished_req_ids and '1' in req_to_blocks

    # Wait for timeout and trigger another scheduler loop
    time.sleep(timeout)
    _ = llm.generate([f"What is the capital of France? {padding}"],
                     sampling_params)
    # Request-0 times out and is cleared!
    assert '0' not in req_to_blocks
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from concurrent.futures import Future
from typing import Optional
from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator
from vllm.v1.outputs import ModelRunnerOutput
class DummyModelRunnerOutput(ModelRunnerOutput):
    """Lightweight ``ModelRunnerOutput`` stand-in for the aggregator tests.

    Only the two KV-transfer completion fields are set; the parent class
    initializer is not invoked.
    """

    def __init__(
        self,
        finished_sending: Optional[set[str]] = None,
        finished_recving: Optional[set[str]] = None,
    ):
        # Request ids reported as finished sending / receiving (or None).
        self.finished_sending, self.finished_recving = (finished_sending,
                                                        finished_recving)
def test_aggregate_workers_output():
    """Exercise ``KVOutputAggregator.aggregate`` over three successive calls.

    A single aggregator (``world_size=2``) is reused for every step, so the
    expectations of a later step depend on what earlier steps reported.
    """
    aggregator = KVOutputAggregator(world_size=2)

    def check(actual, expected):
        # Mirror the original assertions: identity check for None,
        # equality check for concrete sets.
        if expected is None:
            assert actual is None
        else:
            assert actual == expected

    nothing = dict(finished_sending=None, finished_recving=None)
    # (worker-0 kwargs, worker-1 kwargs, expected sending, expected recving)
    steps = [
        (dict(finished_sending={'req1'}, finished_recving={'req2'}),
         nothing, None, None),
        (nothing,
         dict(finished_sending={'req1'}, finished_recving=None),
         {'req1'}, None),
        (nothing,
         dict(finished_sending={'req1'}, finished_recving={'req2'}),
         None, {'req2'}),
    ]

    for first_kwargs, second_kwargs, want_sending, want_recving in steps:
        first = DummyModelRunnerOutput(**first_kwargs)
        second = DummyModelRunnerOutput(**second_kwargs)

        merged = aggregator.aggregate([first, second])

        # The aggregated result is the first worker's output object.
        assert merged is first
        check(merged.finished_sending, want_sending)
        check(merged.finished_recving, want_recving)
def test_async_aggregate_workers_output():
    """Exercise ``KVOutputAggregator.async_aggregate`` over three rounds.

    Each round creates two fresh futures, requests the aggregated future,
    resolves both worker futures, and then inspects the aggregated result.
    The same aggregator (``world_size=2``) is shared by all rounds, so the
    expectations of later rounds depend on earlier ones.
    """
    aggregator = KVOutputAggregator(world_size=2)

    def check(actual, expected):
        # Identity check for None, equality check for concrete sets.
        if expected is None:
            assert actual is None
        else:
            assert actual == expected

    nothing = dict(finished_sending=None, finished_recving=None)
    # (worker-0 kwargs, worker-1 kwargs, expected sending, expected recving)
    rounds = [
        (dict(finished_sending={'req1'}, finished_recving={'req2'}),
         nothing, None, None),
        (nothing,
         dict(finished_sending={'req1'}, finished_recving=None),
         {'req1'}, None),
        (nothing,
         dict(finished_sending={'req1'}, finished_recving={'req2'}),
         None, {'req2'}),
    ]

    for first_kwargs, second_kwargs, want_sending, want_recving in rounds:
        pending: list[Future[DummyModelRunnerOutput]] = [Future(), Future()]
        result_future = aggregator.async_aggregate(pending)

        # Worker outputs are only built after the aggregate was requested.
        worker_outputs = [
            DummyModelRunnerOutput(**first_kwargs),
            DummyModelRunnerOutput(**second_kwargs),
        ]
        for future, output in zip(pending, worker_outputs):
            future.set_result(output)

        # Once every worker future is resolved the aggregate must be ready.
        assert result_future.done()
        merged = result_future.result()
        assert merged is worker_outputs[0]
        check(merged.finished_sending, want_sending)
        check(merged.finished_recving, want_recving)
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import tempfile
from collections import defaultdict
from typing import Any, Optional from typing import Any, Optional
import torch import torch
...@@ -7,6 +9,11 @@ import torch ...@@ -7,6 +9,11 @@ import torch
from vllm import SamplingParams from vllm import SamplingParams
from vllm.config import (CacheConfig, DeviceConfig, KVTransferConfig, from vllm.config import (CacheConfig, DeviceConfig, KVTransferConfig,
ModelConfig, SchedulerConfig, VllmConfig) ModelConfig, SchedulerConfig, VllmConfig)
from vllm.distributed.kv_transfer.kv_connector.factory import (
KVConnectorFactory)
from vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector import ( # noqa
SharedStorageConnector)
from vllm.v1.core.kv_cache_manager import KVCacheBlocks
from vllm.v1.core.sched.scheduler import Scheduler from vllm.v1.core.sched.scheduler import Scheduler
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
KVCacheGroupSpec) KVCacheGroupSpec)
...@@ -187,3 +194,58 @@ def create_model_runner_output( ...@@ -187,3 +194,58 @@ def create_model_runner_output(
finished_sending=finished_sending, finished_sending=finished_sending,
finished_recving=finished_recving, finished_recving=finished_recving,
) )
class TestSharedStorageConnector(SharedStorageConnector):
    """Recording proxy around a real ``SharedStorageConnector``.

    Attribute access is forwarded to a wrapped connector instance. Callable
    attributes are handed out wrapped, so that every call is counted in
    ``call_record`` and appended as one line to a per-connector event file
    which can be read back in the main test process.
    """

    def __init__(self, config: VllmConfig, role):
        extra_config = config.kv_transfer_config.kv_connector_extra_config
        self.name = extra_config["name"]
        self._connector = SharedStorageConnector(config, role)
        self.call_record: dict[str, int] = defaultdict(int)
        # Use a unique temp file per connector. ``self.role`` is not in the
        # pass-through list of ``__getattribute__``, so it goes through the
        # forwarding logic there.
        self._event_file = (
            f"{tempfile.gettempdir()}"
            f"/connector_{self.name}-{self.role.name}_events.log")
        # Start with an empty file
        with open(self._event_file, "w"):
            pass

    def __getattribute__(self, name):
        plain_lookup = object.__getattribute__
        # These names must resolve on the proxy itself to avoid recursion.
        if name in ("_connector", "call_record", "name", "_event_file",
                    "__class__", "__dict__", "__getattribute__",
                    "__init__"):
            return plain_lookup(self, name)

        inner = self._connector
        if not hasattr(inner, name):
            # Unknown to the wrapped connector: fall back to the proxy.
            return plain_lookup(self, name)

        target = getattr(inner, name)
        if not callable(target):
            return target

        # Intercept calls to the connector interface and write an event
        # for each one to a file, which can be read back in the main test
        # proc.
        def wrapper(*args, **kwargs):
            self.call_record[name] += 1

            # Include args that we're interested in
            fields = [name]
            for value in args:
                if isinstance(value, int):
                    fields.append(str(value))
                    continue
                if isinstance(value, KVCacheBlocks):
                    sizes = [len(group) for group in value.blocks]
                    fields.append(f"num_blocks={sizes}")

            # Log the event as a line to the file; a logging failure is
            # reported but does not prevent the real call.
            try:
                with open(self._event_file, "a") as log:
                    log.write(' '.join(fields) + "\n")
            except Exception as e:
                print(f"[ERROR] Could not log event {name} "
                      f"for {self.name}: {e}")
            return target(*args, **kwargs)

        return wrapper


# Register the proxy with the connector factory under its own class name,
# pointing at this module.
KVConnectorFactory.register_connector(
    "TestSharedStorageConnector",
    __name__,
    TestSharedStorageConnector.__name__,
)
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import pytest import pytest
import ray import ray
from vllm.config import ModelDType
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from vllm.v1.engine.async_llm import AsyncEngineArgs, AsyncLLM from vllm.v1.engine.async_llm import AsyncEngineArgs, AsyncLLM
from vllm.v1.metrics.ray_wrappers import RayPrometheusStatLogger from vllm.v1.metrics.ray_wrappers import RayPrometheusStatLogger
...@@ -27,7 +30,7 @@ MODELS = [ ...@@ -27,7 +30,7 @@ MODELS = [
def test_engine_log_metrics_ray( def test_engine_log_metrics_ray(
example_prompts, example_prompts,
model: str, model: str,
dtype: str, dtype: ModelDType,
max_tokens: int, max_tokens: int,
) -> None: ) -> None:
""" Simple smoke test, verifying this can be used without exceptions. """ Simple smoke test, verifying this can be used without exceptions.
...@@ -37,11 +40,14 @@ def test_engine_log_metrics_ray( ...@@ -37,11 +40,14 @@ def test_engine_log_metrics_ray(
class EngineTestActor: class EngineTestActor:
async def run(self): async def run(self):
engine_args = AsyncEngineArgs( # Set environment variable inside the Ray actor since environment
model=model, # variables from pytest fixtures don't propagate to Ray actors
dtype=dtype, os.environ['VLLM_USE_V1'] = '1'
disable_log_stats=False,
) engine_args = AsyncEngineArgs(model=model,
dtype=dtype,
disable_log_stats=False,
enforce_eager=True)
engine = AsyncLLM.from_engine_args( engine = AsyncLLM.from_engine_args(
engine_args, stat_loggers=[RayPrometheusStatLogger]) engine_args, stat_loggers=[RayPrometheusStatLogger])
......
...@@ -13,6 +13,7 @@ from tests.v1.sample.utils import ( ...@@ -13,6 +13,7 @@ from tests.v1.sample.utils import (
assert_incr_detok_str_matches_non_incr_detok_str, assert_incr_detok_str_matches_non_incr_detok_str,
compute_correct_cumulative_logprob, get_test_batch) compute_correct_cumulative_logprob, get_test_batch)
from vllm import SamplingParams from vllm import SamplingParams
from vllm.config import LogprobsMode
from ...conftest import HfRunner, VllmRunner from ...conftest import HfRunner, VllmRunner
from ...utils import models_path_prefix from ...utils import models_path_prefix
...@@ -114,7 +115,7 @@ def _run_and_validate( ...@@ -114,7 +115,7 @@ def _run_and_validate(
max_tokens: int, max_tokens: int,
do_apc: bool, do_apc: bool,
) -> None: ) -> None:
vllm_results = vllm_model.model.generate( vllm_results = vllm_model.llm.generate(
test_prompts, sampling_params=vllm_sampling_params) test_prompts, sampling_params=vllm_sampling_params)
for vllm_result, hf_logprob, hf_output, logprob_prompt_logprob in zip( for vllm_result, hf_logprob, hf_output, logprob_prompt_logprob in zip(
...@@ -290,7 +291,7 @@ def test_get_logprobs_and_prompt_logprobs( ...@@ -290,7 +291,7 @@ def test_get_logprobs_and_prompt_logprobs(
""" """
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1") m.setenv("VLLM_USE_V1", "1")
do_apc = vllm_model.model.llm_engine.cache_config.enable_prefix_caching do_apc = vllm_model.llm.llm_engine.cache_config.enable_prefix_caching
if do_apc and (temperature < 2.0 if do_apc and (temperature < 2.0
or batch_logprobs_composition != SAMPLE_PROMPT): or batch_logprobs_composition != SAMPLE_PROMPT):
# Skip some test-cases to save time. # Skip some test-cases to save time.
...@@ -380,7 +381,7 @@ def test_none_logprobs(vllm_model, example_prompts, ...@@ -380,7 +381,7 @@ def test_none_logprobs(vllm_model, example_prompts,
prompt_logprobs=None, prompt_logprobs=None,
temperature=0.0, temperature=0.0,
) )
results_logprobs_none = vllm_model.model.generate( results_logprobs_none = vllm_model.llm.generate(
example_prompts, example_prompts,
sampling_params=sampling_params_logprobs_none, sampling_params=sampling_params_logprobs_none,
) )
...@@ -410,7 +411,7 @@ def test_zero_logprobs(vllm_model, example_prompts, ...@@ -410,7 +411,7 @@ def test_zero_logprobs(vllm_model, example_prompts,
logprobs=0, logprobs=0,
prompt_logprobs=0, prompt_logprobs=0,
temperature=0.0) temperature=0.0)
results_logprobs_zero = vllm_model.model.generate( results_logprobs_zero = vllm_model.llm.generate(
example_prompts, sampling_params=sampling_params_logprobs_zero) example_prompts, sampling_params=sampling_params_logprobs_zero)
for i in range(len(results_logprobs_zero)): for i in range(len(results_logprobs_zero)):
...@@ -428,3 +429,45 @@ def test_zero_logprobs(vllm_model, example_prompts, ...@@ -428,3 +429,45 @@ def test_zero_logprobs(vllm_model, example_prompts,
# prompt token # prompt token
assert prompt_logprobs is not None assert prompt_logprobs is not None
assert len(prompt_token_ids) == len(prompt_logprobs) assert len(prompt_token_ids) == len(prompt_logprobs)
@pytest.mark.parametrize(
    "logprobs_mode",
    ["raw_logprobs", "raw_logits", "processed_logprobs", "processed_logits"])
def test_logprobs_mode(logprobs_mode: LogprobsMode,
                       monkeypatch: pytest.MonkeyPatch):
    """Run the LLM engine with each supported ``logprobs_mode``.

    For the logprobs modes every returned value must be non-positive.
    For the logits modes at least one returned value must be positive.
    """
    from vllm import LLM

    wants_logprobs = "logprobs" in logprobs_mode
    wants_logits = "logits" in logprobs_mode

    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")

        llm = LLM(
            "facebook/opt-125m",
            max_logprobs=5,
            enable_prefix_caching=False,
            # 2 other llms alive during whole session
            gpu_memory_utilization=0.05,
            max_model_len=16,
            logprobs_mode=logprobs_mode)
        results = llm.generate(["Hello world"],
                               sampling_params=SamplingParams(logprobs=1))
        completions = results[0].outputs

        seen = 0
        positives = 0
        for completion in completions:
            for position_logprobs in completion.logprobs:
                for entry in position_logprobs.values():
                    if wants_logprobs:
                        assert entry.logprob <= 0
                    if entry.logprob > 0:
                        positives += 1
                    seen += 1

        # Every completion must have contributed at least one value.
        assert seen >= len(completions)
        if wants_logits:
            assert positives > 0
        del llm
...@@ -15,30 +15,30 @@ PROMPT = "Hello my name is Robert and I" ...@@ -15,30 +15,30 @@ PROMPT = "Hello my name is Robert and I"
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def model() -> LLM: def llm() -> LLM:
# Disable prefix caching so that we can test prompt logprobs. # Disable prefix caching so that we can test prompt logprobs.
# TODO remove this after https://github.com/vllm-project/vllm/pull/13949 # TODO remove this after https://github.com/vllm-project/vllm/pull/13949
# is merged # is merged
return LLM(MODEL, enforce_eager=True, enable_prefix_caching=False) return LLM(MODEL, enforce_eager=True, enable_prefix_caching=False)
def test_n_gt_1(model): def test_n_gt_1(llm):
"""ParallelSampling is supported.""" """ParallelSampling is supported."""
params = SamplingParams(n=3) params = SamplingParams(n=3)
outputs = model.generate(PROMPT, params) outputs = llm.generate(PROMPT, params)
assert len(outputs[0].outputs) == 3 assert len(outputs[0].outputs) == 3
def test_best_of(model): def test_best_of(llm):
"""Raise a ValueError since best_of is deprecated.""" """Raise a ValueError since best_of is deprecated."""
params = SamplingParams(n=2, best_of=3) params = SamplingParams(n=2, best_of=3)
with pytest.raises(ValueError): with pytest.raises(ValueError):
_ = model.generate(PROMPT, params) _ = llm.generate(PROMPT, params)
def test_penalties(model): def test_penalties(llm):
"""Check that we do not get errors if applied.""" """Check that we do not get errors if applied."""
params = SamplingParams( params = SamplingParams(
...@@ -50,18 +50,18 @@ def test_penalties(model): ...@@ -50,18 +50,18 @@ def test_penalties(model):
top_p=0.5, top_p=0.5,
top_k=3, top_k=3,
) )
_ = model.generate(PROMPT, params) _ = llm.generate(PROMPT, params)
def test_stop(model): def test_stop(llm):
"""Check that we respect the stop words.""" """Check that we respect the stop words."""
output = model.generate(PROMPT, SamplingParams(temperature=0)) output = llm.generate(PROMPT, SamplingParams(temperature=0))
split_text = output[0].outputs[0].text.split() split_text = output[0].outputs[0].text.split()
STOP_IDX = 5 STOP_IDX = 5
params = SamplingParams(temperature=0, stop=split_text[STOP_IDX]) params = SamplingParams(temperature=0, stop=split_text[STOP_IDX])
output = model.generate(PROMPT, params) output = llm.generate(PROMPT, params)
new_split_text = output[0].outputs[0].text.split() new_split_text = output[0].outputs[0].text.split()
# Output should not contain the stop word. # Output should not contain the stop word.
...@@ -70,40 +70,40 @@ def test_stop(model): ...@@ -70,40 +70,40 @@ def test_stop(model):
params = SamplingParams(temperature=0, params = SamplingParams(temperature=0,
stop=split_text[STOP_IDX], stop=split_text[STOP_IDX],
include_stop_str_in_output=True) include_stop_str_in_output=True)
output = model.generate(PROMPT, params) output = llm.generate(PROMPT, params)
new_split_text = output[0].outputs[0].text.split() new_split_text = output[0].outputs[0].text.split()
# Output should contain the stop word. # Output should contain the stop word.
assert len(new_split_text) == STOP_IDX + 1 assert len(new_split_text) == STOP_IDX + 1
def test_stop_token_ids(model): def test_stop_token_ids(llm):
"""Check that we respect the stop token ids.""" """Check that we respect the stop token ids."""
output = model.generate(PROMPT, SamplingParams(temperature=0)) output = llm.generate(PROMPT, SamplingParams(temperature=0))
stop_token_id_0 = output[0].outputs[0].token_ids[5] stop_token_id_0 = output[0].outputs[0].token_ids[5]
stop_token_id_1 = output[0].outputs[0].token_ids[6] stop_token_id_1 = output[0].outputs[0].token_ids[6]
stop_token_ids = [stop_token_id_1, stop_token_id_0] stop_token_ids = [stop_token_id_1, stop_token_id_0]
params = SamplingParams(temperature=0, stop_token_ids=stop_token_ids) params = SamplingParams(temperature=0, stop_token_ids=stop_token_ids)
output = model.generate(PROMPT, params) output = llm.generate(PROMPT, params)
assert output[0].outputs[0].token_ids[-1] == stop_token_id_0 assert output[0].outputs[0].token_ids[-1] == stop_token_id_0
stop_token_ids = [stop_token_id_0, stop_token_id_1] stop_token_ids = [stop_token_id_0, stop_token_id_1]
params = SamplingParams(temperature=0, stop_token_ids=stop_token_ids) params = SamplingParams(temperature=0, stop_token_ids=stop_token_ids)
output = model.generate(PROMPT, params) output = llm.generate(PROMPT, params)
assert output[0].outputs[0].token_ids[-1] == stop_token_id_0 assert output[0].outputs[0].token_ids[-1] == stop_token_id_0
def test_detokenize_false(model): def test_detokenize_false(llm):
"""Check that detokenize=False option works.""" """Check that detokenize=False option works."""
output = model.generate(PROMPT, SamplingParams(detokenize=False)) output = llm.generate(PROMPT, SamplingParams(detokenize=False))
assert len(output[0].outputs[0].token_ids) > 0 assert len(output[0].outputs[0].token_ids) > 0
assert len(output[0].outputs[0].text) == 0 assert len(output[0].outputs[0].text) == 0
output = model.generate( output = llm.generate(
PROMPT, SamplingParams(detokenize=False, logprobs=3, PROMPT, SamplingParams(detokenize=False, logprobs=3,
prompt_logprobs=3)) prompt_logprobs=3))
assert len(output[0].outputs[0].token_ids) > 0 assert len(output[0].outputs[0].token_ids) > 0
...@@ -119,28 +119,28 @@ def test_detokenize_false(model): ...@@ -119,28 +119,28 @@ def test_detokenize_false(model):
assert all(lp.decoded_token is None for lp in logprobs.values()) assert all(lp.decoded_token is None for lp in logprobs.values())
def test_bad_words(model): def test_bad_words(llm):
"""Check that we respect bad words.""" """Check that we respect bad words."""
output = model.generate(PROMPT, SamplingParams(temperature=0)) output = llm.generate(PROMPT, SamplingParams(temperature=0))
split_text = output[0].outputs[0].text.split() split_text = output[0].outputs[0].text.split()
bad_words_1 = " ".join(split_text[:2]) bad_words_1 = " ".join(split_text[:2])
params = SamplingParams(temperature=0, bad_words=[bad_words_1]) params = SamplingParams(temperature=0, bad_words=[bad_words_1])
output = model.generate(PROMPT, params) output = llm.generate(PROMPT, params)
new_text = output[0].outputs[0].text new_text = output[0].outputs[0].text
assert bad_words_1 not in new_text assert bad_words_1 not in new_text
bad_words_2 = new_text.split()[-1] bad_words_2 = new_text.split()[-1]
params = SamplingParams(temperature=0, params = SamplingParams(temperature=0,
bad_words=[bad_words_1, bad_words_2]) bad_words=[bad_words_1, bad_words_2])
output = model.generate(PROMPT, params) output = llm.generate(PROMPT, params)
new_text = output[0].outputs[0].text new_text = output[0].outputs[0].text
assert bad_words_1 not in new_text assert bad_words_1 not in new_text
assert bad_words_2 not in new_text assert bad_words_2 not in new_text
def test_logits_processor(model): def test_logits_processor(llm):
"""Check that we reject logits processor.""" """Check that we reject logits processor."""
# This sample logits processor gives infinite score to the i-th token, # This sample logits processor gives infinite score to the i-th token,
...@@ -151,47 +151,45 @@ def test_logits_processor(model): ...@@ -151,47 +151,45 @@ def test_logits_processor(model):
return logits return logits
with pytest.raises(ValueError): with pytest.raises(ValueError):
_ = model.generate(PROMPT, _ = llm.generate(PROMPT, SamplingParams(logits_processors=[pick_ith]))
SamplingParams(logits_processors=[pick_ith]))
def test_allowed_token_ids(model): def test_allowed_token_ids(llm):
"""Check that we can use allowed_token_ids.""" """Check that we can use allowed_token_ids."""
TOKEN_ID = 10 TOKEN_ID = 10
allowed_token_ids = [TOKEN_ID] allowed_token_ids = [TOKEN_ID]
output = model.generate( output = llm.generate(PROMPT,
PROMPT, SamplingParams(allowed_token_ids=allowed_token_ids)) SamplingParams(allowed_token_ids=allowed_token_ids))
assert output[0].outputs[0].token_ids[-1] == TOKEN_ID assert output[0].outputs[0].token_ids[-1] == TOKEN_ID
# Reject empty allowed_token_ids. # Reject empty allowed_token_ids.
with pytest.raises(ValueError): with pytest.raises(ValueError):
_ = model.generate(PROMPT, SamplingParams(allowed_token_ids=[])) _ = llm.generate(PROMPT, SamplingParams(allowed_token_ids=[]))
# Reject negative token id. # Reject negative token id.
with pytest.raises(ValueError): with pytest.raises(ValueError):
_ = model.generate(PROMPT, SamplingParams(allowed_token_ids=[-1])) _ = llm.generate(PROMPT, SamplingParams(allowed_token_ids=[-1]))
# Reject out of vocabulary. # Reject out of vocabulary.
with pytest.raises(ValueError): with pytest.raises(ValueError):
_ = model.generate(PROMPT, _ = llm.generate(PROMPT, SamplingParams(allowed_token_ids=[10000000]))
SamplingParams(allowed_token_ids=[10000000]))
def test_priority(model): def test_priority(llm):
"""Check that we reject requests with priority.""" """Check that we reject requests with priority."""
# Reject all allowed token ids # Reject all allowed token ids
with pytest.raises(ValueError): with pytest.raises(ValueError):
_ = model.generate(PROMPT, priority=[1]) _ = llm.generate(PROMPT, priority=[1])
def test_seed(model): def test_seed(llm):
"""Check that seed impacts randomness.""" """Check that seed impacts randomness."""
out_1 = model.generate(PROMPT, SamplingParams(seed=42)) out_1 = llm.generate(PROMPT, SamplingParams(seed=42))
out_2 = model.generate(PROMPT, SamplingParams(seed=42)) out_2 = llm.generate(PROMPT, SamplingParams(seed=42))
out_3 = model.generate(PROMPT, SamplingParams(seed=43)) out_3 = llm.generate(PROMPT, SamplingParams(seed=43))
assert out_1[0].outputs[0].text == out_2[0].outputs[0].text assert out_1[0].outputs[0].text == out_2[0].outputs[0].text
assert out_1[0].outputs[0].text != out_3[0].outputs[0].text assert out_1[0].outputs[0].text != out_3[0].outputs[0].text
...@@ -6,6 +6,10 @@ from unittest import mock ...@@ -6,6 +6,10 @@ from unittest import mock
import pytest import pytest
import torch import torch
from tests.v1.attention.utils import (BatchSpec, _Backend,
create_common_attn_metadata,
create_standard_kv_cache_spec,
get_attention_backend)
from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, ModelConfig, from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, ModelConfig,
ParallelConfig, SchedulerConfig, SpeculativeConfig, ParallelConfig, SchedulerConfig, SpeculativeConfig,
VllmConfig) VllmConfig)
...@@ -64,13 +68,19 @@ def test_prepare_inputs(): ...@@ -64,13 +68,19 @@ def test_prepare_inputs():
""" """
device = torch.device(current_platform.device_type) device = torch.device(current_platform.device_type)
# a = 4, b = 7, c = 5 # q1 = 4, q2 = 7, q3 = 5
# n1 = 1, n2 = 3, n3 = 2 # n1 = 1, n2 = 3, n3 = 2
# Cumulative lengths: [0, 4, 11, 16] batch_spec = BatchSpec(
cu_target_query_lens = torch.tensor([0, 4, 11, 16], seq_lens=[4, 7, 5],
dtype=torch.int32, query_lens=[4, 7, 5],
device=device) )
common_attn_metadata = create_common_attn_metadata(
batch_spec,
block_size=16,
device=device,
)
# Rejected tokens per request: [1, 3, 2] # Rejected tokens per request: [1, 3, 2]
num_rejected_tokens = torch.tensor([1, 3, 2], num_rejected_tokens = torch.tensor([1, 3, 2],
...@@ -104,15 +114,13 @@ def test_prepare_inputs(): ...@@ -104,15 +114,13 @@ def test_prepare_inputs():
], ],
dtype=torch.int32, dtype=torch.int32,
device=device) device=device)
proposer = _create_proposer("eagle", 1)
# n1 + n2 + n3 - a - b -c updated_metadata, token_indices = proposer.prepare_inputs(
num_tokens = cu_target_query_lens[-1].item() - num_rejected_tokens.sum( common_attn_metadata, num_rejected_tokens.cpu())
).item()
cu_num_tokens, token_indices = EagleProposer.prepare_inputs( assert torch.equal(updated_metadata.query_start_loc,
cu_target_query_lens, num_rejected_tokens, num_tokens) expected_cu_num_tokens)
assert torch.equal(cu_num_tokens, expected_cu_num_tokens)
assert token_indices.shape[0] == expected_cu_num_tokens[-1].item() assert token_indices.shape[0] == expected_cu_num_tokens[-1].item()
assert torch.equal(token_indices, expected_token_indices) assert torch.equal(token_indices, expected_token_indices)
...@@ -209,6 +217,7 @@ def test_propose(num_speculative_tokens): ...@@ -209,6 +217,7 @@ def test_propose(num_speculative_tokens):
seq_len_2 = 3 seq_len_2 = 3
total_tokens = seq_len_1 + seq_len_2 total_tokens = seq_len_1 + seq_len_2
vocab_size = 100 vocab_size = 100
seq_lens = [seq_len_1, seq_len_2]
# Create proposer first so we can use its actual hidden_size # Create proposer first so we can use its actual hidden_size
proposer = _create_proposer("eagle", num_speculative_tokens) proposer = _create_proposer("eagle", num_speculative_tokens)
...@@ -270,9 +279,16 @@ def test_propose(num_speculative_tokens): ...@@ -270,9 +279,16 @@ def test_propose(num_speculative_tokens):
proposer.attn_layer_names = ["layer.0"] proposer.attn_layer_names = ["layer.0"]
# Create input tensors # Create input tensors
cu_num_tokens = torch.tensor([0, seq_len_1, total_tokens], batch_spec = BatchSpec(
dtype=torch.int32, seq_lens=seq_lens,
device=device) query_lens=seq_lens,
)
common_attn_metadata = create_common_attn_metadata(
batch_spec,
block_size=16,
device=device,
)
target_token_ids = torch.randint(0, target_token_ids = torch.randint(0,
vocab_size, (total_tokens, ), vocab_size, (total_tokens, ),
...@@ -284,25 +300,29 @@ def test_propose(num_speculative_tokens): ...@@ -284,25 +300,29 @@ def test_propose(num_speculative_tokens):
target_hidden_states = torch.randn(total_tokens, target_hidden_states = torch.randn(total_tokens,
hidden_size, hidden_size,
device=device) device=device)
target_slot_mapping = torch.randint(0,
100, (total_tokens, ),
device=device)
next_token_ids = torch.randint(0, next_token_ids = torch.randint(0,
vocab_size, (batch_size, ), vocab_size, (batch_size, ),
dtype=torch.int32, dtype=torch.int32,
device=device) device=device)
block_table = torch.randint(0, 10, (batch_size, 10), device=device)
sampling_metadata = mock.MagicMock() sampling_metadata = mock.MagicMock()
# Call the method under test attn_metadata_builder_cls, _ = get_attention_backend(
_Backend.FLASH_ATTN_VLLM_V1)
attn_metadata_builder = attn_metadata_builder_cls(
kv_cache_spec=create_standard_kv_cache_spec(proposer.vllm_config),
vllm_config=proposer.vllm_config,
device=device,
)
# Mock runner for attention metadata building
proposer.runner = mock.MagicMock()
proposer.runner.attn_metadata_builders = [attn_metadata_builder]
result = proposer.propose(target_token_ids=target_token_ids, result = proposer.propose(target_token_ids=target_token_ids,
target_positions=target_positions, target_positions=target_positions,
target_hidden_states=target_hidden_states, target_hidden_states=target_hidden_states,
target_slot_mapping=target_slot_mapping,
next_token_ids=next_token_ids, next_token_ids=next_token_ids,
cu_num_tokens=cu_num_tokens, common_attn_metadata=common_attn_metadata,
block_table=block_table,
sampling_metadata=sampling_metadata) sampling_metadata=sampling_metadata)
assert result.shape == (batch_size, num_speculative_tokens) assert result.shape == (batch_size, num_speculative_tokens)
......
...@@ -93,8 +93,10 @@ async def test_load(output_kind: RequestOutputKind, ...@@ -93,8 +93,10 @@ async def test_load(output_kind: RequestOutputKind,
def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
stats_loggers[engine_index] = self stats_loggers[engine_index] = self
def record(self, scheduler_stats: Optional[SchedulerStats], def record(self,
iteration_stats: Optional[IterationStats]): scheduler_stats: Optional[SchedulerStats],
iteration_stats: Optional[IterationStats],
engine_idx: int = 0):
if iteration_stats: if iteration_stats:
self.finished_req_count += len( self.finished_req_count += len(
iteration_stats.finished_requests) iteration_stats.finished_requests)
......
...@@ -17,7 +17,7 @@ MODEL_NAME = "ibm-research/PowerMoE-3b" ...@@ -17,7 +17,7 @@ MODEL_NAME = "ibm-research/PowerMoE-3b"
# Number of data parallel ranks for external LB testing # Number of data parallel ranks for external LB testing
DP_SIZE = int(os.getenv("DP_SIZE", "2")) DP_SIZE = int(os.getenv("DP_SIZE", "2"))
# Default tensor parallell size to use # Default tensor parallel size to use
TP_SIZE = int(os.getenv("TP_SIZE", "1")) TP_SIZE = int(os.getenv("TP_SIZE", "1"))
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import os
import threading
import time
from contextlib import AsyncExitStack
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
from tests.utils import RemoteOpenAIServer
from tests.v1.test_utils import check_request_balancing
from vllm.platforms import Platform
# Model name passed to every server instance in this module.
MODEL_NAME = "ibm-research/PowerMoE-3b"

# Number of nodes (2 nodes, each with 2 DP ranks by default)
NUM_NODES = 2

# Number of data parallel ranks for hybrid LB testing (4 total by default),
# overridable through the DP_SIZE environment variable.
DP_SIZE = int(os.environ.get("DP_SIZE", "4"))

# DP ranks hosted on each node (2 with the defaults above)
DP_SIZE_LOCAL = DP_SIZE // NUM_NODES

# Default tensor parallel size to use, overridable through TP_SIZE.
TP_SIZE = int(os.environ.get("TP_SIZE", "1"))
class HybridLBServerManager:
"""Manages hybrid data parallel vLLM server instances where each node
runs a single logical API server that balances requests only to the
DP engines running on that same node."""
def __init__(self,
model_name: str,
dp_size: int,
api_server_count: int,
base_server_args: list,
dp_size_local: int = DP_SIZE_LOCAL,
tp_size: int = TP_SIZE):
self.model_name = model_name
self.dp_size = dp_size
self.dp_size_local = dp_size_local
self.tp_size = tp_size
self.api_server_count = api_server_count
self.base_server_args = base_server_args
self.servers: list[tuple[RemoteOpenAIServer, list[str]]] = []
self.server_threads: list[threading.Thread] = []
self.num_nodes = dp_size // dp_size_local
def __enter__(self) -> list[tuple[RemoteOpenAIServer, list[str]]]:
"""Start all server instances for hybrid LB mode."""
for node_id in range(self.num_nodes):
# Create server args for this specific node
server_args = self.base_server_args.copy()
# Calculate start rank for this node
start_rank = node_id * self.dp_size_local
# Add hybrid LB specific arguments
server_args.extend([
"--data-parallel-size",
str(self.dp_size),
"--data-parallel-size-local",
str(self.dp_size_local),
"--data-parallel-start-rank",
str(start_rank),
"--data-parallel-hybrid-lb", # Enable hybrid LB mode
"--tensor-parallel-size",
str(self.tp_size),
"--port",
str(8000 + node_id), # Different port for each node
"--api-server-count",
str(self.api_server_count),
"--data-parallel-address",
"127.0.0.1",
"--data-parallel-rpc-port",
"13345",
])
# Use a thread to start each server to allow parallel initialization
def start_server(node: int, sargs: list[str]):
try:
# Calculate GPU devices for this node
gpus_per_node = self.dp_size_local * self.tp_size
gpu_start = node * gpus_per_node
gpu_end = gpu_start + gpus_per_node
# Start the server
server = RemoteOpenAIServer(
self.model_name,
sargs,
auto_port=False,
env_dict={
"CUDA_VISIBLE_DEVICES":
",".join(
str(Platform.device_id_to_physical_device_id(
i)) for i in range(gpu_start, gpu_end))
})
server.__enter__()
print(f"Hybrid LB node {node} started successfully with "
f"{self.dp_size_local} local DP ranks and "
f"{self.api_server_count} API servers")
self.servers.append((server, sargs))
except Exception as e:
print(f"Failed to start hybrid LB node {node}: {e}")
raise
thread = threading.Thread(target=start_server,
args=(node_id, server_args))
thread.start()
self.server_threads.append(thread)
# Wait for all servers to start
for thread in self.server_threads:
thread.join()
# Give servers additional time to fully initialize and coordinate
time.sleep(3)
if len(self.servers) != self.num_nodes:
raise Exception("Servers failed to start")
return self.servers
def __exit__(self, exc_type, exc_val, exc_tb):
    """Shut down every tracked server, most recently added first.

    Shutdown is best-effort: an error while stopping one server is
    printed and the remaining servers are still stopped.
    """
    while len(self.servers) > 0:
        entry = self.servers.pop()
        try:
            entry[0].__exit__(exc_type, exc_val, exc_tb)
        except Exception as err:
            print(f"Error stopping server: {err}")
@pytest.fixture(scope="module")
def default_server_args():
    """Return the CLI arguments shared by every server in this module."""
    # use half precision for speed and memory savings in CI environment
    args = ["--dtype", "bfloat16"]
    args += ["--max-model-len", "2048"]
    args += ["--max-num-seqs", "128"]
    args.append("--enforce-eager")
    return args
@pytest.fixture(scope="module", params=[1])  # Only 1 API server for now
def servers(request, default_server_args):
    """Yield the running hybrid LB servers as ``(server, args)`` pairs.

    The fixture parameter is the API server count handed to the manager;
    the servers are stopped when the module's tests are done.
    """
    manager = HybridLBServerManager(
        MODEL_NAME,
        DP_SIZE,
        request.param,
        default_server_args,
        DP_SIZE_LOCAL,
        TP_SIZE,
    )
    with manager as server_list:
        yield server_list
@pytest_asyncio.fixture
async def clients(servers: list[tuple[RemoteOpenAIServer, list[str]]]):
    """Yield one async OpenAI client per server, in ``servers`` order.

    Every node exposes its own API endpoint, so each gets its own client.
    All clients are closed together when the exit stack unwinds.
    """
    async with AsyncExitStack() as stack:
        node_clients = []
        for server, _ in servers:
            node_client = await stack.enter_async_context(
                server.get_async_client())
            node_clients.append(node_client)
        yield node_clients
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_hybrid_lb_completion(clients: list[openai.AsyncOpenAI],
                                    servers: list[tuple[RemoteOpenAIServer,
                                                        list[str]]],
                                    model_name: str) -> None:
    """Run non-streaming completions against every hybrid LB node.

    Sends one request per node, then two concurrent bursts of 25 requests
    per node, and finally checks request balancing on each node.
    """

    async def make_request(client: openai.AsyncOpenAI):
        """Issue one sampled completion and sanity-check the response."""
        completion = await client.completions.create(
            model=model_name,
            prompt="Hello, my name is",
            max_tokens=10,
            temperature=1.0,
        )

        assert completion.id is not None
        assert completion.choices is not None and len(completion.choices) == 1

        choice = completion.choices[0]
        # With temperature=1.0 the exact text and stop condition vary, so
        # only require non-empty text and a plausible finish reason.
        assert len(choice.text) >= 1
        assert choice.finish_reason in ("length", "stop")

        # Token counts vary as well; they only need to be positive.
        usage = completion.usage
        assert usage.completion_tokens > 0
        assert usage.prompt_tokens > 0
        assert usage.total_tokens > 0

        return completion

    async def run_burst(per_node: int) -> None:
        """Fire ``per_node`` concurrent requests at every node and verify."""
        pending = [
            make_request(node_client) for node_client in clients
            for _ in range(per_node)
        ]
        results = await asyncio.gather(*pending)
        assert len(results) == per_node * len(clients)
        assert all(completion is not None for completion in results)

    # Test single request to each node
    for node_idx, node_client in enumerate(clients):
        result = await make_request(node_client)
        assert result is not None
        print(
            f"Hybrid LB node {node_idx} handled single completion request successfully"
        )

    await asyncio.sleep(0.5)

    # Send requests to all nodes - each should balance within its local DP ranks
    num_requests_per_node = 25  # Total 50 requests across 2 nodes
    await run_burst(num_requests_per_node)

    await asyncio.sleep(0.5)

    # Second burst of requests
    await run_burst(num_requests_per_node)

    # Recover the API server count from the first server's CLI arguments.
    _, server_args = servers[0]
    flag = '--api-server-count'
    if flag in server_args:
        api_server_count = server_args[server_args.index(flag) + 1] or 1
    else:
        api_server_count = 1
    print(
        f"Successfully completed hybrid LB test with {len(clients)} nodes "
        f"({DP_SIZE_LOCAL} DP ranks each, API server count: {api_server_count})"
    )

    # Check request balancing within each node
    for node_idx, (server, _) in enumerate(servers):
        print(f"Checking request balancing for node {node_idx}")
        check_request_balancing(server, DP_SIZE_LOCAL)
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_hybrid_lb_completion_streaming(clients: list[
    openai.AsyncOpenAI], servers: list[tuple[RemoteOpenAIServer, list[str]]],
                                              model_name: str) -> None:
    """Run streaming completions against every hybrid LB node.

    Each request is issued twice with temperature 0.0, once non-streamed
    and once streamed, and the concatenated stream must equal the
    non-streamed text. Sends one such request per node, then two
    concurrent bursts, and finally checks request balancing on each node.
    """
    prompt = "What is an LLM?"

    async def make_streaming_request(client: openai.AsyncOpenAI):
        """Compare a streamed completion with its non-streamed twin."""
        # Perform a non-streaming request to get the expected full output
        reference = await client.completions.create(
            model=model_name,
            prompt=prompt,
            max_tokens=5,
            temperature=0.0,
        )
        single_output = reference.choices[0].text

        # Perform the streaming request
        stream = await client.completions.create(
            model=model_name,
            prompt=prompt,
            max_tokens=5,
            temperature=0.0,
            stream=True,
        )
        received = [chunk async for chunk in stream]
        chunks: list[str] = [part.choices[0].text for part in received]
        finish_reason_count = sum(
            1 for part in received
            if part.choices[0].finish_reason is not None)
        last_chunk = received[-1] if received else None

        # finish reason should only return in the last block for OpenAI API
        assert finish_reason_count == 1, (
            "Finish reason should appear exactly once.")
        assert last_chunk is not None, (
            "Stream should have yielded at least one chunk.")
        assert last_chunk.choices[
            0].finish_reason == "length", "Finish reason should be 'length'."
        # Check that the combined text matches the non-streamed version.
        streamed_text = "".join(chunks)
        assert streamed_text == single_output, (
            "Streamed output should match non-streamed output.")
        return True  # Indicate success for this request

    async def run_burst(per_node: int) -> None:
        """Fire ``per_node`` concurrent requests at every node and verify."""
        pending = [
            make_streaming_request(node_client) for node_client in clients
            for _ in range(per_node)
        ]
        results = await asyncio.gather(*pending)
        assert len(results) == per_node * len(clients)
        assert all(
            results), "Not all streaming requests completed successfully."

    # Test single request to each node
    for node_idx, node_client in enumerate(clients):
        result = await make_streaming_request(node_client)
        assert result is not None
        print(
            f"Hybrid LB node {node_idx} handled single streaming request successfully"
        )

    await asyncio.sleep(0.5)

    # Send streaming requests to all nodes
    num_requests_per_node = 25  # Total 50 requests across 2 nodes
    await run_burst(num_requests_per_node)

    await asyncio.sleep(0.5)

    # Second burst of streaming requests
    await run_burst(num_requests_per_node)

    # Recover the API server count from the first server's CLI arguments.
    _, server_args = servers[0]
    flag = '--api-server-count'
    if flag in server_args:
        api_server_count = server_args[server_args.index(flag) + 1] or 1
    else:
        api_server_count = 1
    print(f"Successfully completed hybrid LB streaming test with "
          f"{len(clients)} nodes ({DP_SIZE_LOCAL} DP ranks each, "
          f"API server count: {api_server_count})")

    # Check request balancing within each node
    for node_idx, (server, _) in enumerate(servers):
        print(f"Checking streaming request balancing for node {node_idx}")
        check_request_balancing(server, DP_SIZE_LOCAL)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import os
import threading
import time
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
from tests.utils import RemoteOpenAIServer
from tests.v1.test_utils import check_request_balancing
from vllm.platforms import Platform
# Model passed to every server manager and every request in this module.
MODEL_NAME = "ibm-research/PowerMoE-3b"
# Number of data parallel ranks for multi-node internal LB testing
# (read from the DP_SIZE environment variable, defaulting to 2).
DP_SIZE = int(os.getenv("DP_SIZE", "2"))
# Default tensor parallel size to use
# (read from the TP_SIZE environment variable, defaulting to 1).
TP_SIZE = int(os.getenv("TP_SIZE", "1"))
# Number of nodes to simulate; the `servers` fixture gives each node
# DP_SIZE // NUM_NODES data parallel ranks.
NUM_NODES = 2
class MultinodeInternalLBServerManager:
    """Manages multi-node data parallel vLLM server instances for internal
    load balancer testing using --headless mode.

    One server process is started per simulated node. The node holding DP
    rank 0 is the head node: it serves the API on port 8000. Every other
    node is started with ``--headless`` and its own start rank.

    Used as a context manager; ``__enter__`` returns the list of
    ``(server, args)`` pairs ordered by start rank, so index 0 is always
    the head node.
    """

    def __init__(self,
                 model_name: str,
                 dp_size: int,
                 api_server_count: int,
                 base_server_args: list,
                 dp_per_node: int = 1,
                 tp_size: int = TP_SIZE):
        """Store the topology; no process is started until ``__enter__``.

        Args:
            model_name: Model handed to every server.
            dp_size: Total number of data parallel ranks.
            api_server_count: Value for ``--api-server-count`` on the head.
            base_server_args: CLI arguments shared by all nodes (copied,
                never mutated).
            dp_per_node: Data parallel ranks hosted by each node.
            tp_size: Tensor parallel size of each DP rank.
        """
        self.model_name = model_name
        self.dp_size = dp_size
        self.dp_per_node = dp_per_node
        self.tp_size = tp_size
        self.api_server_count = api_server_count
        self.base_server_args = base_server_args
        self.servers: list[tuple[RemoteOpenAIServer, list[str]]] = []
        self.server_threads: list[threading.Thread] = []

    def _build_server_args(self, rank: int) -> list[str]:
        """Return the full CLI argument list for the node starting at rank."""
        server_args = self.base_server_args.copy()
        if rank == 0:
            # Head node - runs API server and first DP rank
            server_args.extend([
                "--data-parallel-size",
                str(self.dp_size),
                "--data-parallel-size-local",
                str(self.dp_per_node),
                "--tensor-parallel-size",
                str(self.tp_size),
                "--port",
                "8000",  # Single endpoint for all requests
                "--api-server-count",
                str(self.api_server_count),
                "--data-parallel-address",
                "127.0.0.1",
                "--data-parallel-rpc-port",
                "13345",
            ])
        else:
            # Secondary nodes - run in headless mode
            server_args.extend([
                "--headless",
                "--data-parallel-size",
                str(self.dp_size),
                "--data-parallel-size-local",
                str(self.dp_per_node),
                "--data-parallel-start-rank",
                str(rank),
                "--tensor-parallel-size",
                str(self.tp_size),
                "--data-parallel-address",
                "127.0.0.1",
                "--data-parallel-rpc-port",
                "13345",
            ])
        return server_args

    @staticmethod
    def _stop_all(running: list) -> None:
        """Best-effort shutdown of already-started servers, newest first."""
        for server, _ in reversed(running):
            try:
                server.__exit__(None, None, None)
            except Exception as e:
                print(f"Error stopping server: {e}")

    def __enter__(self) -> list[tuple[RemoteOpenAIServer, list[str]]]:
        """Start all server instances for multi-node internal LB mode.

        Nodes are started in parallel threads.

        Returns:
            ``self.servers``, ordered by start rank (head node first).

        Raises:
            Exception: if not every node started. Nodes that did start
                are stopped first, because ``with`` does not call
                ``__exit__`` when ``__enter__`` raises.
        """
        start_ranks = list(range(0, self.dp_size, self.dp_per_node))
        # Slot i belongs to start_ranks[i]. Filling by index keeps the
        # head node at position 0 no matter which thread finishes first.
        started: list = [None] * len(start_ranks)

        def start_server(slot: int, r: int, sargs: list[str]):
            gpus_per_node = self.tp_size * self.dp_per_node
            # Each DP rank occupies tp_size devices, so the node whose
            # first rank is r starts at device r * tp_size. Using r
            # directly would overlap neighbouring nodes when tp_size > 1.
            gpu_start = r * self.tp_size
            try:
                # Start the server
                server = RemoteOpenAIServer(
                    self.model_name,
                    sargs,
                    auto_port=False,
                    env_dict={
                        "CUDA_VISIBLE_DEVICES":
                        ",".join(
                            str(Platform.device_id_to_physical_device_id(i))
                            for i in range(gpu_start,
                                           gpu_start + gpus_per_node))
                    })
                server.__enter__()
                if r == 0:
                    print(f"Head node (rank {r}) started successfully with "
                          f"{self.api_server_count} API servers")
                else:
                    print(f"Headless node (rank {r}) started successfully")
                started[slot] = (server, sargs)
            except Exception as e:
                print(f"Failed to start server rank {r}: {e}")
                raise

        # Use a thread to start each server to allow parallel initialization
        for slot, rank in enumerate(start_ranks):
            thread = threading.Thread(target=start_server,
                                      args=(slot, rank,
                                            self._build_server_args(rank)))
            thread.start()
            self.server_threads.append(thread)

        # Wait for all servers to start
        for thread in self.server_threads:
            thread.join()

        running = [entry for entry in started if entry is not None]
        expected = self.dp_size // self.dp_per_node
        if len(running) != len(started) or len(running) != expected:
            # Thread exceptions do not propagate to this thread; a missing
            # slot is the failure signal. Clean up before reporting.
            self._stop_all(running)
            raise Exception("Servers failed to start")

        self.servers.extend(running)

        # Give servers additional time to fully initialize and coordinate
        time.sleep(3)
        return self.servers

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Stop all server instances.

        Best-effort: an error while stopping one server is printed and
        the remaining servers are still stopped.
        """
        while self.servers:
            try:
                self.servers.pop()[0].__exit__(exc_type, exc_val, exc_tb)
            except Exception as e:
                print(f"Error stopping server: {e}")
class APIOnlyServerManager:
    """Manages API-only server (Node 0) and headless engines server (Node 1)
    for testing separated API server and engine configuration.

    Used as a context manager; ``__enter__`` returns two ``(server, args)``
    pairs: index 0 is always the API-only server and index 1 the headless
    engines server.
    """

    def __init__(self,
                 model_name: str,
                 dp_size: int,
                 api_server_count: int,
                 base_server_args: list,
                 tp_size: int = TP_SIZE):
        """Store the topology; no process is started until ``__enter__``.

        Args:
            model_name: Model handed to both servers.
            dp_size: Total number of data parallel ranks (all of them run
                on the headless engines server).
            api_server_count: Value for ``--api-server-count``.
            base_server_args: CLI arguments shared by both servers
                (copied, never mutated).
            tp_size: Tensor parallel size of each DP rank.
        """
        self.model_name = model_name
        self.dp_size = dp_size
        self.tp_size = tp_size
        self.api_server_count = api_server_count
        self.base_server_args = base_server_args
        self.servers: list[tuple[RemoteOpenAIServer, list[str]]] = []
        self.server_threads: list[threading.Thread] = []

    def __enter__(self) -> list[tuple[RemoteOpenAIServer, list[str]]]:
        """Start API-only server and headless engines server.

        Both are started in parallel threads.

        Returns:
            ``self.servers`` as ``[api_server_entry, engines_server_entry]``.

        Raises:
            Exception: if either server failed to start. A server that
                did start is stopped first, because ``with`` does not
                call ``__exit__`` when ``__enter__`` raises.
        """
        # API-only server (Node 0) - no engines, only API server
        api_server_args = self.base_server_args.copy()
        api_server_args.extend([
            "--data-parallel-size",
            str(self.dp_size),
            "--data-parallel-size-local",
            "0",  # No engines on this node
            "--tensor-parallel-size",
            str(self.tp_size),
            "--port",
            "8000",
            "--api-server-count",
            str(self.api_server_count),
            "--data-parallel-address",
            "127.0.0.1",
            "--data-parallel-rpc-port",
            "13345",
        ])

        # Headless engines server (Node 1) - all engines, no API server
        engines_server_args = self.base_server_args.copy()
        engines_server_args.extend([
            "--headless",
            "--data-parallel-size",
            str(self.dp_size),
            "--data-parallel-size-local",
            str(self.dp_size),  # All engines on this node
            "--tensor-parallel-size",
            str(self.tp_size),
            "--data-parallel-address",
            "127.0.0.1",
            "--data-parallel-rpc-port",
            "13345",
        ])

        # Fixed slots: 0 = API-only server, 1 = headless engines server.
        # Appending on completion would let the faster thread decide the
        # order, while the fixtures rely on index 0 being the API server.
        started: list = [None, None]

        # Use threads to start both servers in parallel
        def start_api_server():
            try:
                server = RemoteOpenAIServer(
                    self.model_name,
                    api_server_args,
                    auto_port=False,
                    env_dict={})  # No GPUs needed for API-only server
                server.__enter__()
                print(f"API-only server started successfully with "
                      f"{self.api_server_count} API servers")
                started[0] = (server, api_server_args)
            except Exception as e:
                print(f"Failed to start API-only server: {e}")
                raise

        def start_engines_server():
            try:
                server = RemoteOpenAIServer(
                    self.model_name,
                    engines_server_args,
                    auto_port=False,
                    env_dict={
                        "CUDA_VISIBLE_DEVICES":
                        ",".join(
                            str(Platform.device_id_to_physical_device_id(i))
                            for i in range(self.dp_size * self.tp_size))
                    })
                server.__enter__()
                print(f"Headless engines server started successfully with "
                      f"{self.dp_size} engines")
                started[1] = (server, engines_server_args)
            except Exception as e:
                print(f"Failed to start headless engines server: {e}")
                raise

        # Start API server first
        api_thread = threading.Thread(target=start_api_server)
        api_thread.start()
        self.server_threads.append(api_thread)

        # Start engines server second
        engines_thread = threading.Thread(target=start_engines_server)
        engines_thread.start()
        self.server_threads.append(engines_thread)

        # Wait for both servers to start
        for thread in self.server_threads:
            thread.join()

        running = [entry for entry in started if entry is not None]
        if len(running) != 2:
            # Thread exceptions do not propagate to this thread; a missing
            # slot is the failure signal. Stop the survivor before raising.
            for server, _ in reversed(running):
                try:
                    server.__exit__(None, None, None)
                except Exception as e:
                    print(f"Error stopping server: {e}")
            raise Exception("Both servers failed to start")

        self.servers.extend(running)

        # Give servers additional time to fully initialize and coordinate
        time.sleep(3)
        return self.servers

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Stop both server instances.

        Best-effort: an error while stopping one server is printed and
        the other server is still stopped.
        """
        while self.servers:
            try:
                self.servers.pop()[0].__exit__(exc_type, exc_val, exc_tb)
            except Exception as e:
                print(f"Error stopping server: {e}")
@pytest.fixture(scope="module")
def default_server_args():
    """Return the CLI arguments shared by every server in this module."""
    # use half precision for speed and memory savings in CI environment
    args = ["--dtype", "bfloat16"]
    args += ["--max-model-len", "2048"]
    args += ["--max-num-seqs", "128"]
    args.append("--enforce-eager")
    return args
@pytest.fixture(scope="module", params=[1, 4])
def servers(request, default_server_args):
    """Yield the running multi-node internal LB servers.

    The fixture parameter is the API server count; each simulated node
    hosts ``DP_SIZE // NUM_NODES`` data parallel ranks.
    """
    dp_per_node = DP_SIZE // NUM_NODES
    manager = MultinodeInternalLBServerManager(
        MODEL_NAME,
        DP_SIZE,
        request.param,
        default_server_args,
        dp_per_node,
        TP_SIZE,
    )
    with manager as server_list:
        yield server_list
@pytest.fixture(scope="module", params=[1, 4])
def api_only_servers(request, default_server_args):
    """Fixture for API-only server + headless engines configuration.

    The fixture parameter is the API server count handed to the manager.
    """
    manager = APIOnlyServerManager(
        MODEL_NAME,
        DP_SIZE,
        request.param,
        default_server_args,
        TP_SIZE,
    )
    with manager as server_list:
        yield server_list
@pytest_asyncio.fixture
async def client(servers: list[tuple[RemoteOpenAIServer, list[str]]]):
    """Yield an async client bound to the first server in ``servers``.

    For internal LB only the head node (rank 0) is contacted, since it
    provides the single API endpoint.
    """
    head_server, _ = servers[0]
    async with head_server.get_async_client() as async_client:
        yield async_client
@pytest_asyncio.fixture
async def api_only_client(api_only_servers: list[tuple[RemoteOpenAIServer,
                                                       list[str]]]):
    """Client fixture for API-only server configuration.

    Connects to the first server in ``api_only_servers``, which is
    expected to be the API-only server.
    """
    api_server, _ = api_only_servers[0]
    async with api_server.get_async_client() as async_client:
        yield async_client
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_multinode_dp_completion(client: openai.AsyncOpenAI,
                                       servers: list[tuple[RemoteOpenAIServer,
                                                           list[str]]],
                                       model_name: str) -> None:
    """Run non-streaming completions through the head node.

    Sends one request, then two concurrent bursts of 50 requests, and
    finally checks request balancing across ``DP_SIZE`` ranks via the
    head server.
    """

    async def make_request():
        """Issue one sampled completion and sanity-check the response."""
        completion = await client.completions.create(
            model=model_name,
            prompt="Hello, my name is",
            max_tokens=10,
            temperature=1.0,
        )

        assert completion.id is not None
        assert completion.choices is not None and len(completion.choices) == 1

        choice = completion.choices[0]
        # With temperature=1.0 the exact text and stop condition vary, so
        # only require non-empty text and a plausible finish reason.
        assert len(choice.text) >= 1
        assert choice.finish_reason in ("length", "stop")

        # Token counts vary as well; they only need to be positive.
        usage = completion.usage
        assert usage.completion_tokens > 0
        assert usage.prompt_tokens > 0
        assert usage.total_tokens > 0

        return completion

    async def run_burst(count: int) -> None:
        """Fire ``count`` concurrent requests and verify all completed."""
        results = await asyncio.gather(
            *[make_request() for _ in range(count)])
        assert len(results) == count
        assert all(completion is not None for completion in results)

    # Test single request
    result = await make_request()
    assert result is not None
    print(
        "Multi-node internal LB handled single completion request successfully"
    )

    await asyncio.sleep(0.5)

    # Send multiple requests - internal LB should distribute across DP ranks
    num_requests = 50
    await run_burst(num_requests)

    await asyncio.sleep(0.5)

    # Second burst of requests
    await run_burst(num_requests)

    # Recover the API server count from the first server's CLI arguments.
    head_server, server_args = servers[0]
    flag = '--api-server-count'
    if flag in server_args:
        api_server_count = server_args[server_args.index(flag) + 1] or 1
    else:
        api_server_count = 1
    print(f"Successfully completed multi-node internal LB test with "
          f"{len(servers)} DP ranks (API server count: {api_server_count})")

    # Check request balancing via Prometheus metrics
    check_request_balancing(head_server, DP_SIZE)
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_multinode_dp_completion_streaming(client: openai.AsyncOpenAI,
                                                 servers: list[
                                                     tuple[RemoteOpenAIServer,
                                                           list[str]]],
                                                 model_name: str) -> None:
    """Run streaming completions through the head node.

    Each request is issued twice with temperature 0.0, once non-streamed
    and once streamed, and the concatenated stream must equal the
    non-streamed text. Sends one such request, then two concurrent bursts
    of 50, and finally checks request balancing across ``DP_SIZE`` ranks.
    """
    prompt = "What is an LLM?"

    async def make_streaming_request():
        """Compare a streamed completion with its non-streamed twin."""
        # Perform a non-streaming request to get the expected full output
        reference = await client.completions.create(
            model=model_name,
            prompt=prompt,
            max_tokens=5,
            temperature=0.0,
        )
        single_output = reference.choices[0].text

        # Perform the streaming request
        stream = await client.completions.create(
            model=model_name,
            prompt=prompt,
            max_tokens=5,
            temperature=0.0,
            stream=True,
        )
        received = [chunk async for chunk in stream]
        chunks: list[str] = [part.choices[0].text for part in received]
        finish_reason_count = sum(
            1 for part in received
            if part.choices[0].finish_reason is not None)
        last_chunk = received[-1] if received else None

        # finish reason should only return in the last block for OpenAI API
        assert finish_reason_count == 1, (
            "Finish reason should appear exactly once.")
        assert last_chunk is not None, (
            "Stream should have yielded at least one chunk.")
        assert last_chunk.choices[
            0].finish_reason == "length", "Finish reason should be 'length'."
        # Check that the combined text matches the non-streamed version.
        streamed_text = "".join(chunks)
        assert streamed_text == single_output, (
            "Streamed output should match non-streamed output.")
        return True  # Indicate success for this request

    async def run_burst(count: int) -> None:
        """Fire ``count`` concurrent requests and verify all succeeded."""
        results = await asyncio.gather(
            *[make_streaming_request() for _ in range(count)])
        assert len(results) == count
        assert all(
            results), "Not all streaming requests completed successfully."

    # Test single streaming request
    result = await make_streaming_request()
    assert result is not None
    print(
        "Multi-node internal LB handled single streaming request successfully")

    await asyncio.sleep(0.5)

    # Send multiple streaming requests - internal LB should distribute across
    # DP ranks
    num_requests = 50
    await run_burst(num_requests)

    await asyncio.sleep(0.5)

    # Second burst of streaming requests
    await run_burst(num_requests)

    # Recover the API server count from the first server's CLI arguments.
    head_server, server_args = servers[0]
    flag = '--api-server-count'
    if flag in server_args:
        api_server_count = server_args[server_args.index(flag) + 1] or 1
    else:
        api_server_count = 1
    print(f"Successfully completed multi-node internal LB streaming test with "
          f"{len(servers)} DP ranks (API server count: {api_server_count})")

    # Check request balancing via Prometheus metrics
    check_request_balancing(head_server, DP_SIZE)
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_api_only_multinode_dp_completion(
        api_only_client: openai.AsyncOpenAI,
        api_only_servers: list[tuple[RemoteOpenAIServer,
                                     list[str]]], model_name: str) -> None:
    """Test API-only server with all engines on separate headless server.

    Sends one request, then two concurrent bursts of 50 requests, and
    finally checks request balancing across ``DP_SIZE`` engines via the
    first server in ``api_only_servers``.
    """

    async def make_request():
        """Issue one sampled completion and sanity-check the response."""
        completion = await api_only_client.completions.create(
            model=model_name,
            prompt="Hello, my name is",
            max_tokens=10,
            temperature=1.0,
        )

        assert completion.id is not None
        assert completion.choices is not None and len(completion.choices) == 1

        choice = completion.choices[0]
        # With temperature=1.0 the exact text and stop condition vary, so
        # only require non-empty text and a plausible finish reason.
        assert len(choice.text) >= 1
        assert choice.finish_reason in ("length", "stop")

        # Token counts vary as well; they only need to be positive.
        usage = completion.usage
        assert usage.completion_tokens > 0
        assert usage.prompt_tokens > 0
        assert usage.total_tokens > 0

        return completion

    async def run_burst(count: int) -> None:
        """Fire ``count`` concurrent requests and verify all completed."""
        results = await asyncio.gather(
            *[make_request() for _ in range(count)])
        assert len(results) == count
        assert all(completion is not None for completion in results)

    # Test single request
    result = await make_request()
    assert result is not None
    print("API-only server handled single completion request successfully")

    await asyncio.sleep(0.5)

    # Send multiple requests - should be distributed across engines on
    # headless server
    num_requests = 50
    await run_burst(num_requests)

    await asyncio.sleep(0.5)

    # Second burst of requests
    await run_burst(num_requests)

    # Recover the API server count from the first server's CLI arguments.
    api_server, api_server_args = api_only_servers[0]
    flag = '--api-server-count'
    if flag in api_server_args:
        api_server_count = (
            api_server_args[api_server_args.index(flag) + 1] or 1)
    else:
        api_server_count = 1
    print(f"Successfully completed API-only multi-node test with {DP_SIZE} "
          f"engines on headless server (API server count: {api_server_count})")

    # Check request balancing via Prometheus metrics
    check_request_balancing(api_server, DP_SIZE)
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_api_only_multinode_dp_completion_streaming(
        api_only_client: openai.AsyncOpenAI,
        api_only_servers: list[tuple[RemoteOpenAIServer,
                                     list[str]]], model_name: str) -> None:
    """Test API-only server streaming with all engines on separate
    headless server.

    Each request is issued twice with temperature 0.0, once non-streamed
    and once streamed, and the concatenated stream must equal the
    non-streamed text. Sends one such request, then two concurrent bursts
    of 50, and finally checks request balancing across ``DP_SIZE`` engines.
    """
    prompt = "What is an LLM?"

    async def make_streaming_request():
        """Compare a streamed completion with its non-streamed twin."""
        # Perform a non-streaming request to get the expected full output
        reference = await api_only_client.completions.create(
            model=model_name,
            prompt=prompt,
            max_tokens=5,
            temperature=0.0,
        )
        single_output = reference.choices[0].text

        # Perform the streaming request
        stream = await api_only_client.completions.create(
            model=model_name,
            prompt=prompt,
            max_tokens=5,
            temperature=0.0,
            stream=True,
        )
        received = [chunk async for chunk in stream]
        chunks: list[str] = [part.choices[0].text for part in received]
        finish_reason_count = sum(
            1 for part in received
            if part.choices[0].finish_reason is not None)
        last_chunk = received[-1] if received else None

        # finish reason should only return in the last block for OpenAI API
        assert finish_reason_count == 1, (
            "Finish reason should appear exactly once.")
        assert last_chunk is not None, (
            "Stream should have yielded at least one chunk.")
        assert last_chunk.choices[
            0].finish_reason == "length", "Finish reason should be 'length'."
        # Check that the combined text matches the non-streamed version.
        streamed_text = "".join(chunks)
        assert streamed_text == single_output, (
            "Streamed output should match non-streamed output.")
        return True  # Indicate success for this request

    async def run_burst(count: int) -> None:
        """Fire ``count`` concurrent requests and verify all succeeded."""
        results = await asyncio.gather(
            *[make_streaming_request() for _ in range(count)])
        assert len(results) == count
        assert all(
            results), "Not all streaming requests completed successfully."

    # Test single streaming request
    result = await make_streaming_request()
    assert result is not None
    print("API-only server handled single streaming request successfully")

    await asyncio.sleep(0.5)

    # Send multiple streaming requests - should be distributed across engines
    num_requests = 50
    await run_burst(num_requests)

    await asyncio.sleep(0.5)

    # Second burst of streaming requests
    await run_burst(num_requests)

    # Recover the API server count from the first server's CLI arguments.
    api_server, api_server_args = api_only_servers[0]
    flag = '--api-server-count'
    if flag in api_server_args:
        api_server_count = (
            api_server_args[api_server_args.index(flag) + 1] or 1)
    else:
        api_server_count = 1
    print(f"Successfully completed API-only streaming test with {DP_SIZE} "
          f"engines on headless server (API server count: {api_server_count})")

    # Check request balancing via Prometheus metrics
    check_request_balancing(api_server, DP_SIZE)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment