".buildkite/vscode:/vscode.git/clone" did not exist on "3c3c547ce0b35fb9d43808e8609f5e86fc34cca1"
Commit 3fb4b5fa authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.18.0' into v0.18.0-ori

parents bcf25339 89138b21
...@@ -28,7 +28,7 @@ if dp_size > 1: ...@@ -28,7 +28,7 @@ if dp_size > 1:
sampling_params = SamplingParams(temperature=0.8, top_p=0.95) sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# set different `gpu_memory_utilization` and `swap_space` for different ranks, # set different `gpu_memory_utilization` for different ranks,
# to test if all ranks agree on the same kv cache configuration. # to test if all ranks agree on the same kv cache configuration.
llm = LLM( llm = LLM(
model="microsoft/Phi-mini-MoE-instruct", model="microsoft/Phi-mini-MoE-instruct",
...@@ -37,7 +37,6 @@ llm = LLM( ...@@ -37,7 +37,6 @@ llm = LLM(
enable_expert_parallel=int(os.getenv("ENABLE_EP", "0")) == 1, enable_expert_parallel=int(os.getenv("ENABLE_EP", "0")) == 1,
distributed_executor_backend="external_launcher", distributed_executor_backend="external_launcher",
gpu_memory_utilization=random.uniform(0.7, 0.9), gpu_memory_utilization=random.uniform(0.7, 0.9),
swap_space=random.randint(1, 4),
seed=0, seed=0,
) )
......
...@@ -66,7 +66,7 @@ def cpu_worker(rank, WORLD_SIZE, port1, port2): ...@@ -66,7 +66,7 @@ def cpu_worker(rank, WORLD_SIZE, port1, port2):
def gpu_worker(rank, WORLD_SIZE, port1, port2): def gpu_worker(rank, WORLD_SIZE, port1, port2):
torch.cuda.set_device(rank) torch.accelerator.set_device_index(rank)
pg1 = StatelessProcessGroup.create( pg1 = StatelessProcessGroup.create(
host="127.0.0.1", port=port1, rank=rank, world_size=WORLD_SIZE host="127.0.0.1", port=port1, rank=rank, world_size=WORLD_SIZE
) )
...@@ -79,11 +79,11 @@ def gpu_worker(rank, WORLD_SIZE, port1, port2): ...@@ -79,11 +79,11 @@ def gpu_worker(rank, WORLD_SIZE, port1, port2):
data = torch.tensor([rank]).cuda() data = torch.tensor([rank]).cuda()
pynccl1.all_reduce(data) pynccl1.all_reduce(data)
pg1.barrier() pg1.barrier()
torch.cuda.synchronize() torch.accelerator.synchronize()
if rank <= 2: if rank <= 2:
pynccl2.all_reduce(data) pynccl2.all_reduce(data)
pg2.barrier() pg2.barrier()
torch.cuda.synchronize() torch.accelerator.synchronize()
item = data[0].item() item = data[0].item()
print(f"rank: {rank}, item: {item}") print(f"rank: {rank}, item: {item}")
if rank == 3: if rank == 3:
......
...@@ -3,18 +3,26 @@ ...@@ -3,18 +3,26 @@
"""Tests for weight transfer engine backends. """Tests for weight transfer engine backends.
Unit tests for engine classes (parsing, validation, registry). Unit tests for engine classes (parsing, validation, registry).
Integration test for NCCL weight transfer between processes using Ray. Integration tests for NCCL and IPC weight transfer between processes using Ray.
""" """
import base64
import pickle
from unittest.mock import MagicMock from unittest.mock import MagicMock
import pytest import pytest
import ray import ray
import torch import torch
from torch.multiprocessing.reductions import reduce_tensor
from vllm.config.parallel import ParallelConfig from vllm.config.parallel import ParallelConfig
from vllm.config.weight_transfer import WeightTransferConfig from vllm.config.weight_transfer import WeightTransferConfig
from vllm.distributed.weight_transfer import WeightTransferEngineFactory from vllm.distributed.weight_transfer import WeightTransferEngineFactory
from vllm.distributed.weight_transfer.ipc_engine import (
IPCWeightTransferEngine,
IPCWeightTransferInitInfo,
IPCWeightTransferUpdateInfo,
)
from vllm.distributed.weight_transfer.nccl_engine import ( from vllm.distributed.weight_transfer.nccl_engine import (
NCCLWeightTransferEngine, NCCLWeightTransferEngine,
NCCLWeightTransferInitInfo, NCCLWeightTransferInitInfo,
...@@ -155,9 +163,29 @@ class TestEngineRegistry: ...@@ -155,9 +163,29 @@ class TestEngineRegistry:
engine = WeightTransferEngineFactory.create_engine(config, parallel_config) engine = WeightTransferEngineFactory.create_engine(config, parallel_config)
assert isinstance(engine, NCCLWeightTransferEngine) assert isinstance(engine, NCCLWeightTransferEngine)
def test_create_engine_ipc(self):
"""Test factory creates IPC engine."""
config = WeightTransferConfig(backend="ipc")
parallel_config = create_mock_parallel_config()
engine = WeightTransferEngineFactory.create_engine(config, parallel_config)
assert isinstance(engine, IPCWeightTransferEngine)
def test_create_engine_invalid_backend(self): def test_create_engine_invalid_backend(self):
"""Test factory raises for invalid backend.""" """Test factory raises for invalid backend."""
config = WeightTransferConfig(backend="invalid") # Pydantic validates Literal types at construction, so we can't create
# a config with an invalid backend. Instead, we test by directly
# accessing the registry or using model_construct to bypass validation.
from pydantic import ValidationError
# Test that Pydantic prevents invalid backend at construction
with pytest.raises(ValidationError):
WeightTransferConfig(backend="invalid")
# Test factory error by creating a config with valid backend but
# then manually modifying the backend attribute (bypassing validation)
config = WeightTransferConfig(backend="nccl")
# Use object.__setattr__ to bypass Pydantic validation
object.__setattr__(config, "backend", "invalid")
parallel_config = create_mock_parallel_config() parallel_config = create_mock_parallel_config()
with pytest.raises(ValueError, match="Invalid weight transfer backend"): with pytest.raises(ValueError, match="Invalid weight transfer backend"):
WeightTransferEngineFactory.create_engine(config, parallel_config) WeightTransferEngineFactory.create_engine(config, parallel_config)
...@@ -175,7 +203,7 @@ class TestEngineRegistry: ...@@ -175,7 +203,7 @@ class TestEngineRegistry:
def test_nccl_receive_weights_without_init_raises(): def test_nccl_receive_weights_without_init_raises():
"""Test that receive_weights raises if init_transfer_engine wasn't called.""" """Test that receive_weights raises if init_transfer_engine wasn't called."""
if torch.cuda.device_count() < 1: if torch.accelerator.device_count() < 1:
pytest.skip("Need at least 1 GPU for this test") pytest.skip("Need at least 1 GPU for this test")
config = WeightTransferConfig(backend="nccl") config = WeightTransferConfig(backend="nccl")
...@@ -223,7 +251,7 @@ def trainer_broadcast_tensor( ...@@ -223,7 +251,7 @@ def trainer_broadcast_tensor(
dtype = getattr(torch, tensor_dtype) dtype = getattr(torch, tensor_dtype)
tensor_to_send = torch.ones(tensor_shape, dtype=dtype, device="cuda:0") tensor_to_send = torch.ones(tensor_shape, dtype=dtype, device="cuda:0")
comm.broadcast(tensor_to_send, src=0, stream=torch.cuda.current_stream()) comm.broadcast(tensor_to_send, src=0, stream=torch.cuda.current_stream())
torch.cuda.synchronize() torch.accelerator.synchronize()
return True return True
...@@ -281,7 +309,7 @@ def inference_receive_tensor( ...@@ -281,7 +309,7 @@ def inference_receive_tensor(
shapes=[tensor_shape], shapes=[tensor_shape],
) )
engine.receive_weights(update_info, noop_load_weights) engine.receive_weights(update_info, noop_load_weights)
torch.cuda.synchronize() torch.accelerator.synchronize()
# Verify we received the tensor # Verify we received the tensor
success = False success = False
...@@ -308,7 +336,7 @@ def inference_receive_tensor( ...@@ -308,7 +336,7 @@ def inference_receive_tensor(
@pytest.mark.skipif( @pytest.mark.skipif(
torch.cuda.device_count() < 2, torch.accelerator.device_count() < 2,
reason="Need at least 2 GPUs to run NCCL weight transfer test.", reason="Need at least 2 GPUs to run NCCL weight transfer test.",
) )
def test_nccl_weight_transfer_between_processes(): def test_nccl_weight_transfer_between_processes():
...@@ -344,3 +372,442 @@ def test_nccl_weight_transfer_between_processes(): ...@@ -344,3 +372,442 @@ def test_nccl_weight_transfer_between_processes():
f"Received shape: {result['received_shape']}, " f"Received shape: {result['received_shape']}, "
f"Received sum: {result['received_sum']}" f"Received sum: {result['received_sum']}"
) )
# --- Unit Tests: IPCWeightTransferUpdateInfo Validation ---
class TestIPCWeightTransferUpdateInfoValidation:
    """Validation rules applied when building IPCWeightTransferUpdateInfo."""

    @staticmethod
    def _make_gpu_handle():
        """Skip on GPU-less hosts, otherwise build one shareable handle.

        Returns ``(tensor, gpu_uuid, ipc_handle)``. The source tensor is
        returned as well so the calling test keeps a reference to it for
        as long as it uses the handle.
        """
        if torch.accelerator.device_count() < 1:
            pytest.skip("Need at least 1 GPU for this test")
        tensor = torch.ones(10, 10, device="cuda:0")
        handle = reduce_tensor(tensor)
        uuid = str(torch.cuda.get_device_properties(0).uuid)
        return tensor, uuid, handle

    def test_valid_update_info(self):
        """Consistent field lengths construct cleanly and round-trip."""
        _tensor, uuid, handle = self._make_gpu_handle()

        info = IPCWeightTransferUpdateInfo(
            names=["layer.weight"],
            dtype_names=["float32"],
            shapes=[[10, 10]],
            ipc_handles=[{uuid: handle}],
        )

        assert info.names == ["layer.weight"]
        assert info.dtype_names == ["float32"]
        assert info.shapes == [[10, 10]]
        assert len(info.ipc_handles) == 1

    def test_mismatched_dtype_names_raises(self):
        """Two names but a single dtype name is rejected."""
        _tensor, uuid, handle = self._make_gpu_handle()
        two_handles = [{uuid: handle}, {uuid: handle}]

        with pytest.raises(ValueError, match="dtype_names"):
            IPCWeightTransferUpdateInfo(
                names=["layer.weight", "layer.bias"],
                dtype_names=["float32"],  # one entry short
                shapes=[[10, 10], [10]],
                ipc_handles=two_handles,
            )

    def test_mismatched_shapes_raises(self):
        """Two names but a single shape is rejected."""
        _tensor, uuid, handle = self._make_gpu_handle()
        two_handles = [{uuid: handle}, {uuid: handle}]

        with pytest.raises(ValueError, match="shapes"):
            IPCWeightTransferUpdateInfo(
                names=["layer.weight", "layer.bias"],
                dtype_names=["float32", "float32"],
                shapes=[[10, 10]],  # one entry short
                ipc_handles=two_handles,
            )

    def test_mismatched_ipc_handles_raises(self):
        """Two names but a single IPC handle map is rejected."""
        _tensor, uuid, handle = self._make_gpu_handle()
        one_handle = [{uuid: handle}]  # one entry short

        with pytest.raises(ValueError, match="ipc_handles"):
            IPCWeightTransferUpdateInfo(
                names=["layer.weight", "layer.bias"],
                dtype_names=["float32", "float32"],
                shapes=[[10, 10], [10]],
                ipc_handles=one_handle,
            )

    def test_valid_update_info_from_pickled(self, monkeypatch):
        """Base64-encoded pickled handles are decoded into ``ipc_handles``."""
        _tensor, uuid, handle = self._make_gpu_handle()
        monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")

        ipc_handles = [{uuid: handle}]
        encoded = base64.b64encode(pickle.dumps(ipc_handles)).decode("utf-8")

        info = IPCWeightTransferUpdateInfo(
            names=["layer.weight"],
            dtype_names=["float32"],
            shapes=[[10, 10]],
            ipc_handles_pickled=encoded,
        )

        assert info.ipc_handles == ipc_handles
        assert info.ipc_handles_pickled is None

    def test_pickled_requires_insecure_serialization_flag(self, monkeypatch):
        """Pickled handles are refused while the env flag is set to "0"."""
        monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "0")
        empty_payload = base64.b64encode(pickle.dumps([])).decode("utf-8")

        with pytest.raises(ValueError, match="VLLM_ALLOW_INSECURE_SERIALIZATION=1"):
            IPCWeightTransferUpdateInfo(
                names=[],
                dtype_names=[],
                shapes=[],
                ipc_handles_pickled=empty_payload,
            )

    def test_both_handles_and_pickled_raises(self):
        """Supplying raw and pickled handles together is rejected."""
        _tensor, uuid, handle = self._make_gpu_handle()
        ipc_handles = [{uuid: handle}]
        encoded = base64.b64encode(pickle.dumps(ipc_handles)).decode("utf-8")

        with pytest.raises(ValueError, match="Cannot specify both"):
            IPCWeightTransferUpdateInfo(
                names=["layer.weight"],
                dtype_names=["float32"],
                shapes=[[10, 10]],
                ipc_handles=ipc_handles,
                ipc_handles_pickled=encoded,
            )

    def test_neither_handles_nor_pickled_raises(self):
        """Omitting both handle fields is rejected."""
        with pytest.raises(ValueError, match="must be provided"):
            IPCWeightTransferUpdateInfo(
                names=["layer.weight"],
                dtype_names=["float32"],
                shapes=[[10, 10]],
            )

    def test_empty_lists_valid(self):
        """All-empty lists are an acceptable (no-op) update."""
        info = IPCWeightTransferUpdateInfo(
            names=[],
            dtype_names=[],
            shapes=[],
            ipc_handles=[],
        )
        assert len(info.names) == 0
# --- Unit Tests: IPC Engine Parsing ---
class TestIPCEngineParsing:
    """Checks for IPCWeightTransferEngine.parse_update_info."""

    @staticmethod
    def _require_gpu():
        """Skip the calling test when no accelerator is visible."""
        if torch.accelerator.device_count() < 1:
            pytest.skip("Need at least 1 GPU for this test")

    @staticmethod
    def _new_engine():
        """Build an IPC engine backed by the mock parallel config."""
        config = WeightTransferConfig(backend="ipc")
        parallel_config = create_mock_parallel_config()
        return IPCWeightTransferEngine(config, parallel_config)

    @staticmethod
    def _two_handles():
        """Create two GPU tensors and their per-UUID IPC handle maps.

        Returns ``(tensors, gpu_uuid, ipc_handles)``; the tensors are
        returned so the caller keeps them referenced during the test.
        """
        tensors = (
            torch.ones(100, 100, device="cuda:0"),
            torch.ones(50, device="cuda:0"),
        )
        first, second = (reduce_tensor(t) for t in tensors)
        uuid = str(torch.cuda.get_device_properties(0).uuid)
        return tensors, uuid, [{uuid: first}, {uuid: second}]

    def test_parse_update_info_valid(self):
        """A dict carrying raw ``ipc_handles`` parses into the dataclass."""
        self._require_gpu()
        engine = self._new_engine()
        _tensors, _uuid, ipc_handles = self._two_handles()

        payload = {
            "names": ["w1", "w2"],
            "dtype_names": ["float32", "bfloat16"],
            "shapes": [[100, 100], [50]],
            "ipc_handles": ipc_handles,
        }
        update_info = engine.parse_update_info(payload)

        assert isinstance(update_info, IPCWeightTransferUpdateInfo)
        assert update_info.names == ["w1", "w2"]
        assert update_info.dtype_names == ["float32", "bfloat16"]
        assert update_info.shapes == [[100, 100], [50]]
        assert len(update_info.ipc_handles) == 2

    def test_parse_update_info_pickled(self, monkeypatch):
        """A dict carrying pickled handles (HTTP path) is decoded on parse."""
        self._require_gpu()
        monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
        engine = self._new_engine()
        _tensors, gpu_uuid, ipc_handles = self._two_handles()

        encoded = base64.b64encode(pickle.dumps(ipc_handles)).decode("utf-8")
        payload = {
            "names": ["w1", "w2"],
            "dtype_names": ["float32", "bfloat16"],
            "shapes": [[100, 100], [50]],
            "ipc_handles_pickled": encoded,
        }
        update_info = engine.parse_update_info(payload)

        assert isinstance(update_info, IPCWeightTransferUpdateInfo)
        assert update_info.names == ["w1", "w2"]
        assert len(update_info.ipc_handles) == 2
        assert update_info.ipc_handles_pickled is None
        assert gpu_uuid in update_info.ipc_handles[0]
        assert gpu_uuid in update_info.ipc_handles[1]
# --- Integration Test: IPC Weight Transfer Between Ray Tasks ---
def get_physical_gpu_id(device_index: int = 0) -> str:
    """Return the stringified ``uuid`` property of CUDA device *device_index*."""
    return str(torch.cuda.get_device_properties(device_index).uuid)
@ray.remote(num_gpus=0.5)
class TrainerActor:
    """Ray actor that owns a GPU tensor and publishes its CUDA IPC handle."""

    def __init__(self, tensor_shape: list[int], tensor_dtype: str):
        # The tensor is stored on the actor so it outlives this constructor;
        # the IPC handle built from it is only usable while it is alive.
        self.tensor = torch.ones(
            tensor_shape,
            dtype=getattr(torch, tensor_dtype),
            device="cuda:0",
        )
        # 42 is the marker value the receiving side checks for.
        self.tensor.fill_(42.0)

        handle = reduce_tensor(self.tensor)
        uuid = get_physical_gpu_id(0)
        torch.accelerator.synchronize()

        self.ipc_handle_dict = {
            "ipc_handle": handle,
            "gpu_uuid": uuid,
            "shape": tensor_shape,
            "dtype": tensor_dtype,
        }

    def get_ipc_handle_dict(self) -> dict:
        """Return the handle/uuid/shape/dtype dict built in ``__init__``."""
        return self.ipc_handle_dict
@ray.remote(num_gpus=0.5)
def inference_receive_ipc_tensor(
    ipc_handle_dict: dict,
    mode: str = "ray",
) -> dict:
    """Receive one tensor through IPCWeightTransferEngine and verify it.

    Args:
        ipc_handle_dict: Dict with ``ipc_handle``, ``gpu_uuid``, ``shape``
            and ``dtype`` keys, as produced by ``TrainerActor``.
        mode: ``"ray"`` passes the handles directly; ``"http"`` passes them
            pickled and base64-encoded.

    Returns:
        Dict with ``success``, ``received_shape`` and ``received_sum``.

    Raises:
        ValueError: If *mode* is neither ``"ray"`` nor ``"http"``.
    """
    from unittest.mock import MagicMock

    import torch

    from vllm.config.parallel import ParallelConfig
    from vllm.config.weight_transfer import WeightTransferConfig
    from vllm.distributed.weight_transfer.ipc_engine import (
        IPCWeightTransferEngine,
    )

    # Engine wired to a single-rank mock parallel config.
    config = WeightTransferConfig(backend="ipc")
    mock_parallel = MagicMock(spec=ParallelConfig)
    mock_parallel.rank = 0
    mock_parallel.world_size = 1
    mock_parallel.data_parallel_rank = 0
    engine = IPCWeightTransferEngine(config, mock_parallel)

    # Initialisation step of the engine API (described as a no-op for IPC).
    engine.init_transfer_engine(IPCWeightTransferInitInfo())

    captured = []

    def capture_weights(weights: list[tuple[str, torch.Tensor]]):
        # Clone so the data is still readable after the engine cleans up.
        captured.extend((name, tensor.clone()) for name, tensor in weights)

    # Build the update dict and route it through parse_update_info so the
    # dataclass __post_init__ is exercised as well.
    handles = [{ipc_handle_dict["gpu_uuid"]: ipc_handle_dict["ipc_handle"]}]
    if mode not in ("ray", "http"):
        raise ValueError(f"Unknown mode: {mode}")
    update_dict: dict = {
        "names": ["test.weight"],
        "dtype_names": [ipc_handle_dict["dtype"]],
        "shapes": [ipc_handle_dict["shape"]],
    }
    if mode == "ray":
        update_dict["ipc_handles"] = handles
    else:
        update_dict["ipc_handles_pickled"] = base64.b64encode(
            pickle.dumps(handles)
        ).decode("utf-8")

    engine.receive_weights(engine.parse_update_info(update_dict), capture_weights)
    torch.accelerator.synchronize()

    # Exactly one tensor of the advertised shape, filled with 42s, is a pass.
    received_shape = None
    received_sum = None
    success = False
    if len(captured) == 1:
        _, tensor = captured[0]
        received_shape = list(tensor.shape)
        received_sum = tensor.sum().item()
        if received_shape == ipc_handle_dict["shape"]:
            element_count = torch.tensor(ipc_handle_dict["shape"]).prod().item()
            expected_sum = 42.0 * element_count
            success = abs(received_sum - expected_sum) < 0.01

    engine.shutdown()

    return {
        "success": success,
        "received_shape": received_shape,
        "received_sum": received_sum,
    }
@pytest.mark.skipif(
    torch.accelerator.device_count() < 1,
    reason="Need at least 1 GPU to run IPC weight transfer test.",
)
@pytest.mark.parametrize("mode", ["ray", "http"])
def test_ipc_weight_transfer_between_processes(mode: str):
    """Test IPC weight transfer from trainer to inference process using Ray.

    Parametrized over transport modes:
    - 'ray': ipc_handles passed directly.
    - 'http': ipc_handles pickled + base64-encoded, unpickled via __post_init__.

    IPC requires same-GPU access, so we use a placement group to co-locate
    the trainer actor and inference task on the same GPU.

    The placement group is removed in a ``finally`` block. It reserves a
    whole GPU, and both parametrized cases run in the same driver process;
    without the removal the second case would wait on ``pg.ready()`` for a
    GPU still held by the first case's group on single-GPU hosts.
    """
    from ray.util.placement_group import (
        placement_group,
        remove_placement_group,
    )
    from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy

    ray.init(ignore_reinit_error=True)

    # One bundle with a full GPU; the actor and the task each request half
    # a GPU so both fit inside that single bundle.
    pg = placement_group([{"GPU": 1, "CPU": 2}])
    try:
        ray.get(pg.ready())

        scheduling_strategy = PlacementGroupSchedulingStrategy(
            placement_group=pg,
            placement_group_capture_child_tasks=True,
        )

        # Tensor to transfer: 100x100 filled with 42s
        tensor_shape = [100, 100]
        tensor_dtype = "float32"

        # Create trainer actor that holds the tensor and IPC handle (stays alive)
        trainer_actor = TrainerActor.options(  # type: ignore[attr-defined]
            scheduling_strategy=scheduling_strategy
        ).remote(tensor_shape, tensor_dtype)

        # Get IPC handle dict (tensor stays alive in trainer actor)
        ipc_handle_dict = ray.get(trainer_actor.get_ipc_handle_dict.remote())

        # Receive tensor in inference process using IPC handles (on same GPU).
        # The trainer actor stays alive during this operation.
        inference_result = ray.get(
            inference_receive_ipc_tensor.options(
                scheduling_strategy=scheduling_strategy
            ).remote(ipc_handle_dict, mode=mode)
        )
    finally:
        # Release the reserved GPU/CPU bundle (and whatever is still
        # scheduled in it) so later tests can acquire the resources.
        remove_placement_group(pg)

    assert inference_result["success"], (
        f"IPC weight transfer failed (mode={mode}). "
        f"Received shape: {inference_result['received_shape']}, "
        f"Received sum: {inference_result['received_sum']}"
    )
def test_ipc_receive_weights_missing_gpu_uuid_raises():
    """receive_weights must fail when no handle is keyed by this GPU's UUID."""
    if torch.accelerator.device_count() < 1:
        pytest.skip("Need at least 1 GPU for this test")

    engine = IPCWeightTransferEngine(
        WeightTransferConfig(backend="ipc"),
        create_mock_parallel_config(),
    )

    # A real handle, but filed under a UUID that cannot match any device.
    source = torch.ones(10, 10, device="cuda:0")
    bogus_handles = [{"wrong-uuid-12345": reduce_tensor(source)}]

    update_info = IPCWeightTransferUpdateInfo(
        names=["w"],
        dtype_names=["float32"],
        shapes=[[10, 10]],
        ipc_handles=bogus_handles,
    )

    with pytest.raises(ValueError, match="IPC handle not found"):
        engine.receive_weights(update_info, lambda x: None)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Unit tests for Anthropic-to-OpenAI request conversion.
Tests the image source handling and tool_result content parsing in
AnthropicServingMessages._convert_anthropic_to_openai_request().
Also covers extended-thinking edge cases such as ``redacted_thinking``
blocks echoed back by Anthropic clients.
"""
from vllm.entrypoints.anthropic.protocol import (
AnthropicMessagesRequest,
)
from vllm.entrypoints.anthropic.serving import AnthropicServingMessages
# Short aliases for the two AnthropicServingMessages helpers under test;
# the tests below call them directly on the class, without an instance.
_convert = AnthropicServingMessages._convert_anthropic_to_openai_request
_img_url = AnthropicServingMessages._convert_image_source_to_url
def _make_request(
    messages: list[dict],
    **kwargs,
) -> AnthropicMessagesRequest:
    """Build an AnthropicMessagesRequest with fixed model and token limit.

    Extra keyword arguments (e.g. ``system``) are forwarded unchanged.
    """
    base_fields = {
        "model": "test-model",
        "max_tokens": 128,
        "messages": messages,
    }
    return AnthropicMessagesRequest(**base_fields, **kwargs)
# ======================================================================
# _convert_image_source_to_url
# ======================================================================
class TestConvertImageSourceToUrl:
    """Expected URL strings for each shape of Anthropic image ``source``."""

    def test_base64_source(self):
        """A base64 JPEG source becomes a ``data:`` URI."""
        url = _img_url(
            {
                "type": "base64",
                "media_type": "image/jpeg",
                "data": "iVBORw0KGgo=",
            }
        )
        assert url == "data:image/jpeg;base64,iVBORw0KGgo="

    def test_base64_png(self):
        """The media type of a base64 source is carried into the URI."""
        url = _img_url(
            {
                "type": "base64",
                "media_type": "image/png",
                "data": "AAAA",
            }
        )
        assert url == "data:image/png;base64,AAAA"

    def test_url_source(self):
        """A URL source is returned as-is."""
        url = _img_url(
            {
                "type": "url",
                "url": "https://example.com/image.jpg",
            }
        )
        assert url == "https://example.com/image.jpg"

    def test_missing_type_defaults_to_base64(self):
        """When 'type' is absent, treat as base64."""
        url = _img_url(
            {
                "media_type": "image/webp",
                "data": "UklGR",
            }
        )
        assert url == "data:image/webp;base64,UklGR"

    def test_missing_media_type_defaults_to_jpeg(self):
        """Without ``media_type`` the URI falls back to image/jpeg."""
        url = _img_url({"type": "base64", "data": "abc123"})
        assert url == "data:image/jpeg;base64,abc123"

    def test_url_source_missing_url_returns_empty(self):
        """A URL source with no ``url`` key yields an empty string."""
        url = _img_url({"type": "url"})
        assert url == ""

    def test_empty_source_returns_data_uri_shell(self):
        """An empty source yields a JPEG data URI with no payload."""
        empty: dict = {}
        assert _img_url(empty) == "data:image/jpeg;base64,"
# ======================================================================
# Image blocks inside user messages
# ======================================================================
class TestImageContentBlocks:
    """Image blocks in user messages are converted to ``image_url`` parts."""

    def test_base64_image_in_user_message(self):
        """Text + base64 image yields a text part and a data-URI image part."""
        image_block = {
            "type": "image",
            "source": {
                "type": "base64",
                "media_type": "image/jpeg",
                "data": "iVBORw0KGgo=",
            },
        }
        user_turn = {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image"},
                image_block,
            ],
        }

        converted = _convert(_make_request([user_turn]))

        user_msg = converted.messages[0]
        assert user_msg["role"] == "user"

        parts = user_msg["content"]
        assert len(parts) == 2
        assert parts[0] == {"type": "text", "text": "Describe this image"}
        assert parts[1] == {
            "type": "image_url",
            "image_url": {"url": "data:image/jpeg;base64,iVBORw0KGgo="},
        }

    def test_url_image_in_user_message(self):
        """Text + URL image yields an image part carrying that URL."""
        image_block = {
            "type": "image",
            "source": {
                "type": "url",
                "url": "https://example.com/cat.png",
            },
        }
        user_turn = {
            "role": "user",
            "content": [
                {"type": "text", "text": "What is this?"},
                image_block,
            ],
        }

        converted = _convert(_make_request([user_turn]))

        parts = converted.messages[0]["content"]
        assert parts[1] == {
            "type": "image_url",
            "image_url": {"url": "https://example.com/cat.png"},
        }
# ======================================================================
# tool_result content handling
# ======================================================================
class TestToolResultContent:
    """How ``tool_result`` content is split across converted messages."""

    @staticmethod
    def _tool_messages(result) -> list:
        """Return the converted messages whose role is ``tool``."""
        return [m for m in result.messages if m["role"] == "tool"]

    @staticmethod
    def _list_content_user_messages(result) -> list:
        """Return converted user messages whose content is a list."""
        return [
            m
            for m in result.messages
            if m["role"] == "user" and isinstance(m.get("content"), list)
        ]

    def _make_tool_result_request(
        self, tool_result_content
    ) -> AnthropicMessagesRequest:
        """Build a request with assistant tool_use followed by user
        tool_result."""
        assistant_turn = {
            "role": "assistant",
            "content": [
                {
                    "type": "tool_use",
                    "id": "call_001",
                    "name": "read_file",
                    "input": {"path": "/tmp/img.png"},
                }
            ],
        }
        user_turn = {
            "role": "user",
            "content": [
                {
                    "type": "tool_result",
                    "tool_use_id": "call_001",
                    "content": tool_result_content,
                }
            ],
        }
        return _make_request([assistant_turn, user_turn])

    def test_tool_result_string_content(self):
        """A plain string result becomes the tool message content."""
        result = _convert(self._make_tool_result_request("file contents here"))

        tool_msg = self._tool_messages(result)
        assert len(tool_msg) == 1
        assert tool_msg[0]["content"] == "file contents here"
        assert tool_msg[0]["tool_call_id"] == "call_001"

    def test_tool_result_text_blocks(self):
        """Multiple text blocks are joined with newlines."""
        blocks = [
            {"type": "text", "text": "line 1"},
            {"type": "text", "text": "line 2"},
        ]
        result = _convert(self._make_tool_result_request(blocks))

        tool_msg = self._tool_messages(result)
        assert len(tool_msg) == 1
        assert tool_msg[0]["content"] == "line 1\nline 2"

    def test_tool_result_with_image(self):
        """Image in tool_result should produce a follow-up user message."""
        blocks = [
            {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": "image/png",
                    "data": "AAAA",
                },
            }
        ]
        result = _convert(self._make_tool_result_request(blocks))

        tool_msg = self._tool_messages(result)
        assert len(tool_msg) == 1
        assert tool_msg[0]["content"] == ""

        # The image should be injected as a follow-up user message
        follow_up = self._list_content_user_messages(result)
        assert len(follow_up) == 1
        img_parts = follow_up[0]["content"]
        assert len(img_parts) == 1
        assert img_parts[0] == {
            "type": "image_url",
            "image_url": {"url": "data:image/png;base64,AAAA"},
        }

    def test_tool_result_with_text_and_image(self):
        """Mixed text+image tool_result: text in tool msg, image in user
        msg."""
        blocks = [
            {"type": "text", "text": "Here is the screenshot"},
            {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": "image/jpeg",
                    "data": "QUFB",
                },
            },
        ]
        result = _convert(self._make_tool_result_request(blocks))

        tool_msg = self._tool_messages(result)
        assert len(tool_msg) == 1
        assert tool_msg[0]["content"] == "Here is the screenshot"

        follow_up = self._list_content_user_messages(result)
        assert len(follow_up) == 1
        assert follow_up[0]["content"][0]["image_url"]["url"] == (
            "data:image/jpeg;base64,QUFB"
        )

    def test_tool_result_with_multiple_images(self):
        """Several images land, in order, in one follow-up user message."""
        blocks = [
            {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": "image/png",
                    "data": "IMG1",
                },
            },
            {
                "type": "image",
                "source": {
                    "type": "url",
                    "url": "https://example.com/img2.jpg",
                },
            },
        ]
        result = _convert(self._make_tool_result_request(blocks))

        follow_up = self._list_content_user_messages(result)
        assert len(follow_up) == 1
        urls = [part["image_url"]["url"] for part in follow_up[0]["content"]]
        assert urls == [
            "data:image/png;base64,IMG1",
            "https://example.com/img2.jpg",
        ]

    def test_tool_result_none_content(self):
        """A ``None`` result converts to an empty-string tool message."""
        result = _convert(self._make_tool_result_request(None))

        tool_msg = self._tool_messages(result)
        assert len(tool_msg) == 1
        assert tool_msg[0]["content"] == ""

    def test_tool_result_no_follow_up_when_no_images(self):
        """Ensure no extra user message is added when there are no images."""
        blocks = [
            {"type": "text", "text": "just text"},
        ]
        result = _convert(self._make_tool_result_request(blocks))

        user_follow_ups = self._list_content_user_messages(result)
        assert len(user_follow_ups) == 0
# ======================================================================
# Attribution header stripping
# ======================================================================
class TestAttributionHeaderStripping:
    """System-prompt handling with and without the billing header block."""

    _USER_TURN = {"role": "user", "content": "Hello"}

    def test_billing_header_stripped_from_system(self):
        """Claude Code's x-anthropic-billing-header block should be
        stripped to preserve prefix caching."""
        system_blocks = [
            {"type": "text", "text": "You are a helpful assistant."},
            {
                "type": "text",
                "text": "x-anthropic-billing-header: "
                "cc_version=2.1.37.abc; cc_entrypoint=cli;",
            },
        ]
        request = _make_request(
            [{"role": "user", "content": "Hello"}],
            system=system_blocks,
        )

        system_msg = _convert(request).messages[0]
        assert system_msg["role"] == "system"
        assert system_msg["content"] == "You are a helpful assistant."

    def test_system_without_billing_header_unchanged(self):
        """Normal system blocks should pass through unchanged."""
        system_blocks = [
            {"type": "text", "text": "You are a helpful assistant."},
            {"type": "text", "text": " Be concise."},
        ]
        request = _make_request(
            [{"role": "user", "content": "Hello"}],
            system=system_blocks,
        )

        system_msg = _convert(request).messages[0]
        assert system_msg["content"] == "You are a helpful assistant. Be concise."

    def test_system_string_unchanged(self):
        """String system prompts should pass through unchanged."""
        request = _make_request(
            [{"role": "user", "content": "Hello"}],
            system="You are a helpful assistant.",
        )

        system_msg = _convert(request).messages[0]
        assert system_msg["content"] == "You are a helpful assistant."
# ======================================================================
# Thinking block conversion (Anthropic → OpenAI)
# ======================================================================
class TestThinkingBlockConversion:
    """Thinking-block handling in the Anthropic→OpenAI request conversion.

    Assistant messages may carry ``thinking`` blocks; the conversion is
    expected to move their text into the ``reasoning`` field and keep it out
    of ``content``.

    This exercises the Anthropic-endpoint path, where the client sends the
    complete assistant message (thinking blocks emitted by vllm included)
    back in follow-up requests.
    """

    @staticmethod
    def _thinking_block(text, signature):
        """Build an Anthropic ``thinking`` content block."""
        return {"type": "thinking", "thinking": text, "signature": signature}

    @staticmethod
    def _assistant_messages(result):
        """Return every converted message whose role is ``assistant``."""
        return [msg for msg in result.messages if msg.get("role") == "assistant"]

    @classmethod
    def _only_assistant_message(cls, result):
        """Assert there is exactly one assistant message and return it."""
        found = cls._assistant_messages(result)
        assert len(found) == 1
        return found[0]

    def test_thinking_plus_text_in_assistant_message(self):
        """Thinking followed by text gives ``reasoning`` plus string content."""
        assistant_turn = {
            "role": "assistant",
            "content": [
                self._thinking_block(
                    "I should write a simple example.", "sig_abc123"
                ),
                {"type": "text", "text": "Sure! Here is the code."},
            ],
        }
        request = _make_request(
            [
                {"role": "user", "content": "Write me some code."},
                assistant_turn,
                {"role": "user", "content": "Can you fix the bug?"},
            ]
        )
        asst = self._only_assistant_message(_convert(request))

        # The thinking text belongs in ``reasoning``; ``content`` keeps only
        # the visible answer.
        assert asst.get("reasoning") == "I should write a simple example."
        assert asst.get("content") == "Sure! Here is the code."

    def test_thinking_only_in_assistant_message(self):
        """An assistant turn holding nothing but a thinking block.

        Such a turn can occur when the model has produced reasoning but no
        final answer yet (e.g. a mid-turn reasoning step). The converted
        content is expected to be None.
        """
        assistant_turn = {
            "role": "assistant",
            "content": [self._thinking_block("Just thinking...", "sig_xyz")],
        }
        request = _make_request(
            [
                {"role": "user", "content": "Hello"},
                assistant_turn,
                {"role": "user", "content": "Go on."},
            ]
        )
        asst = self._only_assistant_message(_convert(request))

        assert asst.get("reasoning") == "Just thinking..."
        # With no visible text the content key is either missing or None.
        assert asst.get("content") is None

    def test_thinking_plus_tool_use_in_assistant_message(self):
        """Thinking plus tool_use sets ``reasoning`` and fills ``tool_calls``."""
        assistant_turn = {
            "role": "assistant",
            "content": [
                self._thinking_block("I need to call the calculator.", "sig_tool"),
                {
                    "type": "tool_use",
                    "id": "call_001",
                    "name": "calculator",
                    "input": {"expression": "2+2"},
                },
            ],
        }
        tool_result_turn = {
            "role": "user",
            "content": [
                {
                    "type": "tool_result",
                    "tool_use_id": "call_001",
                    "content": "4",
                }
            ],
        }
        request = _make_request(
            [
                {"role": "user", "content": "What is 2+2?"},
                assistant_turn,
                tool_result_turn,
            ]
        )
        asst = self._only_assistant_message(_convert(request))

        assert asst.get("reasoning") == "I need to call the calculator."
        calls = list(asst.get("tool_calls", []))
        assert len(calls) == 1
        assert calls[0]["function"]["name"] == "calculator"
        # Reasoning and a tool call, but no accompanying text content.
        assert asst.get("content") is None

    def test_multiple_thinking_blocks_concatenated(self):
        """Several thinking blocks are joined in their original order."""
        assistant_turn = {
            "role": "assistant",
            "content": [
                self._thinking_block("First thought. ", "s1"),
                self._thinking_block("Second thought.", "s2"),
                {"type": "text", "text": "Done."},
            ],
        }
        request = _make_request(
            [
                {"role": "user", "content": "Think hard."},
                assistant_turn,
            ]
        )
        asst = self._only_assistant_message(_convert(request))

        assert asst.get("reasoning") == "First thought. Second thought."
        assert asst.get("content") == "Done."

    def test_no_thinking_blocks_unchanged(self):
        """An assistant message without thinking blocks is left as it was."""
        request = _make_request(
            [
                {"role": "user", "content": "Hi"},
                {"role": "assistant", "content": "Hello!"},
            ]
        )
        asst = self._only_assistant_message(_convert(request))

        assert asst.get("content") == "Hello!"
        assert "reasoning" not in asst

    def test_multi_turn_with_thinking_blocks(self):
        """Multi-turn history whose assistant turns all contain thinking.

        Every earlier assistant message must convert without a 400 error.
        This is the main regression scenario from the bug report: the vllm
        upgrade v0.15.1 → v0.17.0 added thinking-block support in responses,
        but sending those responses back in later requests triggered a
        Pydantic validation failure.
        """
        first_answer = {
            "role": "assistant",
            "content": [
                self._thinking_block("Reasoning for turn 1.", "s_t1"),
                {"type": "text", "text": "Answer for turn 1."},
            ],
        }
        second_answer = {
            "role": "assistant",
            "content": [
                self._thinking_block("Reasoning for turn 2.", "s_t2"),
                {"type": "text", "text": "Answer for turn 2."},
            ],
        }
        request = _make_request(
            [
                {"role": "user", "content": "Turn 1 question"},
                first_answer,
                {"role": "user", "content": "Turn 2 question"},
                second_answer,
                {"role": "user", "content": "Turn 3 question"},
            ]
        )
        # The conversion itself must not raise a ValidationError / 400.
        result = _convert(request)

        asst_msgs = self._assistant_messages(result)
        assert len(asst_msgs) == 2
        turn_one, turn_two = asst_msgs
        assert turn_one.get("reasoning") == "Reasoning for turn 1."
        assert turn_one.get("content") == "Answer for turn 1."
        assert turn_two.get("reasoning") == "Reasoning for turn 2."
        assert turn_two.get("content") == "Answer for turn 2."

    def test_redacted_thinking_block_is_accepted(self):
        """Redacted thinking blocks sent back by Anthropic clients are tolerated.

        vLLM should accept such blocks (so no 400 validation error occurs)
        and leave them out when building the OpenAI-format prompt.
        """
        assistant_turn = {
            "role": "assistant",
            "content": [
                self._thinking_block("Thinking...", "sig_think"),
                {
                    "type": "redacted_thinking",
                    "data": "BASE64_OR_OTHER_OPAQUE_DATA",
                },
                {"type": "text", "text": "Hi!"},
            ],
        }
        request = _make_request(
            [
                {"role": "user", "content": "Hello"},
                assistant_turn,
                {"role": "user", "content": "Continue"},
            ]
        )
        asst = self._only_assistant_message(_convert(request))

        # The redacted block is dropped; the ordinary thinking block still
        # ends up in ``reasoning``.
        assert asst.get("reasoning") == "Thinking..."
        assert asst.get("content") == "Hi!"
...@@ -145,6 +145,7 @@ async def test_request_cancellation(server: RemoteOpenAIServer): ...@@ -145,6 +145,7 @@ async def test_request_cancellation(server: RemoteOpenAIServer):
model=MODEL_NAME, model=MODEL_NAME,
max_tokens=10000, max_tokens=10000,
extra_body={"min_tokens": 10000}, extra_body={"min_tokens": 10000},
temperature=0.0,
) )
) )
tasks.append(task) tasks.append(task)
...@@ -163,7 +164,7 @@ async def test_request_cancellation(server: RemoteOpenAIServer): ...@@ -163,7 +164,7 @@ async def test_request_cancellation(server: RemoteOpenAIServer):
# be able to respond to this one within the timeout # be able to respond to this one within the timeout
client = server.get_async_client(timeout=5) client = server.get_async_client(timeout=5)
response = await client.chat.completions.create( response = await client.chat.completions.create(
messages=chat_input, model=MODEL_NAME, max_tokens=10 messages=chat_input, model=MODEL_NAME, max_tokens=10, temperature=0.0
) )
assert len(response.choices) == 1 assert len(response.choices) == 1
......
...@@ -17,6 +17,7 @@ from transformers import AutoTokenizer ...@@ -17,6 +17,7 @@ from transformers import AutoTokenizer
from tests.conftest import LocalAssetServer from tests.conftest import LocalAssetServer
from tests.utils import RemoteOpenAIServer from tests.utils import RemoteOpenAIServer
from vllm import version from vllm import version
from vllm.utils.network_utils import get_open_port
MODELS = { MODELS = {
"text": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", "text": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
...@@ -315,14 +316,26 @@ async def test_abort_metrics_reset( ...@@ -315,14 +316,26 @@ async def test_abort_metrics_reset(
client.completions.create( client.completions.create(
model=model_name, model=model_name,
prompt=prompt_ids, prompt=prompt_ids,
max_tokens=100, # Long generation to give time to abort max_tokens=500, # Long generation to give time to abort
temperature=0.0, temperature=0.0,
) )
) )
tasks.append(task) tasks.append(task)
# Wait a bit for requests to start processing # Poll until we see running requests rather than using a fixed sleep,
await asyncio.sleep(0.5) # since generation speed varies across hardware.
try:
await _poll_until(
lambda: _get_running_metrics_from_api(server)[0] > 0,
timeout=10.0,
interval=0.1,
description="running_requests > 0",
)
except TimeoutError:
for task in tasks:
task.cancel()
await asyncio.gather(*tasks, return_exceptions=True)
pytest.fail("Requests never appeared as running in metrics")
# Check that we have running requests # Check that we have running requests
running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api( running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api(
...@@ -336,13 +349,15 @@ async def test_abort_metrics_reset( ...@@ -336,13 +349,15 @@ async def test_abort_metrics_reset(
# Cancel all tasks to abort the requests # Cancel all tasks to abort the requests
for task in tasks: for task in tasks:
task.cancel() task.cancel()
await asyncio.gather(*tasks, return_exceptions=True)
# Wait for cancellations to be processed
await asyncio.sleep(1.0) # Poll until metrics reset rather than using a fixed sleep
await _poll_until(
# Check that metrics have reset to zero lambda: _get_running_metrics_from_api(server) == (0, 0, 0),
response = requests.get(server.url_for("metrics")) timeout=10.0,
assert response.status_code == HTTPStatus.OK interval=0.2,
description="gauge metrics back to zero",
)
# Verify running and waiting requests counts and KV cache usage are zero # Verify running and waiting requests counts and KV cache usage are zero
running_requests_after, waiting_requests_after, kv_cache_usage_after = ( running_requests_after, waiting_requests_after, kv_cache_usage_after = (
...@@ -360,6 +375,18 @@ async def test_abort_metrics_reset( ...@@ -360,6 +375,18 @@ async def test_abort_metrics_reset(
) )
async def _poll_until(
predicate, *, timeout: float, interval: float = 0.5, description: str = "condition"
):
"""Poll until predicate() returns True, or raise TimeoutError."""
start = time.time()
while time.time() - start < timeout:
if predicate():
return
await asyncio.sleep(interval)
raise TimeoutError(f"Timed out after {timeout}s waiting for: {description}")
def _get_running_metrics_from_api(server: RemoteOpenAIServer): def _get_running_metrics_from_api(server: RemoteOpenAIServer):
"""Return (running_count, waiting_count, kv_cache_usage)""" """Return (running_count, waiting_count, kv_cache_usage)"""
...@@ -399,7 +426,7 @@ def test_metrics_exist_run_batch(): ...@@ -399,7 +426,7 @@ def test_metrics_exist_run_batch():
input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}""" # noqa: E501 input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}""" # noqa: E501
base_url = "0.0.0.0" base_url = "0.0.0.0"
port = "8001" port = str(get_open_port())
server_url = f"http://{base_url}:{port}" server_url = f"http://{base_url}:{port}"
with ( with (
...@@ -420,24 +447,39 @@ def test_metrics_exist_run_batch(): ...@@ -420,24 +447,39 @@ def test_metrics_exist_run_batch():
"--model", "--model",
"intfloat/multilingual-e5-small", "intfloat/multilingual-e5-small",
"--enable-metrics", "--enable-metrics",
"--url", "--host",
base_url, base_url,
"--port", "--port",
port, port,
], ],
) )
def is_server_up(url): try:
def is_server_up(url):
try:
response = requests.get(url)
return response.status_code == 200
except requests.ConnectionError:
return False
start = time.time()
timeout = 120
while not is_server_up(server_url):
if proc.poll() is not None:
pytest.fail(
f"Batch process exited early with returncode={proc.returncode}"
)
if time.time() - start > timeout:
pytest.fail("Batch server did not start within timeout")
time.sleep(1)
response = requests.get(server_url + "/metrics")
assert response.status_code == HTTPStatus.OK
finally:
proc.terminate()
try: try:
response = requests.get(url) proc.wait(timeout=15)
return response.status_code == 200 except subprocess.TimeoutExpired:
except requests.ConnectionError: proc.kill()
return False proc.wait(timeout=5)
while not is_server_up(server_url):
time.sleep(1)
response = requests.get(server_url + "/metrics")
assert response.status_code == HTTPStatus.OK
proc.wait()
...@@ -195,18 +195,15 @@ def test_chat_batch_failure_cleanup(llm_for_failure_test): ...@@ -195,18 +195,15 @@ def test_chat_batch_failure_cleanup(llm_for_failure_test):
valid_msg = [{"role": "user", "content": "Hello"}] valid_msg = [{"role": "user", "content": "Hello"}]
long_text = "This is a very long text to test the error " * 50 long_text = "This is a very long text to test the error " * 50
invalid_msg = [{"role": "user", "content": long_text}] invalid_msg = [{"role": "user", "content": long_text}]
batch_1 = [
valid_msg, batch_1 = [valid_msg, valid_msg, invalid_msg]
valid_msg, batch_2 = [valid_msg, valid_msg]
invalid_msg,
]
batch_2 = [
valid_msg,
valid_msg,
]
sampling_params = SamplingParams(temperature=0, max_tokens=10) sampling_params = SamplingParams(temperature=0, max_tokens=10)
with pytest.raises(ValueError, match="context length is only"):
with pytest.raises(ValueError, match="maximum context length is"):
llm.chat(batch_1, sampling_params=sampling_params) llm.chat(batch_1, sampling_params=sampling_params)
assert llm.llm_engine.get_num_unfinished_requests() == 0
outputs_2 = llm.chat(batch_2, sampling_params=sampling_params) outputs_2 = llm.chat(batch_2, sampling_params=sampling_params)
assert len(outputs_2) == len(batch_2) assert len(outputs_2) == len(batch_2)
assert llm.llm_engine.get_num_unfinished_requests() == 0 assert llm.llm_engine.get_num_unfinished_requests() == 0
...@@ -13,7 +13,7 @@ from ...utils import create_new_process_for_each_test ...@@ -13,7 +13,7 @@ from ...utils import create_new_process_for_each_test
@pytest.mark.parametrize("backend", ["mp", "ray"]) @pytest.mark.parametrize("backend", ["mp", "ray"])
@create_new_process_for_each_test() @create_new_process_for_each_test()
def test_collective_rpc(tp_size, backend, monkeypatch): def test_collective_rpc(tp_size, backend, monkeypatch):
if torch.cuda.device_count() < tp_size: if torch.accelerator.device_count() < tp_size:
pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}") pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
if tp_size == 1 and backend == "ray": if tp_size == 1 and backend == "ray":
pytest.skip("Skip duplicate test case") pytest.skip("Skip duplicate test case")
......
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
# imports for structured outputs tests # imports for structured outputs tests
import json import json
from collections import defaultdict
import jsonschema import jsonschema
import openai # use the official client for correctness check import openai # use the official client for correctness check
...@@ -13,7 +14,11 @@ import requests ...@@ -13,7 +14,11 @@ import requests
import torch import torch
from openai import BadRequestError from openai import BadRequestError
from ...utils import RemoteOpenAIServer from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionRequest,
)
from vllm.sampling_params import SamplingParams
# any model with a chat template should work here # any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
...@@ -815,3 +820,203 @@ async def test_invocations(server: RemoteOpenAIServer, client: openai.AsyncOpenA ...@@ -815,3 +820,203 @@ async def test_invocations(server: RemoteOpenAIServer, client: openai.AsyncOpenA
assert chat_output.keys() == invocation_output.keys() assert chat_output.keys() == invocation_output.keys()
assert chat_output["choices"] == invocation_output["choices"] assert chat_output["choices"] == invocation_output["choices"]
# Test n parameter for chat completions
# Tests that the n parameter works correctly for regular sampling
# (non-beam search) in chat completions, addressing issue #34305.
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_chat_completion_n_parameter_non_streaming(
    client: openai.AsyncOpenAI, model_name: str
):
    """Non-streaming chat completion with ``n=3`` yields three indexed choices."""
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the opposite of big?"},
    ]

    completion = await client.chat.completions.create(
        model=model_name,
        messages=conversation,
        max_completion_tokens=20,
        temperature=0.7,
        n=3,
        stream=False,
    )
    choices = completion.choices
    assert len(choices) == 3

    # Each choice must be non-empty and carry its own position as index.
    for position, choice in enumerate(choices):
        assert choice.index == position
        assert choice.message.content is not None
        assert len(choice.message.content) > 0

    # With temperature > 0 the sampled answers are very likely to differ.
    distinct_texts = {choice.message.content for choice in choices}
    assert len(distinct_texts) > 1, "Expected different responses with n=3"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_chat_completion_n_parameter_streaming(
    client: openai.AsyncOpenAI, model_name: str
):
    """Streaming chat completion with ``n=2`` delivers content for both choices."""
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the capital of France?"},
    ]

    stream = await client.chat.completions.create(
        model=model_name,
        messages=conversation,
        max_completion_tokens=15,
        temperature=0.7,
        n=2,
        stream=True,
    )

    # Group the streamed text deltas by the choice index they belong to.
    pieces_by_choice = defaultdict(list)
    async for chunk in stream:
        for choice in chunk.choices:
            piece = choice.delta.content
            if piece:
                pieces_by_choice[choice.index].append(piece)

    first_pieces = pieces_by_choice[0]
    second_pieces = pieces_by_choice[1]
    assert len(first_pieces) > 0, "Choice 0 received no content chunks"
    assert len(second_pieces) > 0, "Choice 1 received no content chunks"

    # Stitch the deltas back together into the complete answers.
    first_text = "".join(first_pieces)
    second_text = "".join(second_pieces)
    assert len(first_text) > 0, "Choice 0 has empty response"
    assert len(second_text) > 0, "Choice 1 has empty response"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_chat_completion_n_with_seed(client: openai.AsyncOpenAI, model_name: str):
    """A request combining ``seed`` with ``n=2`` returns two valid choices."""
    conversation = [{"role": "user", "content": "Say hello."}]

    # The seed must be accepted alongside n > 1.
    completion = await client.chat.completions.create(
        model=model_name,
        messages=conversation,
        max_completion_tokens=10,
        temperature=0.8,
        n=2,
        seed=42,
        stream=False,
    )

    choices = completion.choices
    assert len(choices) == 2

    # Both choices need the right index and non-empty content.
    for position, choice in enumerate(choices):
        assert choice.index == position
        assert choice.message.content is not None
        assert len(choice.message.content) > 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_chat_completion_n_equals_1(client: openai.AsyncOpenAI, model_name: str):
    """An explicit ``n=1`` (the default value) still yields a single choice."""
    conversation = [{"role": "user", "content": "Hello!"}]

    completion = await client.chat.completions.create(
        model=model_name,
        messages=conversation,
        max_completion_tokens=10,
        temperature=0.7,
        n=1,
        stream=False,
    )

    assert len(completion.choices) == 1
    only_choice = completion.choices[0]
    assert only_choice.index == 0
    assert only_choice.message.content is not None
# Unit tests for n parameter in ChatCompletionRequest.to_sampling_params()
def test_chat_completion_request_n_parameter_to_sampling_params():
    """``n`` on the request is carried over into the built ``SamplingParams``."""
    # Build a request asking for three completions.
    request = ChatCompletionRequest(
        model="test-model",
        messages=[{"role": "user", "content": "Hello"}],
        n=3,
        max_tokens=10,
    )

    params = request.to_sampling_params(max_tokens=10, default_sampling_params={})

    assert isinstance(params, SamplingParams)
    assert params.n == 3, f"Expected n=3, got n={params.n}"
def test_chat_completion_request_n_parameter_default():
    """Leaving ``n`` out gives 1 on the request and on its ``SamplingParams``."""
    # ``n`` is deliberately omitted from the request.
    request = ChatCompletionRequest(
        model="test-model",
        messages=[{"role": "user", "content": "Hello"}],
        max_tokens=10,
    )
    assert request.n == 1, "n should default to 1"

    params = request.to_sampling_params(max_tokens=10, default_sampling_params={})

    # The default must also be 1 after conversion to sampling params.
    assert params.n == 1, f"Expected n=1 (default), got n={params.n}"
def test_chat_completion_request_n_parameter_various_values():
    """Several ``n`` values each make it through to ``SamplingParams``."""
    candidate_values = [1, 2, 5, 10]
    for n_value in candidate_values:
        request = ChatCompletionRequest(
            model="test-model",
            messages=[{"role": "user", "content": "Test"}],
            n=n_value,
            max_tokens=10,
        )
        params = request.to_sampling_params(
            max_tokens=10, default_sampling_params={}
        )
        assert params.n == n_value, f"Expected n={n_value}, got n={params.n}"
...@@ -7,10 +7,9 @@ import openai # use the official client for correctness check ...@@ -7,10 +7,9 @@ import openai # use the official client for correctness check
import pytest import pytest
import pytest_asyncio import pytest_asyncio
from tests.utils import RemoteOpenAIServer
from vllm.config import ModelConfig from vllm.config import ModelConfig
from ...utils import RemoteOpenAIServer
# # any model with a chat template should work here # # any model with a chat template should work here
MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct" MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"
......
...@@ -2,18 +2,18 @@ ...@@ -2,18 +2,18 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass, field from dataclasses import dataclass, field
from http import HTTPStatus
from typing import Any from typing import Any
from unittest.mock import AsyncMock, MagicMock from unittest.mock import AsyncMock, MagicMock, patch
import pytest import pytest
from vllm.config.multimodal import MultiModalConfig from vllm.config.multimodal import MultiModalConfig
from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
from vllm.entrypoints.openai.engine.protocol import ErrorResponse from vllm.entrypoints.openai.engine.protocol import GenerationError
from vllm.entrypoints.openai.models.protocol import BaseModelPath from vllm.entrypoints.openai.models.protocol import BaseModelPath
from vllm.entrypoints.openai.models.serving import OpenAIServingModels from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
from vllm.outputs import CompletionOutput, RequestOutput from vllm.outputs import CompletionOutput, RequestOutput
from vllm.renderers.hf import HfRenderer from vllm.renderers.hf import HfRenderer
from vllm.tokenizers.registry import tokenizer_args_from_config from vllm.tokenizers.registry import tokenizer_args_from_config
...@@ -44,7 +44,7 @@ class MockModelConfig: ...@@ -44,7 +44,7 @@ class MockModelConfig:
tokenizer_revision = None tokenizer_revision = None
multimodal_config = MultiModalConfig() multimodal_config = MultiModalConfig()
hf_config = MockHFConfig() hf_config = MockHFConfig()
logits_processor_pattern = None hf_text_config = MockHFConfig()
logits_processors: list[str] | None = None logits_processors: list[str] | None = None
diff_sampling_param: dict | None = None diff_sampling_param: dict | None = None
allowed_local_media_path: str = "" allowed_local_media_path: str = ""
...@@ -54,16 +54,28 @@ class MockModelConfig: ...@@ -54,16 +54,28 @@ class MockModelConfig:
media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
skip_tokenizer_init = False skip_tokenizer_init = False
is_encoder_decoder: bool = False is_encoder_decoder: bool = False
is_multimodal_model: bool = False
def get_diff_sampling_param(self): def get_diff_sampling_param(self):
return self.diff_sampling_param or {} return self.diff_sampling_param or {}
@dataclass
class MockParallelConfig:
    """Minimal parallel-config stand-in used when building the test renderer."""

    # Only this single attribute is provided; it defaults to 0.
    _api_process_rank: int = 0
@dataclass
class MockVllmConfig:
    """Pairs the mock model config with the mock parallel config."""

    # Both fields are required; neither has a default.
    model_config: MockModelConfig
    parallel_config: MockParallelConfig
def _build_renderer(model_config: MockModelConfig): def _build_renderer(model_config: MockModelConfig):
_, tokenizer_name, _, kwargs = tokenizer_args_from_config(model_config) _, tokenizer_name, _, kwargs = tokenizer_args_from_config(model_config)
return HfRenderer( return HfRenderer.from_config(
model_config, MockVllmConfig(model_config, parallel_config=MockParallelConfig()),
tokenizer_kwargs={**kwargs, "tokenizer_name": tokenizer_name}, tokenizer_kwargs={**kwargs, "tokenizer_name": tokenizer_name},
) )
...@@ -73,10 +85,20 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat: ...@@ -73,10 +85,20 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
engine_client=engine, engine_client=engine,
base_model_paths=BASE_MODEL_PATHS, base_model_paths=BASE_MODEL_PATHS,
) )
serving_render = OpenAIServingRender(
model_config=engine.model_config,
renderer=engine.renderer,
io_processor=engine.io_processor,
model_registry=models.registry,
request_logger=None,
chat_template=None,
chat_template_content_format="auto",
)
serving_chat = OpenAIServingChat( serving_chat = OpenAIServingChat(
engine, engine,
models, models,
response_role="assistant", response_role="assistant",
openai_serving_render=serving_render,
request_logger=None, request_logger=None,
chat_template=None, chat_template=None,
chat_template_content_format="auto", chat_template_content_format="auto",
...@@ -89,7 +111,9 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat: ...@@ -89,7 +111,9 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
[{"prompt_token_ids": [1, 2, 3]}], [{"prompt_token_ids": [1, 2, 3]}],
) )
serving_chat._preprocess_chat = AsyncMock(side_effect=_fake_preprocess_chat) serving_chat.openai_serving_render._preprocess_chat = AsyncMock(
side_effect=_fake_preprocess_chat
)
return serving_chat return serving_chat
...@@ -139,12 +163,8 @@ async def test_chat_error_non_stream(): ...@@ -139,12 +163,8 @@ async def test_chat_error_non_stream():
stream=False, stream=False,
) )
response = await serving_chat.create_chat_completion(request) with pytest.raises(GenerationError):
await serving_chat.create_chat_completion(request)
assert isinstance(response, ErrorResponse)
assert response.error.type == "InternalServerError"
assert response.error.message == "Internal server error"
assert response.error.code == HTTPStatus.INTERNAL_SERVER_ERROR
@pytest.mark.asyncio @pytest.mark.asyncio
...@@ -227,3 +247,152 @@ async def test_chat_error_stream(): ...@@ -227,3 +247,152 @@ async def test_chat_error_stream():
f"Expected error message in chunks: {chunks}" f"Expected error message in chunks: {chunks}"
) )
assert chunks[-1] == "data: [DONE]\n\n" assert chunks[-1] == "data: [DONE]\n\n"
@pytest.mark.parametrize(
    "image_content",
    [
        [{"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}}],
        [{"image_url": {"url": "https://example.com/image.jpg"}}],
    ],
)
def test_system_message_warns_on_image(image_content):
    """Image content inside a system message must produce a logged warning."""
    system_message = {"role": "system", "content": image_content}
    logger_target = "vllm.entrypoints.openai.chat_completion.protocol.logger"

    with patch(logger_target) as mock_logger:
        ChatCompletionRequest(model=MODEL_NAME, messages=[system_message])

        warn_once = mock_logger.warning_once
        warn_once.assert_called()
        # Inspect the stringified call to check what was logged.
        logged = str(warn_once.call_args)
        assert "System messages should only contain text" in logged
        assert "image_url" in logged
def test_system_message_accepts_text():
    """A system message with plain string content is accepted."""
    system_message = {"role": "system", "content": "You are a helpful assistant."}

    # Constructing the request must succeed without raising.
    request = ChatCompletionRequest(model=MODEL_NAME, messages=[system_message])

    assert request.messages[0]["role"] == "system"
def test_system_message_accepts_text_array():
    """A system message whose content is a list of text parts is accepted."""
    text_parts = [{"type": "text", "text": "You are a helpful assistant."}]
    system_message = {"role": "system", "content": text_parts}

    # Constructing the request must succeed without raising.
    request = ChatCompletionRequest(model=MODEL_NAME, messages=[system_message])

    assert request.messages[0]["role"] == "system"
def test_user_message_accepts_image():
    """User messages are still allowed to mix text and image content."""
    text_part = {"type": "text", "text": "What's in this image?"}
    image_part = {
        "type": "image_url",
        "image_url": {"url": "https://example.com/image.jpg"},
    }
    user_message = {"role": "user", "content": [text_part, image_part]}

    # Constructing the request must succeed without raising.
    request = ChatCompletionRequest(model=MODEL_NAME, messages=[user_message])

    assert request.messages[0]["role"] == "user"
@pytest.mark.parametrize(
    "audio_content",
    [
        [
            {
                "type": "input_audio",
                "input_audio": {"data": "base64data", "format": "wav"},
            }
        ],
        [{"input_audio": {"data": "base64data", "format": "wav"}}],
    ],
)
def test_system_message_warns_on_audio(audio_content):
    """Audio content inside a system message must produce a logged warning."""
    system_message = {"role": "system", "content": audio_content}
    logger_target = "vllm.entrypoints.openai.chat_completion.protocol.logger"

    with patch(logger_target) as mock_logger:
        ChatCompletionRequest(model=MODEL_NAME, messages=[system_message])

        warn_once = mock_logger.warning_once
        warn_once.assert_called()
        # Inspect the stringified call to check what was logged.
        logged = str(warn_once.call_args)
        assert "System messages should only contain text" in logged
        assert "input_audio" in logged
@pytest.mark.parametrize(
    "video_content",
    [
        [{"type": "video_url", "video_url": {"url": "https://example.com/video.mp4"}}],
        [{"video_url": {"url": "https://example.com/video.mp4"}}],
    ],
)
def test_system_message_warns_on_video(video_content):
    """Video content inside a system message must produce a logged warning."""
    system_message = {"role": "system", "content": video_content}
    logger_target = "vllm.entrypoints.openai.chat_completion.protocol.logger"

    with patch(logger_target) as mock_logger:
        ChatCompletionRequest(model=MODEL_NAME, messages=[system_message])

        warn_once = mock_logger.warning_once
        warn_once.assert_called()
        # Inspect the stringified call to check what was logged.
        logged = str(warn_once.call_args)
        assert "System messages should only contain text" in logged
        assert "video_url" in logged
def test_json_schema_response_format_missing_schema():
    """A ``json_schema`` response format without its schema is rejected.

    If ``response_format`` has type 'json_schema' but no ``json_schema``
    field, building the request should fail validation, so that the API
    answers with 400 rather than 500.
    """
    incomplete_format = {"type": "json_schema"}
    user_message = {"role": "user", "content": "hello"}

    with pytest.raises(Exception, match="json_schema.*must be provided"):
        ChatCompletionRequest(
            model=MODEL_NAME,
            messages=[user_message],
            response_format=incomplete_format,
        )
...@@ -5,10 +5,9 @@ import openai ...@@ -5,10 +5,9 @@ import openai
import pytest import pytest
import pytest_asyncio import pytest_asyncio
from tests.utils import RemoteOpenAIServer
from vllm.config import ModelConfig from vllm.config import ModelConfig
from ...utils import RemoteOpenAIServer
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct" MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
......
...@@ -5,7 +5,7 @@ import openai # use the official client for correctness check ...@@ -5,7 +5,7 @@ import openai # use the official client for correctness check
import pytest import pytest
import pytest_asyncio import pytest_asyncio
from ...utils import RemoteOpenAIServer from tests.utils import RemoteOpenAIServer
# a reasoning and tool calling model # a reasoning and tool calling model
MODEL_NAME = "Qwen/QwQ-32B" MODEL_NAME = "Qwen/QwQ-32B"
......
...@@ -10,11 +10,12 @@ import pytest ...@@ -10,11 +10,12 @@ import pytest
import pytest_asyncio import pytest_asyncio
# downloading lora to test lora requests # downloading lora to test lora requests
from ...utils import RemoteOpenAIServer from tests.utils import ROCM_ENV_OVERRIDES, ROCM_EXTRA_ARGS, RemoteOpenAIServer
# any model with a chat template should work here # any model with a chat template should work here
MODEL_NAME = "Qwen/Qwen3-0.6B" MODEL_NAME = "Qwen/Qwen3-0.6B"
tools = [ tools = [
{ {
"type": "function", "type": "function",
...@@ -139,9 +140,12 @@ def server(): ...@@ -139,9 +140,12 @@ def server():
"qwen3", "qwen3",
"--gpu-memory-utilization", "--gpu-memory-utilization",
"0.4", "0.4",
] "--enforce-eager",
] + ROCM_EXTRA_ARGS
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: with RemoteOpenAIServer(
MODEL_NAME, args, env_dict=ROCM_ENV_OVERRIDES
) as remote_server:
yield remote_server yield remote_server
...@@ -226,12 +230,13 @@ def k2_server(): ...@@ -226,12 +230,13 @@ def k2_server():
"qwen3", "qwen3",
"--gpu-memory-utilization", "--gpu-memory-utilization",
"0.4", "0.4",
] ] + ROCM_EXTRA_ARGS
# hack to test kimi_k2 tool use tool_id format. # hack to test kimi_k2 tool use tool_id format.
# avoid error in is_deepseek_mla check by setting kv_lora_rank=null # avoid error in is_deepseek_mla check by setting kv_lora_rank=null
with RemoteOpenAIServer( with RemoteOpenAIServer(
MODEL_NAME, MODEL_NAME,
args, args,
env_dict=ROCM_ENV_OVERRIDES,
override_hf_configs={"model_type": "kimi_k2", "kv_lora_rank": None}, override_hf_configs={"model_type": "kimi_k2", "kv_lora_rank": None},
) as remote_server: ) as remote_server:
yield remote_server yield remote_server
...@@ -294,7 +299,10 @@ async def test_no_args_tool_call( ...@@ -294,7 +299,10 @@ async def test_no_args_tool_call(
"type": "function", "type": "function",
"function": { "function": {
"name": "get_current_time", "name": "get_current_time",
"description": "Get the current date and time. No parameters needed.", "description": (
"Get the current date and time. Call this when the user "
"asks what time or date it is. No parameters needed."
),
"parameters": { "parameters": {
"type": "object", "type": "object",
"properties": {}, # No parameters "properties": {}, # No parameters
...@@ -303,10 +311,28 @@ async def test_no_args_tool_call( ...@@ -303,10 +311,28 @@ async def test_no_args_tool_call(
}, },
} }
] ]
messages = [{"role": "user", "content": "What time is it now?"}] messages = [
{
"role": "system",
"content": (
"You are a helpful assistant. Always use the available tools "
"when relevant, and reply with a short sentence after "
"receiving a tool result."
),
},
{"role": "user", "content": "What time is it now?"},
]
shared_kwargs = dict(
model=model_name,
temperature=0.0,
seed=42,
extra_body={"chat_template_kwargs": {"enable_thinking": False}},
)
# Step 2: Send user message and let model decide whether to call the tool # Step 2: Send user message and let model decide whether to call the tool
response = await client.chat.completions.create( response = await client.chat.completions.create(
model=model_name, **shared_kwargs,
messages=messages, messages=messages,
tools=tools, tools=tools,
tool_choice="auto", # Let model choose automatically tool_choice="auto", # Let model choose automatically
...@@ -334,11 +360,15 @@ async def test_no_args_tool_call( ...@@ -334,11 +360,15 @@ async def test_no_args_tool_call(
) )
# Step 5: Send tool result back to model to continue conversation # Step 5: Send tool result back to model to continue conversation
final_response = await client.chat.completions.create( final_response = await client.chat.completions.create(
model=model_name, **shared_kwargs,
messages=messages, messages=messages,
max_completion_tokens=128,
) )
# Output final natural language response # Output final natural language response
assert final_response.choices[0].message.content is not None assert (
final_response.choices[0].message.content is not None
and final_response.choices[0].message.content.strip() != ""
)
else: else:
# No tool called — just print model's direct reply # No tool called — just print model's direct reply
...@@ -484,3 +514,27 @@ async def test_inconsistent_tool_choice_and_tools( ...@@ -484,3 +514,27 @@ async def test_inconsistent_tool_choice_and_tools(
], ],
tool_choice={}, tool_choice={},
) )
@pytest.mark.asyncio
async def test_max_tokens_with_tool_choice_required(client: openai.AsyncOpenAI):
    """Check a one-token completion issued with ``tool_choice="required"``.

    The request uses the first model listed by the server, the module-level
    ``messages`` and ``tools``, and ``max_completion_tokens=1``. The reply is
    expected to finish with reason ``"length"`` and to carry neither tool
    calls nor content.
    """
    available_models = await client.models.list()
    model_name: str = available_models.data[0].id

    # This combination previously crashed the engine
    request_kwargs = dict(
        model=model_name,
        messages=messages,
        tools=tools,
        tool_choice="required",
        max_completion_tokens=1,
        temperature=0,
    )
    chat_completion = await client.chat.completions.create(**request_kwargs)

    # When `tool_choice="required"` and the tokens of `tools` exceed `max_tokens`,
    # both `tool_calls` and `content` should be empty.
    # This behavior should be consistent with OpenAI.
    choice = chat_completion.choices[0]
    reply = choice.message
    assert choice.finish_reason == "length"
    assert len(reply.tool_calls) == 0
    assert reply.content == ""
...@@ -4,7 +4,7 @@ import openai ...@@ -4,7 +4,7 @@ import openai
import pytest import pytest
import pytest_asyncio import pytest_asyncio
from ...utils import RemoteOpenAIServer from tests.utils import RemoteOpenAIServer
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment