Merge tag 'v0.18.0' into v0.18.0-ori

3fb4b5fa · zhuwenwen · bcf25339 · 89138b21 · 3fb4b5fa · 3fb4b5fa
Commit 3fb4b5fa authored Mar 23, 2026 by zhuwenwen
20 changed files
--- a/tests/distributed/test_torchrun_example_moe.py
+++ b/tests/distributed/test_torchrun_example_moe.py
@@ -28,7 +28,7 @@ if dp_size > 1:
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-# set different `gpu_memory_utilization` and `swap_space` for different ranks,
+# set different `gpu_memory_utilization` for different ranks,
 # to test if all ranks agree on the same kv cache configuration.
 llm = LLM(
    model="microsoft/Phi-mini-MoE-instruct",
@@ -37,7 +37,6 @@ llm = LLM(
    enable_expert_parallel=int(os.getenv("ENABLE_EP", "0")) == 1,
    distributed_executor_backend="external_launcher",
    gpu_memory_utilization=random.uniform(0.7, 0.9),
-    swap_space=random.randint(1, 4),
    seed=0,
 )

--- a/tests/distributed/test_utils.py
+++ b/tests/distributed/test_utils.py
@@ -66,7 +66,7 @@ def cpu_worker(rank, WORLD_SIZE, port1, port2):
 def gpu_worker(rank, WORLD_SIZE, port1, port2):
-    torch.cuda.set_device(rank)
+    torch.accelerator.set_device_index(rank)
    pg1 = StatelessProcessGroup.create(
        host="127.0.0.1", port=port1, rank=rank, world_size=WORLD_SIZE
    )
@@ -79,11 +79,11 @@ def gpu_worker(rank, WORLD_SIZE, port1, port2):
    data = torch.tensor([rank]).cuda()
    pynccl1.all_reduce(data)
    pg1.barrier()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    if rank <= 2:
        pynccl2.all_reduce(data)
        pg2.barrier()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
    item = data[0].item()
    print(f"rank: {rank}, item: {item}")
    if rank == 3:

--- a/tests/distributed/test_weight_transfer.py
+++ b/tests/distributed/test_weight_transfer.py
@@ -3,18 +3,26 @@
 """Tests for weight transfer engine backends.
 Unit tests for engine classes (parsing, validation, registry).
-Integration test for NCCL weight transfer between processes using Ray.
+Integration tests for NCCL and IPC weight transfer between processes using Ray.
 """
+import base64
+import pickle
 from unittest.mock import MagicMock
 import pytest
 import ray
 import torch
+from torch.multiprocessing.reductions import reduce_tensor
 from vllm.config.parallel import ParallelConfig
 from vllm.config.weight_transfer import WeightTransferConfig
 from vllm.distributed.weight_transfer import WeightTransferEngineFactory
+from vllm.distributed.weight_transfer.ipc_engine import (
+    IPCWeightTransferEngine,
+    IPCWeightTransferInitInfo,
+    IPCWeightTransferUpdateInfo,
+)
 from vllm.distributed.weight_transfer.nccl_engine import (
    NCCLWeightTransferEngine,
    NCCLWeightTransferInitInfo,
@@ -155,9 +163,29 @@ class TestEngineRegistry:
        engine = WeightTransferEngineFactory.create_engine(config, parallel_config)
        assert isinstance(engine, NCCLWeightTransferEngine)
+    def test_create_engine_ipc(self):
+        """The factory must hand back an IPC engine for ``backend="ipc"``."""
+        ipc_engine = WeightTransferEngineFactory.create_engine(
+            WeightTransferConfig(backend="ipc"),
+            create_mock_parallel_config(),
+        )
+        assert isinstance(ipc_engine, IPCWeightTransferEngine)
    def test_create_engine_invalid_backend(self):
        """Test factory raises for invalid backend."""
-        config = WeightTransferConfig(backend="invalid")
+        # Pydantic validates Literal types at construction, so we can't create
+        # a config with an invalid backend directly. Instead, we build a valid
+        # config and then overwrite its backend attribute, bypassing validation.
+        from pydantic import ValidationError
+        # Test that Pydantic prevents invalid backend at construction
+        with pytest.raises(ValidationError):
+            WeightTransferConfig(backend="invalid")
+        # Test factory error by creating a config with valid backend but
+        # then manually modifying the backend attribute (bypassing validation)
+        config = WeightTransferConfig(backend="nccl")
+        # Use object.__setattr__ to bypass Pydantic validation
+        object.__setattr__(config, "backend", "invalid")
        parallel_config = create_mock_parallel_config()
        with pytest.raises(ValueError, match="Invalid weight transfer backend"):
            WeightTransferEngineFactory.create_engine(config, parallel_config)
@@ -175,7 +203,7 @@ class TestEngineRegistry:
 def test_nccl_receive_weights_without_init_raises():
    """Test that receive_weights raises if init_transfer_engine wasn't called."""
-    if torch.cuda.device_count() < 1:
+    if torch.accelerator.device_count() < 1:
        pytest.skip("Need at least 1 GPU for this test")
    config = WeightTransferConfig(backend="nccl")
@@ -223,7 +251,7 @@ def trainer_broadcast_tensor(
    dtype = getattr(torch, tensor_dtype)
    tensor_to_send = torch.ones(tensor_shape, dtype=dtype, device="cuda:0")
    comm.broadcast(tensor_to_send, src=0, stream=torch.cuda.current_stream())
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    return True
@@ -281,7 +309,7 @@ def inference_receive_tensor(
        shapes=[tensor_shape],
    )
    engine.receive_weights(update_info, noop_load_weights)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    # Verify we received the tensor
    success = False
@@ -308,7 +336,7 @@ def inference_receive_tensor(
 @pytest.mark.skipif(
-    torch.cuda.device_count() < 2,
+    torch.accelerator.device_count() < 2,
    reason="Need at least 2 GPUs to run NCCL weight transfer test.",
 )
 def test_nccl_weight_transfer_between_processes():
@@ -344,3 +372,442 @@ def test_nccl_weight_transfer_between_processes():
        f"Received shape: {result['received_shape']}, "
        f"Received sum: {result['received_sum']}"
    )
+# --- Unit Tests: IPCWeightTransferUpdateInfo Validation ---
+class TestIPCWeightTransferUpdateInfoValidation:
+    """Field-consistency checks performed by IPCWeightTransferUpdateInfo."""
+    @staticmethod
+    def _gpu_handle_or_skip():
+        """Skip when no GPU is present, else build one CUDA IPC handle.
+        Returns ``(tensor, gpu_uuid, ipc_handle)``; the tensor is handed back
+        so the caller can keep it referenced for the rest of the test.
+        """
+        if torch.accelerator.device_count() < 1:
+            pytest.skip("Need at least 1 GPU for this test")
+        tensor = torch.ones(10, 10, device="cuda:0")
+        handle = reduce_tensor(tensor)
+        uuid = str(torch.cuda.get_device_properties(0).uuid)
+        return tensor, uuid, handle
+    def test_valid_update_info(self):
+        """A consistent set of fields is accepted and stored unchanged."""
+        _tensor, uuid, handle = self._gpu_handle_or_skip()
+        info = IPCWeightTransferUpdateInfo(
+            names=["layer.weight"],
+            dtype_names=["float32"],
+            shapes=[[10, 10]],
+            ipc_handles=[{uuid: handle}],
+        )
+        assert info.names == ["layer.weight"]
+        assert info.dtype_names == ["float32"]
+        assert info.shapes == [[10, 10]]
+        assert len(info.ipc_handles) == 1
+    def test_mismatched_dtype_names_raises(self):
+        """Two names with a single dtype entry must raise ValueError."""
+        _tensor, uuid, handle = self._gpu_handle_or_skip()
+        two_handles = [{uuid: handle}, {uuid: handle}]
+        with pytest.raises(ValueError, match="dtype_names"):
+            IPCWeightTransferUpdateInfo(
+                names=["layer.weight", "layer.bias"],
+                dtype_names=["float32"],  # deliberately one entry short
+                shapes=[[10, 10], [10]],
+                ipc_handles=two_handles,
+            )
+    def test_mismatched_shapes_raises(self):
+        """Two names with a single shape entry must raise ValueError."""
+        _tensor, uuid, handle = self._gpu_handle_or_skip()
+        two_handles = [{uuid: handle}, {uuid: handle}]
+        with pytest.raises(ValueError, match="shapes"):
+            IPCWeightTransferUpdateInfo(
+                names=["layer.weight", "layer.bias"],
+                dtype_names=["float32", "float32"],
+                shapes=[[10, 10]],  # deliberately one entry short
+                ipc_handles=two_handles,
+            )
+    def test_mismatched_ipc_handles_raises(self):
+        """Two names with a single IPC handle entry must raise ValueError."""
+        _tensor, uuid, handle = self._gpu_handle_or_skip()
+        one_handle = [{uuid: handle}]  # deliberately one entry short
+        with pytest.raises(ValueError, match="ipc_handles"):
+            IPCWeightTransferUpdateInfo(
+                names=["layer.weight", "layer.bias"],
+                dtype_names=["float32", "float32"],
+                shapes=[[10, 10], [10]],
+                ipc_handles=one_handle,
+            )
+    def test_valid_update_info_from_pickled(self, monkeypatch):
+        """Pickled handles end up in ``ipc_handles`` when the env flag is "1"."""
+        _tensor, uuid, handle = self._gpu_handle_or_skip()
+        monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
+        expected_handles = [{uuid: handle}]
+        encoded = base64.b64encode(pickle.dumps(expected_handles)).decode("utf-8")
+        info = IPCWeightTransferUpdateInfo(
+            names=["layer.weight"],
+            dtype_names=["float32"],
+            shapes=[[10, 10]],
+            ipc_handles_pickled=encoded,
+        )
+        assert info.ipc_handles == expected_handles
+        assert info.ipc_handles_pickled is None
+    def test_pickled_requires_insecure_serialization_flag(self, monkeypatch):
+        """With the env flag set to "0", pickled handles must be refused."""
+        monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "0")
+        empty_payload = base64.b64encode(pickle.dumps([])).decode("utf-8")
+        with pytest.raises(ValueError, match="VLLM_ALLOW_INSECURE_SERIALIZATION=1"):
+            IPCWeightTransferUpdateInfo(
+                names=[],
+                dtype_names=[],
+                shapes=[],
+                ipc_handles_pickled=empty_payload,
+            )
+    def test_both_handles_and_pickled_raises(self):
+        """Supplying ipc_handles together with ipc_handles_pickled must raise."""
+        _tensor, uuid, handle = self._gpu_handle_or_skip()
+        raw_handles = [{uuid: handle}]
+        encoded = base64.b64encode(pickle.dumps(raw_handles)).decode("utf-8")
+        with pytest.raises(ValueError, match="Cannot specify both"):
+            IPCWeightTransferUpdateInfo(
+                names=["layer.weight"],
+                dtype_names=["float32"],
+                shapes=[[10, 10]],
+                ipc_handles=raw_handles,
+                ipc_handles_pickled=encoded,
+            )
+    def test_neither_handles_nor_pickled_raises(self):
+        """Omitting both ipc_handles and ipc_handles_pickled must raise."""
+        with pytest.raises(ValueError, match="must be provided"):
+            IPCWeightTransferUpdateInfo(
+                names=["layer.weight"],
+                dtype_names=["float32"],
+                shapes=[[10, 10]],
+            )
+    def test_empty_lists_valid(self):
+        """All-empty lists form a valid (zero-weight) update."""
+        empty_info = IPCWeightTransferUpdateInfo(
+            names=[],
+            dtype_names=[],
+            shapes=[],
+            ipc_handles=[],
+        )
+        assert len(empty_info.names) == 0
+# --- Unit Tests: IPC Engine Parsing ---
+class TestIPCEngineParsing:
+    """Tests for IPCWeightTransferEngine.parse_update_info."""
+    @staticmethod
+    def _engine_with_two_handles():
+        """Skip without a GPU; else return (engine, gpu_uuid, handles, tensors).
+        ``handles`` holds one single-entry dict per tensor, keyed by the GPU
+        UUID. The tensors are returned so callers can keep them referenced.
+        """
+        if torch.accelerator.device_count() < 1:
+            pytest.skip("Need at least 1 GPU for this test")
+        engine = IPCWeightTransferEngine(
+            WeightTransferConfig(backend="ipc"),
+            create_mock_parallel_config(),
+        )
+        tensors = (
+            torch.ones(100, 100, device="cuda:0"),
+            torch.ones(50, device="cuda:0"),
+        )
+        reduced = [reduce_tensor(t) for t in tensors]
+        uuid = str(torch.cuda.get_device_properties(0).uuid)
+        handles = [{uuid: h} for h in reduced]
+        return engine, uuid, handles, tensors
+    def test_parse_update_info_valid(self):
+        """A dict carrying raw ``ipc_handles`` parses into the update-info type."""
+        engine, _uuid, handles, _tensors = self._engine_with_two_handles()
+        raw_update = {
+            "names": ["w1", "w2"],
+            "dtype_names": ["float32", "bfloat16"],
+            "shapes": [[100, 100], [50]],
+            "ipc_handles": handles,
+        }
+        parsed = engine.parse_update_info(raw_update)
+        assert isinstance(parsed, IPCWeightTransferUpdateInfo)
+        assert parsed.names == ["w1", "w2"]
+        assert parsed.dtype_names == ["float32", "bfloat16"]
+        assert parsed.shapes == [[100, 100], [50]]
+        assert len(parsed.ipc_handles) == 2
+    def test_parse_update_info_pickled(self, monkeypatch):
+        """A dict carrying ``ipc_handles_pickled`` (HTTP path) parses as well."""
+        # Set the flag before the engine is built, as the original ordering did.
+        monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
+        engine, uuid, handles, _tensors = self._engine_with_two_handles()
+        raw_update = {
+            "names": ["w1", "w2"],
+            "dtype_names": ["float32", "bfloat16"],
+            "shapes": [[100, 100], [50]],
+            "ipc_handles_pickled": base64.b64encode(pickle.dumps(handles)).decode(
+                "utf-8"
+            ),
+        }
+        parsed = engine.parse_update_info(raw_update)
+        assert isinstance(parsed, IPCWeightTransferUpdateInfo)
+        assert parsed.names == ["w1", "w2"]
+        assert len(parsed.ipc_handles) == 2
+        assert parsed.ipc_handles_pickled is None
+        assert uuid in parsed.ipc_handles[0]
+        assert uuid in parsed.ipc_handles[1]
+# --- Integration Test: IPC Weight Transfer Between Ray Tasks ---
+def get_physical_gpu_id(device_index: int = 0) -> str:
+    """Return the UUID of CUDA device ``device_index`` as a string."""
+    return str(torch.cuda.get_device_properties(device_index).uuid)
+@ray.remote(num_gpus=0.5)
+class TrainerActor:
+    """Ray actor that owns a GPU tensor and publishes its CUDA IPC handle."""
+    def __init__(self, tensor_shape: list[int], tensor_dtype: str):
+        # The tensor is stored on the actor so it stays referenced for as long
+        # as the actor exists; the exported handle refers to this tensor.
+        self.tensor = torch.ones(
+            tensor_shape, dtype=getattr(torch, tensor_dtype), device="cuda:0"
+        )
+        self.tensor.fill_(42.0)  # recognizable value for verifying the transfer
+        handle = reduce_tensor(self.tensor)
+        uuid = get_physical_gpu_id(0)
+        torch.accelerator.synchronize()
+        handle_dict = {
+            "ipc_handle": handle,
+            "gpu_uuid": uuid,
+            "shape": tensor_shape,
+            "dtype": tensor_dtype,
+        }
+        self.ipc_handle_dict = handle_dict
+    def get_ipc_handle_dict(self) -> dict:
+        """Return the handle dict assembled in ``__init__``."""
+        return self.ipc_handle_dict
+@ray.remote(num_gpus=0.5)
+def inference_receive_ipc_tensor(
+    ipc_handle_dict: dict,
+    mode: str = "ray",
+) -> dict:
+    """Ray task: pull one tensor through IPCWeightTransferEngine and check it.
+    ``mode`` selects how the handles are handed to the engine: "ray" passes
+    them as objects, "http" passes them pickled and base64-encoded. Any other
+    value raises ValueError. Returns a dict with ``success``,
+    ``received_shape`` and ``received_sum``.
+    """
+    from unittest.mock import MagicMock
+    import torch
+    from vllm.config.parallel import ParallelConfig
+    from vllm.config.weight_transfer import WeightTransferConfig
+    from vllm.distributed.weight_transfer.ipc_engine import (
+        IPCWeightTransferEngine,
+    )
+    # Engine backed by a mocked single-rank parallel config
+    mock_parallel = MagicMock(spec=ParallelConfig)
+    mock_parallel.rank = 0
+    mock_parallel.world_size = 1
+    mock_parallel.data_parallel_rank = 0
+    engine = IPCWeightTransferEngine(
+        WeightTransferConfig(backend="ipc"), mock_parallel
+    )
+    engine.init_transfer_engine(IPCWeightTransferInitInfo())
+    captured = []
+    def capture_weights(weights: list[tuple[str, torch.Tensor]]):
+        # Clone each tensor so a copy is kept after the engine is done with it.
+        captured.extend((name, tensor.clone()) for name, tensor in weights)
+    # The update dict goes through parse_update_info (exercises __post_init__)
+    handles = [{ipc_handle_dict["gpu_uuid"]: ipc_handle_dict["ipc_handle"]}]
+    if mode not in ("ray", "http"):
+        raise ValueError(f"Unknown mode: {mode}")
+    update_dict: dict = {
+        "names": ["test.weight"],
+        "dtype_names": [ipc_handle_dict["dtype"]],
+        "shapes": [ipc_handle_dict["shape"]],
+    }
+    if mode == "ray":
+        update_dict["ipc_handles"] = handles
+    else:
+        update_dict["ipc_handles_pickled"] = base64.b64encode(
+            pickle.dumps(handles)
+        ).decode("utf-8")
+    engine.receive_weights(engine.parse_update_info(update_dict), capture_weights)
+    torch.accelerator.synchronize()
+    # Success means: exactly one tensor, the expected shape, and every
+    # element equal to 42 (checked through the sum).
+    success = False
+    received_shape = None
+    received_sum = None
+    if len(captured) == 1:
+        _, tensor = captured[0]
+        received_shape = list(tensor.shape)
+        received_sum = tensor.sum().item()
+        if received_shape == ipc_handle_dict["shape"]:
+            expected = 42.0 * torch.tensor(ipc_handle_dict["shape"]).prod().item()
+            success = abs(received_sum - expected) < 0.01
+    engine.shutdown()
+    return {
+        "success": success,
+        "received_shape": received_shape,
+        "received_sum": received_sum,
+    }
+@pytest.mark.skipif(
+    torch.accelerator.device_count() < 1,
+    reason="Need at least 1 GPU to run IPC weight transfer test.",
+)
+@pytest.mark.parametrize("mode", ["ray", "http"])
+def test_ipc_weight_transfer_between_processes(mode: str):
+    """Test IPC weight transfer from trainer to inference process using Ray.
+    Parametrized over transport modes:
+    - 'ray':  ipc_handles passed directly.
+    - 'http': ipc_handles pickled + base64-encoded, unpickled via __post_init__.
+    IPC requires same-GPU access, so we use a placement group to co-locate
+    the trainer actor and inference task on the same GPU. The placement group
+    is removed before the test returns so that its GPU reservation does not
+    outlive this parametrization.
+    """
+    from ray.util.placement_group import (
+        placement_group,
+        remove_placement_group,
+    )
+    from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
+    ray.init(ignore_reinit_error=True)
+    # Reserve one whole GPU in a single bundle; the trainer actor and the
+    # inference task each request 0.5 GPU, so both fit into that bundle.
+    pg = placement_group([{"GPU": 1, "CPU": 2}])
+    try:
+        ray.get(pg.ready())
+        scheduling_strategy = PlacementGroupSchedulingStrategy(
+            placement_group=pg,
+            placement_group_capture_child_tasks=True,
+        )
+        # Tensor to transfer: 100x100 filled with 42s
+        tensor_shape = [100, 100]
+        tensor_dtype = "float32"
+        # Create trainer actor that holds the tensor and IPC handle (stays alive)
+        trainer_actor = TrainerActor.options(  # type: ignore[attr-defined]
+            scheduling_strategy=scheduling_strategy
+        ).remote(tensor_shape, tensor_dtype)
+        # Get IPC handle dict (tensor stays alive in trainer actor)
+        ipc_handle_dict = ray.get(trainer_actor.get_ipc_handle_dict.remote())
+        # Receive tensor in inference process using IPC handles (on same GPU)
+        # Trainer actor stays alive during this operation
+        inference_result = ray.get(
+            inference_receive_ipc_tensor.options(
+                scheduling_strategy=scheduling_strategy
+            ).remote(ipc_handle_dict, mode=mode)
+        )
+    finally:
+        # ray.init() above is never paired with a shutdown in this test, so
+        # without an explicit removal the GPU bundle would still be reserved
+        # when the next parametrization requests one of its own.
+        remove_placement_group(pg)
+    assert inference_result["success"], (
+        f"IPC weight transfer failed (mode={mode}). "
+        f"Received shape: {inference_result['received_shape']}, "
+        f"Received sum: {inference_result['received_sum']}"
+    )
+def test_ipc_receive_weights_missing_gpu_uuid_raises():
+    """receive_weights must raise when the handles lack this GPU's UUID."""
+    if torch.accelerator.device_count() < 1:
+        pytest.skip("Need at least 1 GPU for this test")
+    engine = IPCWeightTransferEngine(
+        WeightTransferConfig(backend="ipc"),
+        create_mock_parallel_config(),
+    )
+    # Key the only handle by a made-up UUID string instead of the device's own.
+    tensor = torch.ones(10, 10, device="cuda:0")
+    bogus_handles = [{"wrong-uuid-12345": reduce_tensor(tensor)}]
+    update_info = IPCWeightTransferUpdateInfo(
+        names=["w"],
+        dtype_names=["float32"],
+        shapes=[[10, 10]],
+        ipc_handles=bogus_handles,
+    )
+    with pytest.raises(ValueError, match="IPC handle not found"):
+        engine.receive_weights(update_info, lambda x: None)
--- a/tests/entrypoints/sleep/__init__.py
+++ b/tests/entrypoints/sleep/__init__.py
--- a/tests/entrypoints/anthropic/test_anthropic_messages_conversion.py
+++ b/tests/entrypoints/anthropic/test_anthropic_messages_conversion.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Unit tests for Anthropic-to-OpenAI request conversion.
+Tests the image source handling and tool_result content parsing in
+AnthropicServingMessages._convert_anthropic_to_openai_request().
+Also covers extended-thinking edge cases such as ``redacted_thinking``
+blocks echoed back by Anthropic clients.
+"""
+from vllm.entrypoints.anthropic.protocol import (
+    AnthropicMessagesRequest,
+)
+from vllm.entrypoints.anthropic.serving import AnthropicServingMessages
+_convert = AnthropicServingMessages._convert_anthropic_to_openai_request
+_img_url = AnthropicServingMessages._convert_image_source_to_url
+def _make_request(
+    messages: list[dict],
+    **kwargs,
+) -> AnthropicMessagesRequest:
+    """Build an AnthropicMessagesRequest with a fixed model and max_tokens.
+    Extra keyword arguments are forwarded to the request constructor.
+    """
+    fixed_fields = dict(model="test-model", max_tokens=128, messages=messages)
+    return AnthropicMessagesRequest(**fixed_fields, **kwargs)
+# ======================================================================
+# _convert_image_source_to_url
+# ======================================================================
+class TestConvertImageSourceToUrl:
+    """Checks for AnthropicServingMessages._convert_image_source_to_url."""
+    def test_base64_source(self):
+        url = _img_url(
+            {"type": "base64", "media_type": "image/jpeg", "data": "iVBORw0KGgo="}
+        )
+        assert url == "data:image/jpeg;base64,iVBORw0KGgo="
+    def test_base64_png(self):
+        url = _img_url({"type": "base64", "media_type": "image/png", "data": "AAAA"})
+        assert url == "data:image/png;base64,AAAA"
+    def test_url_source(self):
+        url = _img_url({"type": "url", "url": "https://example.com/image.jpg"})
+        assert url == "https://example.com/image.jpg"
+    def test_missing_type_defaults_to_base64(self):
+        """A source without 'type' is handled like a base64 source."""
+        url = _img_url({"media_type": "image/webp", "data": "UklGR"})
+        assert url == "data:image/webp;base64,UklGR"
+    def test_missing_media_type_defaults_to_jpeg(self):
+        url = _img_url({"type": "base64", "data": "abc123"})
+        assert url == "data:image/jpeg;base64,abc123"
+    def test_url_source_missing_url_returns_empty(self):
+        url = _img_url({"type": "url"})
+        assert url == ""
+    def test_empty_source_returns_data_uri_shell(self):
+        nothing: dict = {}
+        assert _img_url(nothing) == "data:image/jpeg;base64,"
+# ======================================================================
+# Image blocks inside user messages
+# ======================================================================
+class TestImageContentBlocks:
+    """Image blocks embedded in user messages."""
+    def test_base64_image_in_user_message(self):
+        image_block = {
+            "type": "image",
+            "source": {
+                "type": "base64",
+                "media_type": "image/jpeg",
+                "data": "iVBORw0KGgo=",
+            },
+        }
+        text_block = {"type": "text", "text": "Describe this image"}
+        user_turn = {"role": "user", "content": [text_block, image_block]}
+        converted = _convert(_make_request([user_turn]))
+        first_message = converted.messages[0]
+        assert first_message["role"] == "user"
+        content_parts = first_message["content"]
+        assert len(content_parts) == 2
+        assert content_parts[0] == {"type": "text", "text": "Describe this image"}
+        assert content_parts[1] == {
+            "type": "image_url",
+            "image_url": {"url": "data:image/jpeg;base64,iVBORw0KGgo="},
+        }
+    def test_url_image_in_user_message(self):
+        image_block = {
+            "type": "image",
+            "source": {
+                "type": "url",
+                "url": "https://example.com/cat.png",
+            },
+        }
+        text_block = {"type": "text", "text": "What is this?"}
+        user_turn = {"role": "user", "content": [text_block, image_block]}
+        converted = _convert(_make_request([user_turn]))
+        content_parts = converted.messages[0]["content"]
+        assert content_parts[1] == {
+            "type": "image_url",
+            "image_url": {"url": "https://example.com/cat.png"},
+        }
+# ======================================================================
+# tool_result content handling
+# ======================================================================
+class TestToolResultContent:
+    def _make_tool_result_request(
+        self, tool_result_content
+    ) -> AnthropicMessagesRequest:
+        """Build a request with assistant tool_use followed by user
+        tool_result."""
+        return _make_request(
+            [
+                {
+                    "role": "assistant",
+                    "content": [
+                        {
+                            "type": "tool_use",
+                            "id": "call_001",
+                            "name": "read_file",
+                            "input": {"path": "/tmp/img.png"},
+                        }
+                    ],
+                },
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "tool_result",
+                            "tool_use_id": "call_001",
+                            "content": tool_result_content,
+                        }
+                    ],
+                },
+            ]
+        )
+    def test_tool_result_string_content(self):
+        request = self._make_tool_result_request("file contents here")
+        result = _convert(request)
+        tool_msg = [m for m in result.messages if m["role"] == "tool"]
+        assert len(tool_msg) == 1
+        assert tool_msg[0]["content"] == "file contents here"
+        assert tool_msg[0]["tool_call_id"] == "call_001"
+    def test_tool_result_text_blocks(self):
+        request = self._make_tool_result_request(
+            [
+                {"type": "text", "text": "line 1"},
+                {"type": "text", "text": "line 2"},
+            ]
+        )
+        result = _convert(request)
+        tool_msg = [m for m in result.messages if m["role"] == "tool"]
+        assert len(tool_msg) == 1
+        assert tool_msg[0]["content"] == "line 1\nline 2"
+    def test_tool_result_with_image(self):
+        """Image in tool_result should produce a follow-up user message."""
+        request = self._make_tool_result_request(
+            [
+                {
+                    "type": "image",
+                    "source": {
+                        "type": "base64",
+                        "media_type": "image/png",
+                        "data": "AAAA",
+                    },
+                }
+            ]
+        )
+        result = _convert(request)
+        tool_msg = [m for m in result.messages if m["role"] == "tool"]
+        assert len(tool_msg) == 1
+        assert tool_msg[0]["content"] == ""
+        # The image should be injected as a follow-up user message
+        follow_up = [
+            m
+            for m in result.messages
+            if m["role"] == "user" and isinstance(m.get("content"), list)
+        ]
+        assert len(follow_up) == 1
+        img_parts = follow_up[0]["content"]
+        assert len(img_parts) == 1
+        assert img_parts[0] == {
+            "type": "image_url",
+            "image_url": {"url": "data:image/png;base64,AAAA"},
+        }
+    def test_tool_result_with_text_and_image(self):
+        """A tool_result mixing text and an image is split in two: the text
+        stays in the tool message and the image goes to a user message."""
+        caption_block = {"type": "text", "text": "Here is the screenshot"}
+        image_block = {
+            "type": "image",
+            "source": {
+                "type": "base64",
+                "media_type": "image/jpeg",
+                "data": "QUFB",
+            },
+        }
+        converted = _convert(
+            self._make_tool_result_request([caption_block, image_block])
+        )
+        messages = converted.messages
+        # Exactly one tool message, carrying only the text part.
+        tool_messages = [msg for msg in messages if msg["role"] == "tool"]
+        assert len(tool_messages) == 1
+        assert tool_messages[0]["content"] == "Here is the screenshot"
+        # Exactly one list-content user message, carrying the image.
+        image_messages = [
+            msg
+            for msg in messages
+            if msg["role"] == "user" and isinstance(msg.get("content"), list)
+        ]
+        assert len(image_messages) == 1
+        first_part = image_messages[0]["content"][0]
+        assert first_part["image_url"]["url"] == "data:image/jpeg;base64,QUFB"
+    def test_tool_result_with_multiple_images(self):
+        """Two images (one base64, one URL) in a single tool_result end up, in
+        order, in one follow-up user message."""
+        base64_image = {
+            "type": "image",
+            "source": {
+                "type": "base64",
+                "media_type": "image/png",
+                "data": "IMG1",
+            },
+        }
+        url_image = {
+            "type": "image",
+            "source": {
+                "type": "url",
+                "url": "https://example.com/img2.jpg",
+            },
+        }
+        converted = _convert(
+            self._make_tool_result_request([base64_image, url_image])
+        )
+        image_messages = [
+            msg
+            for msg in converted.messages
+            if msg["role"] == "user" and isinstance(msg.get("content"), list)
+        ]
+        assert len(image_messages) == 1
+        expected_urls = [
+            "data:image/png;base64,IMG1",
+            "https://example.com/img2.jpg",
+        ]
+        # Collect the URL of every part, preserving the order of the parts.
+        actual_urls = []
+        for part in image_messages[0]["content"]:
+            actual_urls.append(part["image_url"]["url"])
+        assert actual_urls == expected_urls
+    def test_tool_result_none_content(self):
+        """A tool_result whose content is None yields one tool message with
+        empty-string content."""
+        converted = _convert(self._make_tool_result_request(None))
+        tool_messages = [
+            msg for msg in converted.messages if msg["role"] == "tool"
+        ]
+        assert len(tool_messages) == 1
+        (only_tool_message,) = tool_messages
+        assert only_tool_message["content"] == ""
+    def test_tool_result_no_follow_up_when_no_images(self):
+        """A text-only tool_result must not produce a list-content user
+        message."""
+        text_only = [{"type": "text", "text": "just text"}]
+        converted = _convert(self._make_tool_result_request(text_only))
+        extra_user_messages = [
+            msg
+            for msg in converted.messages
+            if msg["role"] == "user" and isinstance(msg.get("content"), list)
+        ]
+        assert not extra_user_messages
+# ======================================================================
+# Attribution header stripping
+# ======================================================================
+class TestAttributionHeaderStripping:
+    """System-prompt conversion with and without an
+    ``x-anthropic-billing-header`` text block."""
+    def test_billing_header_stripped_from_system(self):
+        """The x-anthropic-billing-header block sent by Claude Code is removed
+        from the system prompt so that prefix caching is preserved."""
+        system_blocks = [
+            {"type": "text", "text": "You are a helpful assistant."},
+            {
+                "type": "text",
+                "text": "x-anthropic-billing-header: "
+                "cc_version=2.1.37.abc; cc_entrypoint=cli;",
+            },
+        ]
+        user_turn = {"role": "user", "content": "Hello"}
+        converted = _convert(_make_request([user_turn], system=system_blocks))
+        first_message = converted.messages[0]
+        assert first_message["role"] == "system"
+        assert first_message["content"] == "You are a helpful assistant."
+    def test_system_without_billing_header_unchanged(self):
+        """System text blocks without a billing header are kept as they are
+        and joined into one string."""
+        system_blocks = [
+            {"type": "text", "text": "You are a helpful assistant."},
+            {"type": "text", "text": " Be concise."},
+        ]
+        user_turn = {"role": "user", "content": "Hello"}
+        converted = _convert(_make_request([user_turn], system=system_blocks))
+        expected_system_text = "You are a helpful assistant. Be concise."
+        assert converted.messages[0]["content"] == expected_system_text
+    def test_system_string_unchanged(self):
+        """A plain-string system prompt is passed through untouched."""
+        user_turn = {"role": "user", "content": "Hello"}
+        converted = _convert(
+            _make_request([user_turn], system="You are a helpful assistant.")
+        )
+        assert converted.messages[0]["content"] == "You are a helpful assistant."
+# ======================================================================
+# Thinking block conversion (Anthropic → OpenAI)
+# ======================================================================
+class TestThinkingBlockConversion:
+    """Verify that thinking blocks in assistant messages are correctly
+    moved to the ``reasoning`` field and stripped from ``content`` during
+    the Anthropic→OpenAI conversion.
+    This is the Anthropic-endpoint path: the client echoes back the full
+    assistant message (including thinking blocks emitted by vllm) in
+    subsequent requests.
+    """
+    def test_thinking_plus_text_in_assistant_message(self):
+        """An assistant turn with thinking + text converts to a ``reasoning``
+        field plus plain-string ``content``."""
+        assistant_turn = {
+            "role": "assistant",
+            "content": [
+                {
+                    "type": "thinking",
+                    "thinking": "I should write a simple example.",
+                    "signature": "sig_abc123",
+                },
+                {"type": "text", "text": "Sure! Here is the code."},
+            ],
+        }
+        conversation = [
+            {"role": "user", "content": "Write me some code."},
+            assistant_turn,
+            {"role": "user", "content": "Can you fix the bug?"},
+        ]
+        converted = _convert(_make_request(conversation))
+        # Only one assistant message was sent, so only one should come back.
+        assistant_messages = [
+            msg for msg in converted.messages if msg.get("role") == "assistant"
+        ]
+        assert len(assistant_messages) == 1
+        (assistant,) = assistant_messages
+        # The thinking text belongs in "reasoning" and must not leak into
+        # "content".
+        assert assistant.get("reasoning") == "I should write a simple example."
+        assert assistant.get("content") == "Sure! Here is the code."
+    def test_thinking_only_in_assistant_message(self):
+        """An assistant turn containing only a thinking block, e.g. a mid-turn
+        reasoning step with no final answer yet, converts to ``reasoning``
+        with ``content`` left as None."""
+        thinking_only_turn = {
+            "role": "assistant",
+            "content": [
+                {
+                    "type": "thinking",
+                    "thinking": "Just thinking...",
+                    "signature": "sig_xyz",
+                }
+            ],
+        }
+        conversation = [
+            {"role": "user", "content": "Hello"},
+            thinking_only_turn,
+            {"role": "user", "content": "Go on."},
+        ]
+        converted = _convert(_make_request(conversation))
+        assistant_messages = [
+            msg for msg in converted.messages if msg.get("role") == "assistant"
+        ]
+        assert len(assistant_messages) == 1
+        (assistant,) = assistant_messages
+        assert assistant.get("reasoning") == "Just thinking..."
+        # Nothing visible was produced, so "content" is missing or None.
+        assert assistant.get("content") is None
+    def test_thinking_plus_tool_use_in_assistant_message(self):
+        """An assistant turn with thinking + tool_use converts to ``reasoning``
+        plus one entry in ``tool_calls`` and no text content."""
+        assistant_turn = {
+            "role": "assistant",
+            "content": [
+                {
+                    "type": "thinking",
+                    "thinking": "I need to call the calculator.",
+                    "signature": "sig_tool",
+                },
+                {
+                    "type": "tool_use",
+                    "id": "call_001",
+                    "name": "calculator",
+                    "input": {"expression": "2+2"},
+                },
+            ],
+        }
+        tool_result_turn = {
+            "role": "user",
+            "content": [
+                {
+                    "type": "tool_result",
+                    "tool_use_id": "call_001",
+                    "content": "4",
+                }
+            ],
+        }
+        conversation = [
+            {"role": "user", "content": "What is 2+2?"},
+            assistant_turn,
+            tool_result_turn,
+        ]
+        converted = _convert(_make_request(conversation))
+        assistant_messages = [
+            msg for msg in converted.messages if msg.get("role") == "assistant"
+        ]
+        assert len(assistant_messages) == 1
+        (assistant,) = assistant_messages
+        assert assistant.get("reasoning") == "I need to call the calculator."
+        calls = list(assistant.get("tool_calls", []))
+        assert len(calls) == 1
+        (only_call,) = calls
+        assert only_call["function"]["name"] == "calculator"
+        # Reasoning and a tool call only: there is no text to put in "content".
+        assert assistant.get("content") is None
+    def test_multiple_thinking_blocks_concatenated(self):
+        """Several thinking blocks in one assistant turn are concatenated, in
+        their original order, into a single ``reasoning`` string."""
+        first_thought = {
+            "type": "thinking",
+            "thinking": "First thought. ",
+            "signature": "s1",
+        }
+        second_thought = {
+            "type": "thinking",
+            "thinking": "Second thought.",
+            "signature": "s2",
+        }
+        visible_text = {"type": "text", "text": "Done."}
+        conversation = [
+            {"role": "user", "content": "Think hard."},
+            {
+                "role": "assistant",
+                "content": [first_thought, second_thought, visible_text],
+            },
+        ]
+        converted = _convert(_make_request(conversation))
+        assistant_messages = [
+            msg for msg in converted.messages if msg.get("role") == "assistant"
+        ]
+        assert len(assistant_messages) == 1
+        (assistant,) = assistant_messages
+        assert assistant.get("reasoning") == "First thought. Second thought."
+        assert assistant.get("content") == "Done."
+    def test_no_thinking_blocks_unchanged(self):
+        """An assistant turn without thinking blocks keeps its content and
+        gains no ``reasoning`` key."""
+        conversation = [
+            {"role": "user", "content": "Hi"},
+            {"role": "assistant", "content": "Hello!"},
+        ]
+        converted = _convert(_make_request(conversation))
+        assistant_messages = [
+            msg for msg in converted.messages if msg.get("role") == "assistant"
+        ]
+        assert len(assistant_messages) == 1
+        (assistant,) = assistant_messages
+        assert assistant.get("content") == "Hello!"
+        assert "reasoning" not in assistant
+    def test_multi_turn_with_thinking_blocks(self):
+        """A multi-turn conversation whose earlier assistant turns all carry
+        thinking blocks converts cleanly, without a 400 error.
+        Primary regression scenario from the bug report: upgrading vllm from
+        v0.15.1 → v0.17.0 added thinking blocks to responses, and echoing
+        those responses back in later requests failed Pydantic validation.
+        """
+        def assistant_turn(thinking, signature, answer):
+            # One echoed assistant message: a thinking block followed by text.
+            return {
+                "role": "assistant",
+                "content": [
+                    {
+                        "type": "thinking",
+                        "thinking": thinking,
+                        "signature": signature,
+                    },
+                    {"type": "text", "text": answer},
+                ],
+            }
+        conversation = [
+            {"role": "user", "content": "Turn 1 question"},
+            assistant_turn("Reasoning for turn 1.", "s_t1", "Answer for turn 1."),
+            {"role": "user", "content": "Turn 2 question"},
+            assistant_turn("Reasoning for turn 2.", "s_t2", "Answer for turn 2."),
+            {"role": "user", "content": "Turn 3 question"},
+        ]
+        request = _make_request(conversation)
+        # The conversion itself must not raise a ValidationError / 400.
+        converted = _convert(request)
+        assistant_messages = [
+            msg for msg in converted.messages if msg.get("role") == "assistant"
+        ]
+        assert len(assistant_messages) == 2
+        first_assistant, second_assistant = assistant_messages
+        assert first_assistant.get("reasoning") == "Reasoning for turn 1."
+        assert first_assistant.get("content") == "Answer for turn 1."
+        assert second_assistant.get("reasoning") == "Reasoning for turn 2."
+        assert second_assistant.get("content") == "Answer for turn 2."
+    def test_redacted_thinking_block_is_accepted(self):
+        """A ``redacted_thinking`` block echoed back by an Anthropic client is
+        accepted (no 400 validation error) and left out of the OpenAI-format
+        result, while a normal thinking block still becomes ``reasoning``."""
+        thinking_block = {
+            "type": "thinking",
+            "thinking": "Thinking...",
+            "signature": "sig_think",
+        }
+        redacted_block = {
+            "type": "redacted_thinking",
+            "data": "BASE64_OR_OTHER_OPAQUE_DATA",
+        }
+        text_block = {"type": "text", "text": "Hi!"}
+        conversation = [
+            {"role": "user", "content": "Hello"},
+            {
+                "role": "assistant",
+                "content": [thinking_block, redacted_block, text_block],
+            },
+            {"role": "user", "content": "Continue"},
+        ]
+        converted = _convert(_make_request(conversation))
+        assistant_messages = [
+            msg for msg in converted.messages if msg.get("role") == "assistant"
+        ]
+        assert len(assistant_messages) == 1
+        (assistant,) = assistant_messages
+        # Only the normal thinking block feeds "reasoning"; the redacted one
+        # contributes nothing.
+        assert assistant.get("reasoning") == "Thinking..."
+        assert assistant.get("content") == "Hi!"
--- a/tests/entrypoints/openai/test_basic.py
+++ b/tests/entrypoints/openai/test_basic.py
@@ -145,6 +145,7 @@ async def test_request_cancellation(server: RemoteOpenAIServer):
                model=MODEL_NAME,
                max_tokens=10000,
                extra_body={"min_tokens": 10000},
+                temperature=0.0,
            )
        )
        tasks.append(task)
@@ -163,7 +164,7 @@ async def test_request_cancellation(server: RemoteOpenAIServer):
    # be able to respond to this one within the timeout
    client = server.get_async_client(timeout=5)
    response = await client.chat.completions.create(
-        messages=chat_input, model=MODEL_NAME, max_tokens=10
+        messages=chat_input, model=MODEL_NAME, max_tokens=10, temperature=0.0
    )
    assert len(response.choices) == 1

--- a/tests/entrypoints/instrumentator/test_metrics.py
+++ b/tests/entrypoints/instrumentator/test_metrics.py
@@ -17,6 +17,7 @@ from transformers import AutoTokenizer
 from tests.conftest import LocalAssetServer
 from tests.utils import RemoteOpenAIServer
 from vllm import version
+from vllm.utils.network_utils import get_open_port
 MODELS = {
    "text": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
@@ -315,14 +316,26 @@ async def test_abort_metrics_reset(
            client.completions.create(
                model=model_name,
                prompt=prompt_ids,
-                max_tokens=100,  # Long generation to give time to abort
+                max_tokens=500,  # Long generation to give time to abort
                temperature=0.0,
            )
        )
        tasks.append(task)
-    # Wait a bit for requests to start processing
+    # Poll until we see running requests rather than using a fixed sleep,
-    await asyncio.sleep(0.5)
+    # since generation speed varies across hardware.
+    try:
+        await _poll_until(
+            lambda: _get_running_metrics_from_api(server)[0] > 0,
+            timeout=10.0,
+            interval=0.1,
+            description="running_requests > 0",
+        )
+    except TimeoutError:
+        for task in tasks:
+            task.cancel()
+        await asyncio.gather(*tasks, return_exceptions=True)
+        pytest.fail("Requests never appeared as running in metrics")
    # Check that we have running requests
    running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api(
@@ -336,13 +349,15 @@ async def test_abort_metrics_reset(
    # Cancel all tasks to abort the requests
    for task in tasks:
        task.cancel()
+    await asyncio.gather(*tasks, return_exceptions=True)
-    # Wait for cancellations to be processed
-    await asyncio.sleep(1.0)
+    # Poll until metrics reset rather than using a fixed sleep
+    await _poll_until(
-    # Check that metrics have reset to zero
+        lambda: _get_running_metrics_from_api(server) == (0, 0, 0),
-    response = requests.get(server.url_for("metrics"))
+        timeout=10.0,
-    assert response.status_code == HTTPStatus.OK
+        interval=0.2,
+        description="gauge metrics back to zero",
+    )
    # Verify running and waiting requests counts and KV cache usage are zero
    running_requests_after, waiting_requests_after, kv_cache_usage_after = (
@@ -360,6 +375,18 @@ async def test_abort_metrics_reset(
    )
+async def _poll_until(
+    predicate, *, timeout: float, interval: float = 0.5, description: str = "condition"
+):
+    """Poll ``predicate`` until it returns a truthy value.
+    The predicate is always evaluated at least once, and once more after the
+    final sleep, so a condition that turns true right at the deadline is not
+    reported as a timeout.
+    Args:
+        predicate: Zero-argument synchronous callable to evaluate.
+        timeout: Maximum number of seconds to keep polling.
+        interval: Seconds to sleep between evaluations.
+        description: Text included in the timeout error message.
+    Raises:
+        TimeoutError: If ``predicate`` is still falsy once ``timeout`` elapsed.
+    """
+    # Monotonic clock: elapsed time must not be affected by wall-clock jumps.
+    deadline = time.monotonic() + timeout
+    while True:
+        if predicate():
+            return
+        # Check the deadline only after evaluating, so the last sleep is
+        # always followed by one more evaluation.
+        if time.monotonic() >= deadline:
+            raise TimeoutError(
+                f"Timed out after {timeout}s waiting for: {description}"
+            )
+        await asyncio.sleep(interval)
 def _get_running_metrics_from_api(server: RemoteOpenAIServer):
    """Return (running_count, waiting_count, kv_cache_usage)"""
@@ -399,7 +426,7 @@ def test_metrics_exist_run_batch():
    input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}"""  # noqa: E501
    base_url = "0.0.0.0"
-    port = "8001"
+    port = str(get_open_port())
    server_url = f"http://{base_url}:{port}"
    with (
@@ -420,24 +447,39 @@ def test_metrics_exist_run_batch():
                "--model",
                "intfloat/multilingual-e5-small",
                "--enable-metrics",
-                "--url",
+                "--host",
                base_url,
                "--port",
                port,
            ],
        )
-        def is_server_up(url):
+        try:
+            def is_server_up(url):
+                try:
+                    response = requests.get(url)
+                    return response.status_code == 200
+                except requests.ConnectionError:
+                    return False
+            start = time.time()
+            timeout = 120
+            while not is_server_up(server_url):
+                if proc.poll() is not None:
+                    pytest.fail(
+                        f"Batch process exited early with returncode={proc.returncode}"
+                    )
+                if time.time() - start > timeout:
+                    pytest.fail("Batch server did not start within timeout")
+                time.sleep(1)
+            response = requests.get(server_url + "/metrics")
+            assert response.status_code == HTTPStatus.OK
+        finally:
+            proc.terminate()
            try:
-                response = requests.get(url)
+                proc.wait(timeout=15)
-                return response.status_code == 200
+            except subprocess.TimeoutExpired:
-            except requests.ConnectionError:
+                proc.kill()
-                return False
+                proc.wait(timeout=5)
-        while not is_server_up(server_url):
-            time.sleep(1)
-        response = requests.get(server_url + "/metrics")
-        assert response.status_code == HTTPStatus.OK
-        proc.wait()
--- a/tests/entrypoints/openai/test_optional_middleware.py
+++ b/tests/entrypoints/openai/test_optional_middleware.py
--- a/tests/entrypoints/openai/test_orca_metrics.py
+++ b/tests/entrypoints/openai/test_orca_metrics.py
--- a/tests/entrypoints/sleep/test_sleep.py
+++ b/tests/entrypoints/sleep/test_sleep.py
--- a/tests/entrypoints/llm/test_chat.py
+++ b/tests/entrypoints/llm/test_chat.py
@@ -195,18 +195,15 @@ def test_chat_batch_failure_cleanup(llm_for_failure_test):
    valid_msg = [{"role": "user", "content": "Hello"}]
    long_text = "This is a very long text to test the error " * 50
    invalid_msg = [{"role": "user", "content": long_text}]
-    batch_1 = [
-        valid_msg,
+    batch_1 = [valid_msg, valid_msg, invalid_msg]
-        valid_msg,
+    batch_2 = [valid_msg, valid_msg]
-        invalid_msg,
-    ]
-    batch_2 = [
-        valid_msg,
-        valid_msg,
-    ]
    sampling_params = SamplingParams(temperature=0, max_tokens=10)
-    with pytest.raises(ValueError, match="context length is only"):
+    with pytest.raises(ValueError, match="maximum context length is"):
        llm.chat(batch_1, sampling_params=sampling_params)
+    assert llm.llm_engine.get_num_unfinished_requests() == 0
    outputs_2 = llm.chat(batch_2, sampling_params=sampling_params)
    assert len(outputs_2) == len(batch_2)
    assert llm.llm_engine.get_num_unfinished_requests() == 0
--- a/tests/entrypoints/llm/test_collective_rpc.py
+++ b/tests/entrypoints/llm/test_collective_rpc.py
@@ -13,7 +13,7 @@ from ...utils import create_new_process_for_each_test
 @pytest.mark.parametrize("backend", ["mp", "ray"])
 @create_new_process_for_each_test()
 def test_collective_rpc(tp_size, backend, monkeypatch):
-    if torch.cuda.device_count() < tp_size:
+    if torch.accelerator.device_count() < tp_size:
        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
    if tp_size == 1 and backend == "ray":
        pytest.skip("Skip duplicate test case")

--- a/vllm/entrypoints/openai/basic/__init__.py
+++ b/vllm/entrypoints/openai/basic/__init__.py
--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@@ -3,6 +3,7 @@
 # imports for structured outputs tests
 import json
+from collections import defaultdict
 import jsonschema
 import openai  # use the official client for correctness check
@@ -13,7 +14,11 @@ import requests
 import torch
 from openai import BadRequestError
-from ...utils import RemoteOpenAIServer
+from tests.utils import RemoteOpenAIServer
+from vllm.entrypoints.openai.chat_completion.protocol import (
+    ChatCompletionRequest,
+)
+from vllm.sampling_params import SamplingParams
 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@@ -815,3 +820,203 @@ async def test_invocations(server: RemoteOpenAIServer, client: openai.AsyncOpenA
    assert chat_output.keys() == invocation_output.keys()
    assert chat_output["choices"] == invocation_output["choices"]
+# Test n parameter for chat completions
+# Tests that the n parameter works correctly for regular sampling
+# (non-beam search) in chat completions, addressing issue #34305.
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_chat_completion_n_parameter_non_streaming(
+    client: openai.AsyncOpenAI, model_name: str
+):
+    """A non-streaming request with ``n=3`` must return three indexed,
+    non-empty choices."""
+    conversation = [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "What is the opposite of big?"},
+    ]
+    num_choices = 3
+    completion = await client.chat.completions.create(
+        model=model_name,
+        messages=conversation,
+        max_completion_tokens=20,
+        temperature=0.7,
+        n=num_choices,
+        stream=False,
+    )
+    choices = completion.choices
+    assert len(choices) == num_choices
+    # Choices must come back in index order, each with non-empty text.
+    contents = []
+    for expected_index, choice in enumerate(choices):
+        assert choice.index == expected_index
+        text = choice.message.content
+        assert text is not None
+        assert len(text) > 0
+        contents.append(text)
+    # NOTE(review): sampling is unseeded, so identical outputs are unlikely
+    # but not impossible; this check can in principle be flaky.
+    assert len(set(contents)) > 1, "Expected different responses with n=3"
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_chat_completion_n_parameter_streaming(
+    client: openai.AsyncOpenAI, model_name: str
+):
+    """A streaming request with ``n=2`` must deliver content chunks for both
+    choice indices."""
+    conversation = [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "What is the capital of France?"},
+    ]
+    stream = await client.chat.completions.create(
+        model=model_name,
+        messages=conversation,
+        max_completion_tokens=15,
+        temperature=0.7,
+        n=2,
+        stream=True,
+    )
+    # Group the non-empty text deltas by the choice index they belong to.
+    pieces_by_choice = defaultdict(list)
+    async for chunk in stream:
+        for choice in chunk.choices:
+            piece = choice.delta.content
+            if not piece:
+                continue
+            pieces_by_choice[choice.index].append(piece)
+    first_pieces = pieces_by_choice[0]
+    second_pieces = pieces_by_choice[1]
+    assert len(first_pieces) > 0, "Choice 0 received no content chunks"
+    assert len(second_pieces) > 0, "Choice 1 received no content chunks"
+    # Stitch the deltas back together into the full text of each choice.
+    first_text = "".join(first_pieces)
+    second_text = "".join(second_pieces)
+    assert len(first_text) > 0, "Choice 0 has empty response"
+    assert len(second_text) > 0, "Choice 1 has empty response"
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_chat_completion_n_with_seed(client: openai.AsyncOpenAI, model_name: str):
+    """A request combining ``n=2`` with a ``seed`` is accepted and returns
+    two indexed, non-empty choices."""
+    conversation = [{"role": "user", "content": "Say hello."}]
+    num_choices = 2
+    completion = await client.chat.completions.create(
+        model=model_name,
+        messages=conversation,
+        max_completion_tokens=10,
+        temperature=0.8,
+        n=num_choices,
+        seed=42,
+        stream=False,
+    )
+    choices = completion.choices
+    assert len(choices) == num_choices
+    # Every choice must be correctly indexed and carry some text.
+    for expected_index, choice in enumerate(choices):
+        assert choice.index == expected_index
+        text = choice.message.content
+        assert text is not None
+        assert len(text) > 0
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_chat_completion_n_equals_1(client: openai.AsyncOpenAI, model_name: str):
+    """An explicit ``n=1`` still returns exactly one choice at index 0."""
+    conversation = [{"role": "user", "content": "Hello!"}]
+    completion = await client.chat.completions.create(
+        model=model_name,
+        messages=conversation,
+        max_completion_tokens=10,
+        temperature=0.7,
+        n=1,
+        stream=False,
+    )
+    choices = completion.choices
+    assert len(choices) == 1
+    (only_choice,) = choices
+    assert only_choice.index == 0
+    assert only_choice.message.content is not None
+# Unit tests for n parameter in ChatCompletionRequest.to_sampling_params()
+def test_chat_completion_request_n_parameter_to_sampling_params():
+    """``n`` set on the request must be carried into ``SamplingParams``."""
+    chat_request = ChatCompletionRequest(
+        model="test-model",
+        messages=[{"role": "user", "content": "Hello"}],
+        n=3,
+        max_tokens=10,
+    )
+    params = chat_request.to_sampling_params(
+        max_tokens=10,
+        default_sampling_params={},
+    )
+    assert isinstance(params, SamplingParams)
+    assert params.n == 3, f"Expected n=3, got n={params.n}"
+def test_chat_completion_request_n_parameter_default():
+    """Omitting ``n`` leaves it at 1 on both the request and the resulting
+    ``SamplingParams``."""
+    # "n" is deliberately not passed here.
+    chat_request = ChatCompletionRequest(
+        model="test-model",
+        messages=[{"role": "user", "content": "Hello"}],
+        max_tokens=10,
+    )
+    assert chat_request.n == 1, "n should default to 1"
+    params = chat_request.to_sampling_params(
+        max_tokens=10,
+        default_sampling_params={},
+    )
+    # The request default of 1 must survive the conversion.
+    assert params.n == 1, f"Expected n=1 (default), got n={params.n}"
+def test_chat_completion_request_n_parameter_various_values():
+    """Each of several ``n`` values must reach ``SamplingParams`` unchanged."""
+    for n_value in (1, 2, 5, 10):
+        # Build a fresh request per value and convert it in one expression.
+        params = ChatCompletionRequest(
+            model="test-model",
+            messages=[{"role": "user", "content": "Test"}],
+            n=n_value,
+            max_tokens=10,
+        ).to_sampling_params(
+            max_tokens=10,
+            default_sampling_params={},
+        )
+        assert params.n == n_value, f"Expected n={n_value}, got n={params.n}"
--- a/tests/entrypoints/openai/test_chat_echo.py
+++ b/tests/entrypoints/openai/test_chat_echo.py
@@ -7,10 +7,9 @@ import openai  # use the official client for correctness check
 import pytest
 import pytest_asyncio
+from tests.utils import RemoteOpenAIServer
 from vllm.config import ModelConfig
-from ...utils import RemoteOpenAIServer
 # # any model with a chat template should work here
 MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"

--- a/tests/entrypoints/openai/test_chat_error.py
+++ b/tests/entrypoints/openai/test_chat_error.py
@@ -2,18 +2,18 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass, field
-from http import HTTPStatus
 from typing import Any
-from unittest.mock import AsyncMock, MagicMock
+from unittest.mock import AsyncMock, MagicMock, patch
 import pytest
 from vllm.config.multimodal import MultiModalConfig
 from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
 from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
-from vllm.entrypoints.openai.engine.protocol import ErrorResponse
+from vllm.entrypoints.openai.engine.protocol import GenerationError
 from vllm.entrypoints.openai.models.protocol import BaseModelPath
 from vllm.entrypoints.openai.models.serving import OpenAIServingModels
+from vllm.entrypoints.serve.render.serving import OpenAIServingRender
 from vllm.outputs import CompletionOutput, RequestOutput
 from vllm.renderers.hf import HfRenderer
 from vllm.tokenizers.registry import tokenizer_args_from_config
@@ -44,7 +44,7 @@ class MockModelConfig:
    tokenizer_revision = None
    multimodal_config = MultiModalConfig()
    hf_config = MockHFConfig()
-    logits_processor_pattern = None
+    hf_text_config = MockHFConfig()
    logits_processors: list[str] | None = None
    diff_sampling_param: dict | None = None
    allowed_local_media_path: str = ""
@@ -54,16 +54,28 @@ class MockModelConfig:
    media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
    skip_tokenizer_init = False
    is_encoder_decoder: bool = False
+    is_multimodal_model: bool = False
    def get_diff_sampling_param(self):
        return self.diff_sampling_param or {}
+# Stand-in for the parallel config: a dataclass with a single
+# ``_api_process_rank`` field defaulting to 0.
+@dataclass
+class MockParallelConfig:
+    _api_process_rank: int = 0
+# Stand-in config pairing a MockModelConfig with a MockParallelConfig;
+# _build_renderer passes an instance to HfRenderer.from_config.
+@dataclass
+class MockVllmConfig:
+    model_config: MockModelConfig
+    parallel_config: MockParallelConfig
 def _build_renderer(model_config: MockModelConfig):
    _, tokenizer_name, _, kwargs = tokenizer_args_from_config(model_config)
-    return HfRenderer(
+    return HfRenderer.from_config(
-        model_config,
+        MockVllmConfig(model_config, parallel_config=MockParallelConfig()),
        tokenizer_kwargs={**kwargs, "tokenizer_name": tokenizer_name},
    )
@@ -73,10 +85,20 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
        engine_client=engine,
        base_model_paths=BASE_MODEL_PATHS,
    )
+    serving_render = OpenAIServingRender(
+        model_config=engine.model_config,
+        renderer=engine.renderer,
+        io_processor=engine.io_processor,
+        model_registry=models.registry,
+        request_logger=None,
+        chat_template=None,
+        chat_template_content_format="auto",
+    )
    serving_chat = OpenAIServingChat(
        engine,
        models,
        response_role="assistant",
+        openai_serving_render=serving_render,
        request_logger=None,
        chat_template=None,
        chat_template_content_format="auto",
@@ -89,7 +111,9 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
            [{"prompt_token_ids": [1, 2, 3]}],
        )
-    serving_chat._preprocess_chat = AsyncMock(side_effect=_fake_preprocess_chat)
+    serving_chat.openai_serving_render._preprocess_chat = AsyncMock(
+        side_effect=_fake_preprocess_chat
+    )
    return serving_chat
@@ -139,12 +163,8 @@ async def test_chat_error_non_stream():
        stream=False,
    )
-    response = await serving_chat.create_chat_completion(request)
+    with pytest.raises(GenerationError):
+        await serving_chat.create_chat_completion(request)
-    assert isinstance(response, ErrorResponse)
-    assert response.error.type == "InternalServerError"
-    assert response.error.message == "Internal server error"
-    assert response.error.code == HTTPStatus.INTERNAL_SERVER_ERROR
 @pytest.mark.asyncio
@@ -227,3 +247,152 @@ async def test_chat_error_stream():
        f"Expected error message in chunks: {chunks}"
    )
    assert chunks[-1] == "data: [DONE]\n\n"
+@pytest.mark.parametrize(
+    "image_content",
+    [
+        [{"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}}],
+        [{"image_url": {"url": "https://example.com/image.jpg"}}],
+    ],
+)
+def test_system_message_warns_on_image(image_content):
+    """Test that system messages with image content trigger a warning."""
+    with patch(
+        "vllm.entrypoints.openai.chat_completion.protocol.logger"
+    ) as mock_logger:
+        ChatCompletionRequest(
+            model=MODEL_NAME,
+            messages=[
+                {
+                    "role": "system",
+                    "content": image_content,
+                }
+            ],
+        )
+    mock_logger.warning_once.assert_called()
+    call_args = str(mock_logger.warning_once.call_args)
+    assert "System messages should only contain text" in call_args
+    assert "image_url" in call_args
+def test_system_message_accepts_text():
+    """Test that system messages can contain text content."""
+    # Should not raise an exception
+    request = ChatCompletionRequest(
+        model=MODEL_NAME,
+        messages=[
+            {"role": "system", "content": "You are a helpful assistant."},
+        ],
+    )
+    assert request.messages[0]["role"] == "system"
+def test_system_message_accepts_text_array():
+    """Test that system messages can contain an array with text content."""
+    # Should not raise an exception
+    request = ChatCompletionRequest(
+        model=MODEL_NAME,
+        messages=[
+            {
+                "role": "system",
+                "content": [{"type": "text", "text": "You are a helpful assistant."}],
+            },
+        ],
+    )
+    assert request.messages[0]["role"] == "system"
+def test_user_message_accepts_image():
+    """Test that user messages can still contain image content."""
+    # Should not raise an exception
+    request = ChatCompletionRequest(
+        model=MODEL_NAME,
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What's in this image?"},
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": "https://example.com/image.jpg"},
+                    },
+                ],
+            },
+        ],
+    )
+    assert request.messages[0]["role"] == "user"
+@pytest.mark.parametrize(
+    "audio_content",
+    [
+        [
+            {
+                "type": "input_audio",
+                "input_audio": {"data": "base64data", "format": "wav"},
+            }
+        ],
+        [{"input_audio": {"data": "base64data", "format": "wav"}}],
+    ],
+)
+def test_system_message_warns_on_audio(audio_content):
+    """Test that system messages with audio content trigger a warning."""
+    with patch(
+        "vllm.entrypoints.openai.chat_completion.protocol.logger"
+    ) as mock_logger:
+        ChatCompletionRequest(
+            model=MODEL_NAME,
+            messages=[
+                {
+                    "role": "system",
+                    "content": audio_content,
+                }
+            ],
+        )
+    mock_logger.warning_once.assert_called()
+    call_args = str(mock_logger.warning_once.call_args)
+    assert "System messages should only contain text" in call_args
+    assert "input_audio" in call_args
+@pytest.mark.parametrize(
+    "video_content",
+    [
+        [{"type": "video_url", "video_url": {"url": "https://example.com/video.mp4"}}],
+        [{"video_url": {"url": "https://example.com/video.mp4"}}],
+    ],
+)
+def test_system_message_warns_on_video(video_content):
+    """Test that system messages with video content trigger a warning."""
+    with patch(
+        "vllm.entrypoints.openai.chat_completion.protocol.logger"
+    ) as mock_logger:
+        ChatCompletionRequest(
+            model=MODEL_NAME,
+            messages=[
+                {
+                    "role": "system",
+                    "content": video_content,
+                }
+            ],
+        )
+    mock_logger.warning_once.assert_called()
+    call_args = str(mock_logger.warning_once.call_args)
+    assert "System messages should only contain text" in call_args
+    assert "video_url" in call_args
+def test_json_schema_response_format_missing_schema():
+    """When response_format type is 'json_schema' but the json_schema field
+    is not provided, request construction should raise a validation error
+    so the API returns 400 instead of 500."""
+    with pytest.raises(Exception, match="json_schema.*must be provided"):
+        ChatCompletionRequest(
+            model=MODEL_NAME,
+            messages=[{"role": "user", "content": "hello"}],
+            response_format={"type": "json_schema"},
+        )
--- a/tests/entrypoints/openai/test_chat_logit_bias_validation.py
+++ b/tests/entrypoints/openai/test_chat_logit_bias_validation.py
@@ -5,10 +5,9 @@ import openai
 import pytest
 import pytest_asyncio
+from tests.utils import RemoteOpenAIServer
 from vllm.config import ModelConfig
-from ...utils import RemoteOpenAIServer
 MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"

--- a/tests/entrypoints/openai/test_chat_with_tool_reasoning.py
+++ b/tests/entrypoints/openai/test_chat_with_tool_reasoning.py
@@ -5,7 +5,7 @@ import openai  # use the official client for correctness check
 import pytest
 import pytest_asyncio
-from ...utils import RemoteOpenAIServer
+from tests.utils import RemoteOpenAIServer
 # a reasoning and tool calling model
 MODEL_NAME = "Qwen/QwQ-32B"

--- a/tests/entrypoints/openai/test_completion_with_function_calling.py
+++ b/tests/entrypoints/openai/test_completion_with_function_calling.py
@@ -10,11 +10,12 @@ import pytest
 import pytest_asyncio
 # downloading lora to test lora requests
-from ...utils import RemoteOpenAIServer
+from tests.utils import ROCM_ENV_OVERRIDES, ROCM_EXTRA_ARGS, RemoteOpenAIServer
 # any model with a chat template should work here
 MODEL_NAME = "Qwen/Qwen3-0.6B"
 tools = [
    {
        "type": "function",
@@ -139,9 +140,12 @@ def server():
        "qwen3",
        "--gpu-memory-utilization",
        "0.4",
-    ]
+        "--enforce-eager",
+    ] + ROCM_EXTRA_ARGS
-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+    with RemoteOpenAIServer(
+        MODEL_NAME, args, env_dict=ROCM_ENV_OVERRIDES
+    ) as remote_server:
        yield remote_server
@@ -226,12 +230,13 @@ def k2_server():
        "qwen3",
        "--gpu-memory-utilization",
        "0.4",
-    ]
+    ] + ROCM_EXTRA_ARGS
    # hack to test kimi_k2 tool use tool_id format.
    # avoid error in is_deepseek_mla check by setting kv_lora_rank=null
    with RemoteOpenAIServer(
        MODEL_NAME,
        args,
+        env_dict=ROCM_ENV_OVERRIDES,
        override_hf_configs={"model_type": "kimi_k2", "kv_lora_rank": None},
    ) as remote_server:
        yield remote_server
@@ -294,7 +299,10 @@ async def test_no_args_tool_call(
            "type": "function",
            "function": {
                "name": "get_current_time",
-                "description": "Get the current date and time. No parameters needed.",
+                "description": (
+                    "Get the current date and time. Call this when the user "
+                    "asks what time or date it is. No parameters needed."
+                ),
                "parameters": {
                    "type": "object",
                    "properties": {},  # No parameters
@@ -303,10 +311,28 @@ async def test_no_args_tool_call(
            },
        }
    ]
-    messages = [{"role": "user", "content": "What time is it now?"}]
+    messages = [
+        {
+            "role": "system",
+            "content": (
+                "You are a helpful assistant. Always use the available tools "
+                "when relevant, and reply with a short sentence after "
+                "receiving a tool result."
+            ),
+        },
+        {"role": "user", "content": "What time is it now?"},
+    ]
+    shared_kwargs = dict(
+        model=model_name,
+        temperature=0.0,
+        seed=42,
+        extra_body={"chat_template_kwargs": {"enable_thinking": False}},
+    )
    # Step 2: Send user message and let model decide whether to call the tool
    response = await client.chat.completions.create(
-        model=model_name,
+        **shared_kwargs,
        messages=messages,
        tools=tools,
        tool_choice="auto",  # Let model choose automatically
@@ -334,11 +360,15 @@ async def test_no_args_tool_call(
            )
            # Step 5: Send tool result back to model to continue conversation
            final_response = await client.chat.completions.create(
-                model=model_name,
+                **shared_kwargs,
                messages=messages,
+                max_completion_tokens=128,
            )
            # Output final natural language response
-            assert final_response.choices[0].message.content is not None
+            assert (
+                final_response.choices[0].message.content is not None
+                and final_response.choices[0].message.content.strip() != ""
+            )
    else:
        # No tool called — just print model's direct reply
@@ -484,3 +514,27 @@ async def test_inconsistent_tool_choice_and_tools(
            ],
            tool_choice={},
        )
+@pytest.mark.asyncio
+async def test_max_tokens_with_tool_choice_required(client: openai.AsyncOpenAI):
+    """tool_choice="required" with a 1-token budget must finish, not crash."""
+    models = await client.models.list()
+    model_name: str = models.data[0].id
+    # This combination previously crashed the engine
+    chat_completion = await client.chat.completions.create(
+        messages=messages,
+        temperature=0,
+        max_completion_tokens=1,
+        model=model_name,
+        tools=tools,
+        tool_choice="required",
+    )
+    # When `tool_choice="required"` and the tokens of `tools` exceed
+    # `max_completion_tokens`, both `tool_calls` and `content` should be empty.
+    # This behavior should be consistent with OpenAI.
+    choice = chat_completion.choices[0]
+    assert choice.finish_reason == "length"
+    assert len(choice.message.tool_calls) == 0
+    assert choice.message.content == ""
--- a/tests/entrypoints/openai/test_enable_force_include_usage.py
+++ b/tests/entrypoints/openai/test_enable_force_include_usage.py
@@ -4,7 +4,7 @@ import openai
 import pytest
 import pytest_asyncio
-from ...utils import RemoteOpenAIServer
+from tests.utils import RemoteOpenAIServer
 @pytest.fixture(scope="module")