Merge tag 'v0.10.0' into v0.10.0-dev

711aa9d5 · zhuwenwen · 751c492c · 6d8d0a24 · 711aa9d5 · 711aa9d5
Commit 711aa9d5 authored Jul 30, 2025 by zhuwenwen
20 changed files
--- a/tests/v1/entrypoints/openai/responses/conftest.py
+++ b/tests/v1/entrypoints/openai/responses/conftest.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+import pytest_asyncio
+from tests.utils import RemoteOpenAIServer
+# Use a small reasoning model to test the responses API.
+MODEL_NAME = "Qwen/Qwen3-0.6B"
+@pytest.fixture(scope="module")
+def default_server_args():
+    """Return the command-line arguments used to launch the test server.
+
+    Module-scoped, so the list is built once per test module.
+    """
+    return [
+        "--max-model-len",
+        "8192",
+        "--enforce-eager",  # For faster startup.
+        "--reasoning-parser",
+        "deepseek_r1",
+    ]
+@pytest.fixture(scope="module")
+def server(default_server_args):
+    """Yield a RemoteOpenAIServer for MODEL_NAME, shared by the whole module.
+
+    The server is used as a context manager, so it is exited when the
+    fixture is torn down.
+    """
+    with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
+        yield remote_server
+@pytest_asyncio.fixture
+async def client(server):
+    """Yield an async client obtained from the `server` fixture.
+
+    The client is used as an async context manager, so it is exited when
+    the fixture is torn down.
+    """
+    async with server.get_async_client() as async_client:
+        yield async_client
--- a/tests/v1/entrypoints/openai/responses/test_basic.py
+++ b/tests/v1/entrypoints/openai/responses/test_basic.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import openai  # use the official client for correctness check
+import pytest
+@pytest.mark.asyncio
+async def test_simple_input(client: openai.AsyncOpenAI):
+    """A plain string input yields a reasoning item first and a message last."""
+    response = await client.responses.create(input="What is 13 * 24?")
+    print(response)
+    outputs = response.output
+    # Whether the output contains the answer.
+    assert outputs[-1].type == "message"
+    assert "312" in outputs[-1].content[0].text
+    # Whether the output contains the reasoning.
+    assert outputs[0].type == "reasoning"
+    assert outputs[0].text != ""
+@pytest.mark.asyncio
+async def test_instructions(client: openai.AsyncOpenAI):
+    """The `instructions` argument must be reflected in the final message.
+
+    The last output item has to contain both the numeric answer and the
+    "QED" marker requested by the instructions.
+    """
+    response = await client.responses.create(
+        instructions="Finish the answer with QED.",
+        input="What is 13 * 24?",
+    )
+    print(response)
+    output_text = response.output[-1].content[0].text
+    assert "312" in output_text
+    assert "QED" in output_text
+@pytest.mark.asyncio
+async def test_chat(client: openai.AsyncOpenAI):
+    """A chat-style input list (system/user/assistant/user) is accepted.
+
+    The final message must answer the follow-up question using the earlier
+    turns (15 * 2 = 30) and still follow the system message ("QED").
+    """
+    response = await client.responses.create(input=[
+        {
+            "role": "system",
+            "content": "Finish the answer with QED."
+        },
+        {
+            "role": "user",
+            "content": "What is 5 * 3?"
+        },
+        {
+            "role": "assistant",
+            "content": "15. QED."
+        },
+        {
+            "role": "user",
+            "content": "Multiply the result by 2."
+        },
+    ], )
+    print(response)
+    output_text = response.output[-1].content[0].text
+    assert "30" in output_text
+    assert "QED" in output_text
+@pytest.mark.asyncio
+async def test_chat_with_input_type(client: openai.AsyncOpenAI):
+    """A user message whose content is a list of `input_text` parts completes.
+
+    Only the response status is checked, not the generated text.
+    """
+    response = await client.responses.create(input=[
+        {
+            "role": "user",
+            "content": [{
+                "type": "input_text",
+                "text": "Hello!"
+            }],
+        },
+    ], )
+    print(response)
+    assert response.status == "completed"
--- a/tests/v1/entrypoints/openai/responses/test_image.py
+++ b/tests/v1/entrypoints/openai/responses/test_image.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import json
+import openai
+import pytest
+import pytest_asyncio
+from tests.utils import RemoteOpenAIServer
+from vllm.multimodal.utils import encode_image_base64, fetch_image
+# Use a small vision model for testing
+MODEL_NAME = "Qwen/Qwen2.5-VL-3B-Instruct"
+MAXIMUM_IMAGES = 2
+# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
+TEST_IMAGE_URLS = [
+    "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
+    "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
+    "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
+    "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
+]
+@pytest.fixture(scope="module")
+def default_image_server_args():
+    """Return the command-line arguments used to launch the image test server.
+
+    The `--limit-mm-per-prompt` value is the JSON encoding of
+    `{"image": MAXIMUM_IMAGES}`.
+    """
+    return [
+        "--enforce-eager",
+        "--max-model-len",
+        "6000",
+        "--max-num-seqs",
+        "128",
+        "--limit-mm-per-prompt",
+        json.dumps({"image": MAXIMUM_IMAGES}),
+    ]
+@pytest.fixture(scope="module")
+def image_server(default_image_server_args):
+    """Yield a RemoteOpenAIServer for the vision MODEL_NAME, once per module.
+
+    The server is used as a context manager, so it is exited when the
+    fixture is torn down.
+    """
+    with RemoteOpenAIServer(MODEL_NAME,
+                            default_image_server_args) as remote_server:
+        yield remote_server
+@pytest_asyncio.fixture
+async def client(image_server):
+    """Yield an async client obtained from the `image_server` fixture.
+
+    Defined in this module, so these tests talk to the image server.
+    """
+    async with image_server.get_async_client() as async_client:
+        yield async_client
+@pytest.fixture(scope="session")
+def base64_encoded_image() -> dict[str, str]:
+    """Map each URL in TEST_IMAGE_URLS to its base64-encoded image.
+
+    Each value is `encode_image_base64(fetch_image(url))`. The fixture is
+    session-scoped, so the mapping is built once per test session.
+    """
+    return {
+        image_url: encode_image_base64(fetch_image(image_url))
+        for image_url in TEST_IMAGE_URLS
+    }
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
+async def test_single_chat_session_image(client: openai.AsyncOpenAI,
+                                         model_name: str, image_url: str):
+    """One image passed by URL plus a text question yields non-empty output.
+
+    Runs once for every URL in TEST_IMAGE_URLS.
+    """
+    content_text = "What's in this image?"
+    messages = [{
+        "role":
+        "user",
+        "content": [
+            {
+                "type": "input_image",
+                "image_url": image_url,
+                "detail": "auto",
+            },
+            {
+                "type": "input_text",
+                "text": content_text
+            },
+        ],
+    }]
+    # test image url
+    response = await client.responses.create(
+        model=model_name,
+        input=messages,
+    )
+    assert len(response.output_text) > 0
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
+async def test_single_chat_session_image_base64encoded(
+    client: openai.AsyncOpenAI,
+    model_name: str,
+    image_url: str,
+    base64_encoded_image: dict[str, str],
+):
+    """One image passed as a base64 data URL yields non-empty output.
+
+    The base64 payload is looked up in the `base64_encoded_image` fixture
+    by the parametrized `image_url`.
+    """
+    content_text = "What's in this image?"
+    messages = [{
+        "role":
+        "user",
+        "content": [
+            {
+                "type": "input_image",
+                # NOTE(review): the data URL is always labelled image/jpeg,
+                # even for the PNG sources in TEST_IMAGE_URLS; presumably
+                # encode_image_base64 re-encodes as JPEG -- confirm.
+                "image_url":
+                f"data:image/jpeg;base64,{base64_encoded_image[image_url]}",
+                "detail": "auto",
+            },
+            {
+                "type": "input_text",
+                "text": content_text
+            },
+        ],
+    }]
+    # test image base64
+    response = await client.responses.create(
+        model=model_name,
+        input=messages,
+    )
+    assert len(response.output_text) > 0
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize(
+    "image_urls",
+    [TEST_IMAGE_URLS[:i] for i in range(2, len(TEST_IMAGE_URLS))])
+async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
+                                 image_urls: list[str]):
+    """Several images in one message: accepted up to MAXIMUM_IMAGES.
+
+    The parametrization produces the prefixes of TEST_IMAGE_URLS of length
+    2 up to len(TEST_IMAGE_URLS) - 1. A prefix longer than MAXIMUM_IMAGES
+    must be rejected with BadRequestError, after which a text-only request
+    must still succeed; shorter prefixes must produce non-empty output.
+    """
+    messages = [{
+        "role":
+        "user",
+        "content": [
+            # One input_image part per URL, followed by the text question.
+            *({
+                "type": "input_image",
+                "image_url": image_url,
+                "detail": "auto",
+            } for image_url in image_urls),
+            {
+                "type": "input_text",
+                "text": "What's in this image?"
+            },
+        ],
+    }]
+    if len(image_urls) > MAXIMUM_IMAGES:
+        with pytest.raises(openai.BadRequestError):  # test multi-image input
+            await client.responses.create(
+                model=model_name,
+                input=messages,
+            )
+        # the server should still work afterwards
+        response = await client.responses.create(
+            model=model_name,
+            input=[{
+                "role": "user",
+                "content": "What's the weather like in Paris today?",
+            }],
+        )
+        assert len(response.output_text) > 0
+    else:
+        response = await client.responses.create(
+            model=model_name,
+            input=messages,
+        )
+        assert len(response.output_text) > 0
--- a/tests/v1/entrypoints/openai/responses/test_stateful.py
+++ b/tests/v1/entrypoints/openai/responses/test_stateful.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import asyncio
+import openai
+import pytest
+@pytest.mark.asyncio
+async def test_store(client: openai.AsyncOpenAI):
+    """Stored responses can be retrieved; `store=False` responses cannot.
+
+    Retrieving a response created with `store=False` must raise
+    NotFoundError with a "Response with id ... not found." message.
+    """
+    # By default, store is True.
+    response = await client.responses.create(input="Hello!")
+    assert response.status == "completed"
+    # Retrieve the response.
+    response = await client.responses.retrieve(response.id)
+    assert response.status == "completed"
+    # Test store=False.
+    response = await client.responses.create(
+        input="Hello!",
+        store=False,
+    )
+    assert response.status == "completed"
+    # The response should not be found.
+    with pytest.raises(openai.NotFoundError,
+                       match="Response with id .* not found."):
+        await client.responses.retrieve(response.id)
+@pytest.mark.asyncio
+async def test_background(client: openai.AsyncOpenAI):
+    """A background request starts as "queued" and eventually completes.
+
+    The response is polled once per second, up to `max_retries` times,
+    until it leaves the non-terminal states.
+    """
+    # NOTE: This query should be easy enough for the model to answer
+    # within the polling budget below (about 10 seconds).
+    response = await client.responses.create(
+        input="Hello!",
+        background=True,
+    )
+    assert response.status == "queued"
+    max_retries = 10
+    for _ in range(max_retries):
+        await asyncio.sleep(1)
+        response = await client.responses.retrieve(response.id)
+        # Keep polling through every non-terminal state. Stopping as soon as
+        # the status is no longer "queued" would end the loop on
+        # "in_progress" and make the assertion below fail spuriously.
+        if response.status not in ("queued", "in_progress"):
+            break
+    print(response)
+    assert response.status == "completed"
+@pytest.mark.asyncio
+async def test_background_error(client: openai.AsyncOpenAI):
+    """Combining `background=True` with `store=False` is rejected.
+
+    The request must raise BadRequestError with the message matched below.
+    """
+    with pytest.raises(
+            openai.BadRequestError,
+            match="background can only be used when `store` is true"):
+        _ = await client.responses.create(
+            input="What is 13 * 24?",
+            background=True,
+            store=False,
+        )
+@pytest.mark.asyncio
+async def test_background_cancel(client: openai.AsyncOpenAI):
+    """A background response can be cancelled and then stays cancelled.
+
+    The status is checked right after the cancel call and again after a
+    five-second wait.
+    """
+    response = await client.responses.create(
+        input="Write a long story about a cat.",
+        background=True,
+    )
+    assert response.status == "queued"
+    # Cancel the response before it is completed.
+    # FIXME: This test can be flaky.
+    await asyncio.sleep(0.5)
+    response = await client.responses.cancel(response.id)
+    assert response.status == "cancelled"
+    # Make sure the response status remains unchanged.
+    await asyncio.sleep(5)
+    response = await client.responses.retrieve(response.id)
+    assert response.status == "cancelled"
+@pytest.mark.asyncio
+async def test_cancel_completed(client: openai.AsyncOpenAI):
+    """Cancelling a response created without `background=True` is rejected.
+
+    The cancel call must raise BadRequestError with the message matched
+    below.
+    """
+    response = await client.responses.create(input="Hello")
+    assert response.status == "completed"
+    with pytest.raises(openai.BadRequestError,
+                       match="Cannot cancel a synchronous response."):
+        await client.responses.cancel(response.id)
+@pytest.mark.asyncio
+async def test_previous_response_id(client: openai.AsyncOpenAI):
+    """Chaining via `previous_response_id` carries earlier turns forward.
+
+    Three requests are chained (1 -> 2 -> 3); the third answer must use the
+    correction made in the second turn ("Mark") and not the name from the
+    first turn ("John").
+    """
+    response1 = await client.responses.create(
+        instructions="You are tested on your ability to retrieve the correct "
+        "information from the previous response.",
+        input="Hello, my name is John.")
+    response2 = await client.responses.create(
+        input="Actually, my name is not John. My real name is Mark.",
+        previous_response_id=response1.id,
+    )
+    response3 = await client.responses.create(
+        input="What is my real name again? Answer in one word.",
+        previous_response_id=response2.id,
+    )
+    print(response3)
+    assert "Mark" in response3.output[-1].content[0].text
+    assert "John" not in response3.output[-1].content[0].text
+@pytest.mark.asyncio
+async def test_two_responses_with_same_prev_id(client: openai.AsyncOpenAI):
+    """Two requests branching from the same response do not see each other.
+
+    Responses 2 and 3 both use response 1 as their previous response, so
+    the answer to request 3 must contain "John" and must not contain the
+    "Mark" correction that was only sent in request 2.
+    """
+    response1 = await client.responses.create(
+        instructions="You are tested on your ability to retrieve the correct "
+        "information from the previous response.",
+        input="Hello, my name is John.")
+    # Both response 2 and 3 use response 1 as the previous response.
+    # The two calls are made without `await` here; they are awaited one
+    # after the other further down.
+    response2 = client.responses.create(
+        input="Actually, my name is not John. My name is Mark.",
+        previous_response_id=response1.id,
+    )
+    response3 = client.responses.create(
+        input="What is my name again? Answer in one word.",
+        previous_response_id=response1.id,
+    )
+    _ = await response2
+    response3_result = await response3
+    print(response3_result)
+    assert "John" in response3_result.output[-1].content[0].text
+    assert "Mark" not in response3_result.output[-1].content[0].text
--- a/tests/v1/entrypoints/openai/responses/test_structured_output.py
+++ b/tests/v1/entrypoints/openai/responses/test_structured_output.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import json
+import openai
+import pytest
+from pydantic import BaseModel
+@pytest.mark.asyncio
+async def test_structured_output(client: openai.AsyncOpenAI):
+    """A `json_schema` text format produces JSON matching the schema.
+
+    The final message text is parsed with `json.loads` and its fields are
+    compared against the event described in the user message.
+    """
+    response = await client.responses.create(
+        input=[
+            {
+                "role": "system",
+                "content": "Extract the event information."
+            },
+            {
+                "role": "user",
+                "content":
+                "Alice and Bob are going to a science fair on Friday.",
+            },
+        ],
+        text={
+            "format": {
+                "type": "json_schema",
+                "name": "calendar_event",
+                "schema": {
+                    "type": "object",
+                    "properties": {
+                        "event_name": {
+                            "type": "string"
+                        },
+                        "date": {
+                            "type": "string"
+                        },
+                        "participants": {
+                            "type": "array",
+                            "items": {
+                                "type": "string"
+                            }
+                        },
+                    },
+                    "required": ["event_name", "date", "participants"],
+                    "additionalProperties": False,
+                },
+                "description": "A calendar event.",
+                "strict": True,
+            }
+        },
+    )
+    print(response)
+    # NOTE: The JSON schema is applied to the output text, not reasoning.
+    output_text = response.output[-1].content[0].text
+    event = json.loads(output_text)
+    assert event["event_name"].lower() == "science fair"
+    assert event["date"] == "Friday"
+    participants = event["participants"]
+    assert len(participants) == 2
+    assert participants[0] == "Alice"
+    assert participants[1] == "Bob"
+@pytest.mark.asyncio
+async def test_structured_output_with_parse(client: openai.AsyncOpenAI):
+    """`responses.parse` with a pydantic model returns a parsed object.
+
+    `output_parsed` must be set and its fields must match the event
+    described in the input.
+    """
+    # Target model handed to `responses.parse` as `text_format`.
+    class CalendarEvent(BaseModel):
+        event_name: str
+        date: str
+        participants: list[str]
+    response = await client.responses.parse(
+        # NOTE(review): model is passed as None explicitly; presumably the
+        # server falls back to the model it is serving -- confirm.
+        model=None,
+        instructions="Extract the event information.",
+        input="Alice and Bob are going to a science fair on Friday.",
+        text_format=CalendarEvent,
+    )
+    print(response)
+    # The output is successfully parsed.
+    event = response.output_parsed
+    assert event is not None
+    # The output is correct.
+    assert event.event_name.lower() == "science fair"
+    assert event.date == "Friday"
+    participants = event.participants
+    assert len(participants) == 2
+    assert participants[0] == "Alice"
+    assert participants[1] == "Bob"
--- a/tests/v1/entrypoints/openai/test_completion.py
+++ b/tests/v1/entrypoints/openai/test_completion.py
@@ -7,6 +7,7 @@ import openai  # use the official client for correctness check
 import pytest
 import pytest_asyncio
 import regex as re
+import requests
 from openai import BadRequestError
 from tests.utils import RemoteOpenAIServer
@@ -26,7 +27,8 @@ def default_server_args():
        "2048",
        "--max-num-seqs",
        "128",
-        "--enforce-eager"
+        "--enforce-eager",
+        "--enable-prompt-tokens-details",
    ]
@@ -679,3 +681,17 @@ async def test_invalid_grammar(client: openai.AsyncOpenAI, model_name: str):
            prompt=prompt,
            extra_body={"guided_grammar": invalid_simplified_sql_grammar},
        )
+@pytest.mark.asyncio
+async def test_completion_with_empty_prompt_embeds(
+        client: openai.AsyncOpenAI) -> None:
+    """Test that a completions POST with empty `prompt_embeds` returns 200."""
+    payload: dict[str, list] = {"prompt_embeds": []}
+    headers: dict[str, str] = {"Content-Type": "application/json"}
+    # URL is base_url + "completions", presumably http://localhost:8000/v1/completions
+    response = requests.post(f"{client.base_url}completions",
+                             headers=headers,
+                             json=payload)
+    assert response.status_code == 200, (
+        f"Expected status code 200, got {response.status_code}. ")
--- a/tests/v1/entrypoints/openai/test_multi_api_servers.py
+++ b/tests/v1/entrypoints/openai/test_multi_api_servers.py
@@ -2,136 +2,19 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import asyncio
 import os
-import re
 import openai  # use the official client for correctness check
 import pytest
 import pytest_asyncio
-import requests
 from tests.utils import RemoteOpenAIServer
+from tests.v1.test_utils import check_request_balancing
 MODEL_NAME = "ibm-research/PowerMoE-3b"
 DP_SIZE = os.getenv("DP_SIZE", "1")
-def get_prometheus_metrics(
-        server: RemoteOpenAIServer) -> dict[str, dict[str, float]]:
-    """Fetch and parse Prometheus metrics from the /metrics endpoint.
-    Returns:
-        Dict mapping metric names to their values grouped by labels.
-        For example: {"vllm:request_success": {
-            "engine=0": 5.0, "engine=1": 3.0}
-        }
-    """
-    try:
-        response = requests.get(server.url_for("metrics"), timeout=10)
-        response.raise_for_status()
-        metrics: dict[str, dict[str, float]] = {}
-        # Regex patterns for Prometheus metrics
-        metric_with_labels = re.compile(
-            r'^([a-zA-Z_:][a-zA-Z0-9_:]*)\{([^}]*)\}\s+([\d\.\-\+e]+)$')
-        metric_simple = re.compile(
-            r'^([a-zA-Z_:][a-zA-Z0-9_:]*)\s+([\d\.\-\+e]+)$')
-        for line in response.text.split('\n'):
-            line = line.strip()
-            # Skip comments and empty lines
-            if not line or line.startswith('#'):
-                continue
-            # Try to match metric with labels first
-            match = metric_with_labels.match(line)
-            if match:
-                metric_name, labels_part, value_str = match.groups()
-                try:
-                    value = float(value_str)
-                    if metric_name not in metrics:
-                        metrics[metric_name] = {}
-                    metrics[metric_name][f'{{{labels_part}}}'] = value
-                except ValueError:
-                    continue
-            else:
-                # Try simple metric without labels
-                match = metric_simple.match(line)
-                if match:
-                    metric_name, value_str = match.groups()
-                    try:
-                        value = float(value_str)
-                        if metric_name not in metrics:
-                            metrics[metric_name] = {}
-                        metrics[metric_name][''] = value
-                    except ValueError:
-                        continue
-        return metrics
-    except Exception as e:
-        pytest.fail(f"Failed to fetch Prometheus metrics: {e}")
-        return {}
-def get_engine_request_counts(
-        metrics: dict[str, dict[str, float]]) -> dict[str, float]:
-    """Extract request counts per engine from Prometheus metrics.
-    Returns:
-        Dict mapping engine indices to request counts.
-        For example: {"0": 15.0, "1": 12.0}
-    """
-    engine_counts = {}
-    # Look for request success metrics with engine labels
-    success_metrics = metrics.get("vllm:request_success_total", {})
-    engine_pattern = re.compile(r'engine="([^"]*)"')
-    for labels, count in success_metrics.items():
-        # Extract engine ID from labels using regex
-        match = engine_pattern.search(labels)
-        if match:
-            engine_id = match.group(1)
-            if engine_id not in engine_counts:
-                engine_counts[engine_id] = 0.0
-            engine_counts[engine_id] += count
-    return engine_counts
-def check_request_balancing(server: RemoteOpenAIServer):
-    """Check request balancing via Prometheus metrics if DP_SIZE > 1.
-    Args:
-        server: The RemoteOpenAIServer instance
-    """
-    dp_size = int(DP_SIZE)
-    if dp_size <= 1:
-        return
-    # Get metrics after all requests are completed
-    metrics = get_prometheus_metrics(server)
-    engine_counts = get_engine_request_counts(metrics)
-    # Check that multiple engines received requests
-    engines_with_requests = [
-        engine for engine, count in engine_counts.items() if count > 0
-    ]
-    assert len(engines_with_requests) == dp_size, (
-        f"Expected requests to be distributed across multiple engines,"
-        f" but only engine(s) {engines_with_requests} received "
-        f"requests. Engine counts: {engine_counts}")
-    # Verify that the load is reasonably balanced
-    # (no engine should handle all requests)
-    total_requests = sum(engine_counts.values())
-    for count in engine_counts.values():
-        assert count > total_requests // (dp_size + 1), (
-            f"requests are imbalanced: {engine_counts}")
 @pytest.fixture(scope="module")
 def default_server_args():
    return [
@@ -217,7 +100,7 @@ async def test_single_completion(client: openai.AsyncOpenAI,
    assert all(completion is not None for completion in results)
    # Check request balancing via Prometheus metrics if DP_SIZE > 1
-    check_request_balancing(server)
+    check_request_balancing(server, int(DP_SIZE))
 @pytest.mark.asyncio
@@ -295,4 +178,4 @@ async def test_completion_streaming(client: openai.AsyncOpenAI,
    assert all(results), "Not all streaming requests completed successfully."
    # Check request balancing via Prometheus metrics if DP_SIZE > 1
-    check_request_balancing(server)
+    check_request_balancing(server, int(DP_SIZE))
--- a/vllm/attention/ops/blocksparse_attention/__init__.py
+++ b/vllm/attention/ops/blocksparse_attention/__init__.py
--- a/tests/v1/kv_connector/unit/test_multi_connector.py
+++ b/tests/v1/kv_connector/unit/test_multi_connector.py
@@ -3,16 +3,10 @@
 import filecmp
 import shutil
 import tempfile
-from collections import defaultdict
 from pathlib import Path
 from vllm import LLM, SamplingParams
-from vllm.config import KVTransferConfig, VllmConfig
+from vllm.config import KVTransferConfig
-from vllm.distributed.kv_transfer.kv_connector.factory import (
-    KVConnectorFactory)
-from vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector import (  # noqa
-    SharedStorageConnector)
-from vllm.v1.core.kv_cache_manager import KVCacheBlocks
 MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
@@ -25,62 +19,6 @@ PROMPTS = [
 SAMPLING_PARAMS = SamplingParams(temperature=0, max_tokens=20)
-class TestSharedStorageConnector(SharedStorageConnector):
-    def __init__(self, config: VllmConfig, role):
-        self.name = config.kv_transfer_config.kv_connector_extra_config["name"]
-        self._connector = SharedStorageConnector(config, role)
-        self.call_record: dict[str, int] = defaultdict(int)
-        # Use a unique temp file per connector
-        self._event_file = tempfile.gettempdir(
-        ) + f"/connector_{self.name}-{self.role.name}_events.log"
-        # Start with an empty file
-        with open(self._event_file, "w") as _:
-            pass
-    def __getattribute__(self, name):
-        if name in ("_connector", "call_record", "name", "_event_file",
-                    "__class__", "__dict__", "__getattribute__",
-                    "__init__"):  # avoid recursion
-            return object.__getattribute__(self, name)
-        if not hasattr(self._connector, name):
-            return object.__getattribute__(self, name)
-        attr = getattr(self._connector, name)
-        # Intercept calls to the connector interface and write an event
-        # for each one to a file, which can be read back in the main test proc.
-        if callable(attr):
-            def wrapper(*args, **kwargs):
-                self.call_record[name] += 1
-                # Include args that we're interested in
-                to_log = [name]
-                for arg in args:
-                    if isinstance(arg, int):
-                        to_log.append(str(arg))
-                    elif isinstance(arg, KVCacheBlocks):
-                        to_log.append(
-                            f"num_blocks={[len(b) for b in arg.blocks]}")
-                # Log the event as a line to the file
-                try:
-                    with open(self._event_file, "a") as f:
-                        f.write(' '.join(to_log) + "\n")
-                except Exception as e:
-                    print(f"[ERROR] Could not log event {name} "
-                          f"for {self.name}: {e}")
-                return attr(*args, **kwargs)
-            return wrapper
-        return attr
-KVConnectorFactory.register_connector("TestSharedStorageConnector",
-                                      TestSharedStorageConnector.__module__,
-                                      TestSharedStorageConnector.__name__)
 # Helper function to compare directories recursively
 def _compare_directories(dir1: Path, dir2: Path) -> bool:
    """Compares two directories recursively for identical content."""
@@ -115,19 +53,27 @@ def test_multi_shared_storage_connector_consistency():
        kv_role="kv_both",
        kv_connector_extra_config={
            "connectors": [{
-                "kv_connector": "TestSharedStorageConnector",
+                "kv_connector":
-                "kv_role": "kv_both",
+                "TestSharedStorageConnector",
+                "kv_role":
+                "kv_both",
                "kv_connector_extra_config": {
                    "shared_storage_path": str(storage_1_path),
                    "name": "storage1",
-                }
+                },
+                "kv_connector_module_path":
+                "tests.v1.kv_connector.unit.utils",
            }, {
-                "kv_connector": "TestSharedStorageConnector",
+                "kv_connector":
-                "kv_role": "kv_both",
+                "TestSharedStorageConnector",
+                "kv_role":
+                "kv_both",
                "kv_connector_extra_config": {
                    "shared_storage_path": str(storage_2_path),
                    "name": "storage2",
-                }
+                },
+                "kv_connector_module_path":
+                "tests.v1.kv_connector.unit.utils",
            }]
        },
    )

--- a/tests/v1/kv_connector/unit/test_nixl_connector.py
+++ b/tests/v1/kv_connector/unit/test_nixl_connector.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
+import tempfile
+import textwrap
 import time
-import uuid
-from collections import defaultdict
-from typing import Optional
 from unittest.mock import patch
 import pytest
+import ray
+from vllm import LLM
+from vllm.config import KVTransferConfig
 from vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector import (
    KVConnectorRole, NixlAgentMetadata, NixlConnector, NixlConnectorMetadata,
    NixlConnectorWorker)
 from vllm.forward_context import ForwardContext
+from vllm.mocks.mock_nixl_connector import FakeNixlWrapper
+from vllm.sampling_params import SamplingParams
 from .utils import create_request, create_scheduler, create_vllm_config
+def _make_stub_pkg() -> str:
+    """Create and return a temp dir (not removed here) whose stub `nixl._api`
+       package re-exports FakeNixlWrapper under the name `nixl_agent`."""
+    td = tempfile.mkdtemp()
+    pkg_root = os.path.join(td, "nixl", "_api")
+    os.makedirs(pkg_root, exist_ok=True)
+    stub = textwrap.dedent("""\
+        # Forward the real FakeNixlWrapper that the driver already defined.
+        print("In fake package")
+        from vllm.mocks.mock_nixl_connector import FakeNixlWrapper as nixl_agent
+    """)
+    with open(os.path.join(pkg_root, "__init__.py"), "w") as f:
+        f.write(stub)
+    # touch parent package
+    open(os.path.join(td, "nixl", "__init__.py"), "w").close()
+    return td
 def test_basic_interface():
    """Unit test for basic NixlConnector interface functionality."""
@@ -41,9 +66,9 @@ def test_basic_interface():
    assert kv_connector_metadata is not None
    assert isinstance(kv_connector_metadata, NixlConnectorMetadata)
-    assert len(kv_connector_metadata.requests) == 1
+    assert len(kv_connector_metadata.reqs_to_recv) == 1
-    assert request_id in kv_connector_metadata.requests
+    assert request_id in kv_connector_metadata.reqs_to_recv
-    req_meta = kv_connector_metadata.requests[request_id]
+    req_meta = kv_connector_metadata.reqs_to_recv[request_id]
    for block_id, block in zip(
            req_meta.local_block_ids, scheduler.kv_cache_manager.coordinator.
@@ -78,83 +103,12 @@ def test_prompt_less_than_block_size():
    kv_connector_metadata = scheduler_output.kv_connector_metadata
    assert kv_connector_metadata is not None
    assert isinstance(kv_connector_metadata, NixlConnectorMetadata)
-    assert len(kv_connector_metadata.requests) == 0
+    assert len(kv_connector_metadata.reqs_to_recv) == 0
    # This request should be scheduled regularly.
    assert len(scheduler_output.scheduled_new_reqs) == 1
-class FakeNixlWrapper:
-    """Mock implementation of NixlWrapper for testing.
-    We don't inherit from nixl._api.nixl_agent because nixl may not be
-    installed.
-    """
-    AGENT_METADATA = b"fake_agent_metadata"
-    REMOTE_AGENT_NAME = "remote_agent"
-    def __init__(self, agent_name: str, *args, **kwargs):
-        self._cycles_before_xfer_done = 0
-        self._check_xfer_state_cycles: defaultdict[int, int] = defaultdict(
-            lambda: 0)
-    def get_reg_descs(self, caches_data, memory_type: str) -> list:
-        return [str(uuid.uuid4()) for _ in caches_data]
-    def register_memory(self, descs) -> None:
-        pass
-    def get_xfer_descs(self, blocks_data, memory_type: str) -> list:
-        return [str(uuid.uuid4()) for _ in blocks_data]
-    def prep_xfer_dlist(self, agent_name: str, descs: list) -> int:
-        return uuid.uuid4().int
-    def get_agent_metadata(self) -> bytes:
-        return self.AGENT_METADATA
-    def add_remote_agent(self, agent_metadata: bytes) -> str:
-        return self.REMOTE_AGENT_NAME
-    def get_new_notifs(self) -> dict[str, list[bytes]]:
-        # Used to collect done_sending, which we don't test yet.
-        return {}
-    def check_xfer_state(self, handle: int) -> str:
-        if self._check_xfer_state_cycles[
-                handle] >= self._cycles_before_xfer_done:
-            return "DONE"
-        self._check_xfer_state_cycles[handle] += 1
-        return "PROC"
-    def release_xfer_handle(self, handle: int) -> None:
-        pass
-    def send_notif(self, agent_name: str, notif_msg: bytes) -> None:
-        pass
-    def make_prepped_xfer(self,
-                          xfer_type: str,
-                          local_xfer_side_handle: int,
-                          local_block_descs_ids: list[int],
-                          remote_xfer_side_handle: int,
-                          remote_block_descs_ids: list[int],
-                          notif_msg: Optional[bytes] = None) -> int:
-        return uuid.uuid4().int
-    def transfer(self, handle: int) -> str:
-        return "PROC"
-    ############################################################
-    # Follow are for changing the behavior during testing.
-    ############################################################
-    def set_cycles_before_xfer_done(self, cycles: int):
-        """Set the number of cycles before a transfer is considered done."""
-        self._cycles_before_xfer_done = cycles
 class FakeNixlConnectorWorker(NixlConnectorWorker):
    REMOTE_ENGINE_ID = "remote_engine"
@@ -163,8 +117,8 @@ class FakeNixlConnectorWorker(NixlConnectorWorker):
        super().__init__(*args, **kwargs)
        self._hand_shake_latency = hand_shake_latency
-    def _nixl_handshake(self, host: str, port: int,
+    def _nixl_handshake(self, host: str, port: int, remote_tp_size: int,
-                        remote_tp_size: int) -> dict[int, str]:
+                        expected_engine_id: str) -> dict[int, str]:
        # Mimic slow _nixl_handshake, as well as bypass zmq communication.
        time.sleep(self._hand_shake_latency)
        # These should've been done in register_kv_caches(), called by
@@ -174,6 +128,8 @@ class FakeNixlConnectorWorker(NixlConnectorWorker):
        self.num_blocks = 1
        self.dst_num_blocks[self.engine_id] = self.num_blocks
+        assert expected_engine_id == self.REMOTE_ENGINE_ID
        remote_agent_name = self.add_remote_agent(
            NixlAgentMetadata(
                engine_id=self.REMOTE_ENGINE_ID,
@@ -371,3 +327,86 @@ class TestNixlHandshake:
                if cnt_finished_reqs == total_reqs:
                    return
        raise TimeoutError("Took too long to complete async handshake.")
+# NOTE: resource cleanup in the mp backend is a bit finicky, so the order of
+# the parametrized backends matters. Run ray first (it cleans up its
+# resources), then the remaining backends.
+@pytest.mark.parametrize("distributed_executor_backend", ["ray", None])
+@patch(
+    "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper",
+    FakeNixlWrapper)
+def test_abort_timeout_on_prefiller(monkeypatch, distributed_executor_backend):
+    """
+    Test lifecycle of an aborted Remote Prefill request hitting the timeout.
+    -----> P
+            |  {process request}
+     <-/--- |  {result is NOT delivered, eg proxy is down}
+            |
+            |
+            |  {eventually free blocks}
+
+    The test issues three single-token requests flagged with
+    ``do_remote_decode``. It checks that the blocks of request '0' stay
+    allocated after the request finishes, and that they are gone once
+    ``timeout`` has elapsed and another request has been processed.
+    """
+    model_name = "Qwen/Qwen3-0.6B"
+    kv_transfer_config = KVTransferConfig(
+        kv_connector="NixlConnector",
+        kv_role="kv_both",
+    )
+    # Same value is used for the env var and for time.sleep() below.
+    # NOTE(review): presumably the env var is interpreted in seconds - confirm.
+    timeout = 6
+    # NOTE(review): presumably keeps the engine core in-process so that the
+    # scheduler can be inspected directly further down - confirm.
+    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
+    monkeypatch.setenv("VLLM_NIXL_ABORT_REQUEST_TIMEOUT", str(timeout))
+    # Build runtime_env only if we're using Ray
+    if distributed_executor_backend == "ray":
+        runtime_env = {
+            "working_dir": _make_stub_pkg(),  # ship stub package
+            # The timeout is passed to Ray explicitly as well as being set
+            # through monkeypatch above.
+            "env_vars": {
+                "VLLM_NIXL_ABORT_REQUEST_TIMEOUT": str(timeout),
+            },
+        }
+        ray.init(runtime_env=runtime_env)
+    llm = LLM(
+        model=model_name,
+        enforce_eager=True,
+        gpu_memory_utilization=0.5,
+        kv_transfer_config=kv_transfer_config,
+        distributed_executor_backend=distributed_executor_backend,
+    )
+    remote_prefill_opts = {
+        "do_remote_decode": True,
+        "do_remote_prefill": False,
+        "remote_engine_id": None,
+        "remote_block_ids": None,
+        "remote_host": None,
+        "remote_port": None,
+    }
+    # Simulate sidecar request
+    sampling_params = SamplingParams(
+        temperature=0.0,
+        max_tokens=1,
+        extra_args={"kv_transfer_params": remote_prefill_opts})
+    scheduler = llm.llm_engine.engine_core.engine_core.scheduler
+    req_to_blocks = scheduler.kv_cache_manager.coordinator.single_type_managers[
+        0].req_to_blocks
+    # The parentheses are required: adjacent string literals only concatenate
+    # inside a single expression. Without them just the first literal is
+    # assigned and the following lines are no-op expression statements, which
+    # leaves the prompt far shorter than intended.
+    padding = (
+        "Just making this request a little longer so that we're sure "
+        "we're not hitting the small-request lower bound beneath which we don't "
+        "actually trigger the whole kv transfer, but rather just recompute the "
+        "blocks on D.")
+    _ = llm.generate([f"What is the capital of Japan? {padding}"],
+                     sampling_params)
+    # Request finished but not freed
+    assert '0' in scheduler.finished_req_ids and '0' in req_to_blocks
+    # Some other request, 0 still not freed
+    _ = llm.generate([f"What is the capital of Italy? {padding}"],
+                     sampling_params)
+    assert '0' in req_to_blocks
+    assert '1' in scheduler.finished_req_ids and '1' in req_to_blocks
+    # Wait for timeout and trigger another scheduler loop
+    time.sleep(timeout)
+    _ = llm.generate([f"What is the capital of France? {padding}"],
+                     sampling_params)
+    # Request-0 times out and is cleared!
+    assert '0' not in req_to_blocks
--- a/tests/v1/kv_connector/unit/test_output_aggreagator.py
+++ b/tests/v1/kv_connector/unit/test_output_aggreagator.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from concurrent.futures import Future
+from typing import Optional
+from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator
+from vllm.v1.outputs import ModelRunnerOutput
+class DummyModelRunnerOutput(ModelRunnerOutput):
+    """ModelRunnerOutput stand-in that only carries the two KV-transfer sets.
+
+    The parent initializer is not called; only ``finished_sending`` and
+    ``finished_recving`` are set on the instance.
+    """
+    def __init__(self,
+                 finished_sending: Optional[set[str]] = None,
+                 finished_recving: Optional[set[str]] = None):
+        self.finished_sending, self.finished_recving = (finished_sending,
+                                                        finished_recving)
+def test_aggregate_workers_output():
+    """Run three rounds of two-worker outputs through one aggregator.
+
+    All rounds share a single ``KVOutputAggregator(world_size=2)``, so their
+    order matters. The expectations below are consistent with a request id
+    being surfaced only after both workers have reported it, possibly in
+    different rounds.
+    """
+    aggregator = KVOutputAggregator(world_size=2)
+    # Each round: (worker-1 kwargs, worker-2 kwargs,
+    #              expected finished_sending, expected finished_recving)
+    rounds = [
+        (dict(finished_sending={'req1'}, finished_recving={'req2'}),
+         dict(finished_sending=None, finished_recving=None),
+         None, None),
+        (dict(finished_sending=None, finished_recving=None),
+         dict(finished_sending={'req1'}, finished_recving=None),
+         {'req1'}, None),
+        (dict(finished_sending=None, finished_recving=None),
+         dict(finished_sending={'req1'}, finished_recving={'req2'}),
+         None, {'req2'}),
+    ]
+    for first_kwargs, second_kwargs, want_sending, want_recving in rounds:
+        output1 = DummyModelRunnerOutput(**first_kwargs)
+        output2 = DummyModelRunnerOutput(**second_kwargs)
+        aggregated = aggregator.aggregate([output1, output2])
+        # The first worker's output object is returned as the aggregate.
+        assert aggregated is output1
+        if want_sending is None:
+            assert aggregated.finished_sending is None
+        else:
+            assert aggregated.finished_sending == want_sending
+        if want_recving is None:
+            assert aggregated.finished_recving is None
+        else:
+            assert aggregated.finished_recving == want_recving
+def test_async_aggregate_workers_output():
+    """Future-based variant of the three-round aggregation check.
+
+    In every round the two worker futures are passed to ``async_aggregate``
+    first and resolved afterwards. The returned future must be done as soon
+    as both inputs have a result. All rounds share one aggregator, so their
+    order matters.
+    """
+    aggregator = KVOutputAggregator(world_size=2)
+    # Each round: (worker-1 kwargs, worker-2 kwargs,
+    #              expected finished_sending, expected finished_recving)
+    rounds = [
+        (dict(finished_sending={'req1'}, finished_recving={'req2'}),
+         dict(finished_sending=None, finished_recving=None),
+         None, None),
+        (dict(finished_sending=None, finished_recving=None),
+         dict(finished_sending={'req1'}, finished_recving=None),
+         {'req1'}, None),
+        (dict(finished_sending=None, finished_recving=None),
+         dict(finished_sending={'req1'}, finished_recving={'req2'}),
+         None, {'req2'}),
+    ]
+    for first_kwargs, second_kwargs, want_sending, want_recving in rounds:
+        future1: Future[DummyModelRunnerOutput] = Future()
+        future2: Future[DummyModelRunnerOutput] = Future()
+        # Register the aggregation before either worker has produced output.
+        result_future = aggregator.async_aggregate([future1, future2])
+        output1 = DummyModelRunnerOutput(**first_kwargs)
+        output2 = DummyModelRunnerOutput(**second_kwargs)
+        future1.set_result(output1)
+        future2.set_result(output2)
+        assert result_future.done()
+        aggregated = result_future.result()
+        # The first worker's output object is returned as the aggregate.
+        assert aggregated is output1
+        if want_sending is None:
+            assert aggregated.finished_sending is None
+        else:
+            assert aggregated.finished_sending == want_sending
+        if want_recving is None:
+            assert aggregated.finished_recving is None
+        else:
+            assert aggregated.finished_recving == want_recving
--- a/tests/v1/kv_connector/unit/utils.py
+++ b/tests/v1/kv_connector/unit/utils.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import tempfile
+from collections import defaultdict
 from typing import Any, Optional
 import torch
@@ -7,6 +9,11 @@ import torch
 from vllm import SamplingParams
 from vllm.config import (CacheConfig, DeviceConfig, KVTransferConfig,
                         ModelConfig, SchedulerConfig, VllmConfig)
+from vllm.distributed.kv_transfer.kv_connector.factory import (
+    KVConnectorFactory)
+from vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector import (  # noqa
+    SharedStorageConnector)
+from vllm.v1.core.kv_cache_manager import KVCacheBlocks
 from vllm.v1.core.sched.scheduler import Scheduler
 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
                                        KVCacheGroupSpec)
@@ -187,3 +194,58 @@ def create_model_runner_output(
        finished_sending=finished_sending,
        finished_recving=finished_recving,
    )
+class TestSharedStorageConnector(SharedStorageConnector):
+    """SharedStorageConnector wrapper that records every call made to it.
+
+    A real ``SharedStorageConnector`` is held in ``_connector``. Attribute
+    access is routed to it through ``__getattribute__``; each call to one of
+    its callables bumps ``call_record`` and appends one line to a per-connector
+    event file in the system temp directory.
+    """
+    def __init__(self, config: VllmConfig, role):
+        """Create the wrapped connector and an empty event file.
+
+        The parent ``__init__`` is not called on this object; all connector
+        state lives on the wrapped instance.
+        """
+        # The connector's name comes from the "name" key of the extra config.
+        self.name = config.kv_transfer_config.kv_connector_extra_config["name"]
+        self._connector = SharedStorageConnector(config, role)
+        # Number of calls seen per delegated method name.
+        self.call_record: dict[str, int] = defaultdict(int)
+        # Use a unique temp file per connector
+        # `role` is never set on this object, so `self.role` is resolved by
+        # __getattribute__ below.
+        # NOTE(review): presumably the wrapped connector exposes `role` -
+        # confirm against SharedStorageConnector.
+        self._event_file = tempfile.gettempdir(
+        ) + f"/connector_{self.name}-{self.role.name}_events.log"
+        # Start with an empty file
+        with open(self._event_file, "w") as _:
+            pass
+    def __getattribute__(self, name):
+        """Resolve ``name``, preferring the wrapped connector.
+
+        Names in the allow-list below, and names the wrapped connector does
+        not have, are looked up on this object. Everything else comes from the
+        wrapped connector; callables are returned wrapped in a logging shim,
+        other attributes are returned as-is.
+        """
+        if name in ("_connector", "call_record", "name", "_event_file",
+                    "__class__", "__dict__", "__getattribute__",
+                    "__init__"):  # avoid recursion
+            return object.__getattribute__(self, name)
+        if not hasattr(self._connector, name):
+            return object.__getattribute__(self, name)
+        attr = getattr(self._connector, name)
+        # Intercept calls to the connector interface and write an event
+        # for each one to a file, which can be read back in the main test proc.
+        if callable(attr):
+            def wrapper(*args, **kwargs):
+                self.call_record[name] += 1
+                # Include args that we're interested in
+                # Only positional int and KVCacheBlocks arguments are logged;
+                # other positional arguments and all keyword arguments are not.
+                to_log = [name]
+                for arg in args:
+                    if isinstance(arg, int):
+                        to_log.append(str(arg))
+                    elif isinstance(arg, KVCacheBlocks):
+                        to_log.append(
+                            f"num_blocks={[len(b) for b in arg.blocks]}")
+                # Log the event as a line to the file
+                # Logging is best-effort: a failed write is printed and the
+                # real method is still called.
+                try:
+                    with open(self._event_file, "a") as f:
+                        f.write(' '.join(to_log) + "\n")
+                except Exception as e:
+                    print(f"[ERROR] Could not log event {name} "
+                          f"for {self.name}: {e}")
+                return attr(*args, **kwargs)
+            return wrapper
+        return attr
+# Register the wrapper with the connector factory under the name
+# "TestSharedStorageConnector", passing this module's name and the class name.
+KVConnectorFactory.register_connector("TestSharedStorageConnector", __name__,
+                                      TestSharedStorageConnector.__name__)
--- a/tests/v1/metrics/test_ray_metrics.py
+++ b/tests/v1/metrics/test_ray_metrics.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
 import pytest
 import ray
+from vllm.config import ModelDType
 from vllm.sampling_params import SamplingParams
 from vllm.v1.engine.async_llm import AsyncEngineArgs, AsyncLLM
 from vllm.v1.metrics.ray_wrappers import RayPrometheusStatLogger
@@ -27,7 +30,7 @@ MODELS = [
 def test_engine_log_metrics_ray(
    example_prompts,
    model: str,
-    dtype: str,
+    dtype: ModelDType,
    max_tokens: int,
 ) -> None:
    """ Simple smoke test, verifying this can be used without exceptions.
@@ -37,11 +40,14 @@ def test_engine_log_metrics_ray(
    class EngineTestActor:
        async def run(self):
-            engine_args = AsyncEngineArgs(
+            # Set environment variable inside the Ray actor since environment
-                model=model,
+            # variables from pytest fixtures don't propagate to Ray actors
-                dtype=dtype,
+            os.environ['VLLM_USE_V1'] = '1'
-                disable_log_stats=False,
-            )
+            engine_args = AsyncEngineArgs(model=model,
+                                          dtype=dtype,
+                                          disable_log_stats=False,
+                                          enforce_eager=True)
            engine = AsyncLLM.from_engine_args(
                engine_args, stat_loggers=[RayPrometheusStatLogger])

--- a/tests/v1/sample/test_logprobs.py
+++ b/tests/v1/sample/test_logprobs.py
@@ -13,6 +13,7 @@ from tests.v1.sample.utils import (
    assert_incr_detok_str_matches_non_incr_detok_str,
    compute_correct_cumulative_logprob, get_test_batch)
 from vllm import SamplingParams
+from vllm.config import LogprobsMode
 from ...conftest import HfRunner, VllmRunner
 from ...utils import models_path_prefix
@@ -114,7 +115,7 @@ def _run_and_validate(
    max_tokens: int,
    do_apc: bool,
 ) -> None:
-    vllm_results = vllm_model.model.generate(
+    vllm_results = vllm_model.llm.generate(
        test_prompts, sampling_params=vllm_sampling_params)
    for vllm_result, hf_logprob, hf_output, logprob_prompt_logprob in zip(
@@ -290,7 +291,7 @@ def test_get_logprobs_and_prompt_logprobs(
    """
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")
-        do_apc = vllm_model.model.llm_engine.cache_config.enable_prefix_caching
+        do_apc = vllm_model.llm.llm_engine.cache_config.enable_prefix_caching
        if do_apc and (temperature < 2.0
                       or batch_logprobs_composition != SAMPLE_PROMPT):
            # Skip some test-cases to save time.
@@ -380,7 +381,7 @@ def test_none_logprobs(vllm_model, example_prompts,
            prompt_logprobs=None,
            temperature=0.0,
        )
-        results_logprobs_none = vllm_model.model.generate(
+        results_logprobs_none = vllm_model.llm.generate(
            example_prompts,
            sampling_params=sampling_params_logprobs_none,
        )
@@ -410,7 +411,7 @@ def test_zero_logprobs(vllm_model, example_prompts,
                                                       logprobs=0,
                                                       prompt_logprobs=0,
                                                       temperature=0.0)
-        results_logprobs_zero = vllm_model.model.generate(
+        results_logprobs_zero = vllm_model.llm.generate(
            example_prompts, sampling_params=sampling_params_logprobs_zero)
        for i in range(len(results_logprobs_zero)):
@@ -428,3 +429,45 @@ def test_zero_logprobs(vllm_model, example_prompts,
            # prompt token
            assert prompt_logprobs is not None
            assert len(prompt_token_ids) == len(prompt_logprobs)
+@pytest.mark.parametrize(
+    "logprobs_mode",
+    ["raw_logprobs", "raw_logits", "processed_logprobs", "processed_logits"])
+def test_logprobs_mode(logprobs_mode: LogprobsMode,
+                       monkeypatch: pytest.MonkeyPatch):
+    """Run the LLM engine under each logprobs_mode and check value signs.
+
+    In the ``*_logprobs`` modes every returned value must be non-positive.
+    In the ``*_logits`` modes at least one returned value must be positive.
+    """
+    from vllm import LLM
+    # The mode names encode which kind of value is expected.
+    expect_logprobs = "logprobs" in logprobs_mode
+    expect_logits = "logits" in logprobs_mode
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1")
+        llm = LLM(
+            "facebook/opt-125m",
+            max_logprobs=5,
+            enable_prefix_caching=False,
+            # 2 other llms alive during whole session
+            gpu_memory_utilization=0.05,
+            max_model_len=16,
+            logprobs_mode=logprobs_mode)
+        vllm_sampling_params = SamplingParams(logprobs=1)
+        results = llm.generate(["Hello world"],
+                               sampling_params=vllm_sampling_params)
+        completions = results[0].outputs
+        num_values = 0
+        num_positive = 0
+        for completion in completions:
+            for position_logprobs in completion.logprobs:
+                for entry in position_logprobs.values():
+                    if expect_logprobs:
+                        assert entry.logprob <= 0
+                    if entry.logprob > 0:
+                        num_positive += 1
+                    num_values += 1
+        # At least one value must have been returned per completion.
+        assert num_values >= len(completions)
+        if expect_logits:
+            assert num_positive > 0
+        del llm
--- a/tests/v1/sample/test_sampling_params_e2e.py
+++ b/tests/v1/sample/test_sampling_params_e2e.py
@@ -15,30 +15,30 @@ PROMPT = "Hello my name is Robert and I"
 @pytest.fixture(scope="module")
-def model() -> LLM:
+def llm() -> LLM:
    # Disable prefix caching so that we can test prompt logprobs.
    # TODO remove this after https://github.com/vllm-project/vllm/pull/13949
    # is merged
    return LLM(MODEL, enforce_eager=True, enable_prefix_caching=False)
-def test_n_gt_1(model):
+def test_n_gt_1(llm):
    """ParallelSampling is supported."""
    params = SamplingParams(n=3)
-    outputs = model.generate(PROMPT, params)
+    outputs = llm.generate(PROMPT, params)
    assert len(outputs[0].outputs) == 3
-def test_best_of(model):
+def test_best_of(llm):
    """Raise a ValueError since best_of is deprecated."""
    params = SamplingParams(n=2, best_of=3)
    with pytest.raises(ValueError):
-        _ = model.generate(PROMPT, params)
+        _ = llm.generate(PROMPT, params)
-def test_penalties(model):
+def test_penalties(llm):
    """Check that we do not get errors if applied."""
    params = SamplingParams(
@@ -50,18 +50,18 @@ def test_penalties(model):
        top_p=0.5,
        top_k=3,
    )
-    _ = model.generate(PROMPT, params)
+    _ = llm.generate(PROMPT, params)
-def test_stop(model):
+def test_stop(llm):
    """Check that we respect the stop words."""
-    output = model.generate(PROMPT, SamplingParams(temperature=0))
+    output = llm.generate(PROMPT, SamplingParams(temperature=0))
    split_text = output[0].outputs[0].text.split()
    STOP_IDX = 5
    params = SamplingParams(temperature=0, stop=split_text[STOP_IDX])
-    output = model.generate(PROMPT, params)
+    output = llm.generate(PROMPT, params)
    new_split_text = output[0].outputs[0].text.split()
    # Output should not contain the stop word.
@@ -70,40 +70,40 @@ def test_stop(model):
    params = SamplingParams(temperature=0,
                            stop=split_text[STOP_IDX],
                            include_stop_str_in_output=True)
-    output = model.generate(PROMPT, params)
+    output = llm.generate(PROMPT, params)
    new_split_text = output[0].outputs[0].text.split()
    # Output should contain the stop word.
    assert len(new_split_text) == STOP_IDX + 1
-def test_stop_token_ids(model):
+def test_stop_token_ids(llm):
    """Check that we respect the stop token ids."""
-    output = model.generate(PROMPT, SamplingParams(temperature=0))
+    output = llm.generate(PROMPT, SamplingParams(temperature=0))
    stop_token_id_0 = output[0].outputs[0].token_ids[5]
    stop_token_id_1 = output[0].outputs[0].token_ids[6]
    stop_token_ids = [stop_token_id_1, stop_token_id_0]
    params = SamplingParams(temperature=0, stop_token_ids=stop_token_ids)
-    output = model.generate(PROMPT, params)
+    output = llm.generate(PROMPT, params)
    assert output[0].outputs[0].token_ids[-1] == stop_token_id_0
    stop_token_ids = [stop_token_id_0, stop_token_id_1]
    params = SamplingParams(temperature=0, stop_token_ids=stop_token_ids)
-    output = model.generate(PROMPT, params)
+    output = llm.generate(PROMPT, params)
    assert output[0].outputs[0].token_ids[-1] == stop_token_id_0
-def test_detokenize_false(model):
+def test_detokenize_false(llm):
    """Check that detokenize=False option works."""
-    output = model.generate(PROMPT, SamplingParams(detokenize=False))
+    output = llm.generate(PROMPT, SamplingParams(detokenize=False))
    assert len(output[0].outputs[0].token_ids) > 0
    assert len(output[0].outputs[0].text) == 0
-    output = model.generate(
+    output = llm.generate(
        PROMPT, SamplingParams(detokenize=False, logprobs=3,
                               prompt_logprobs=3))
    assert len(output[0].outputs[0].token_ids) > 0
@@ -119,28 +119,28 @@ def test_detokenize_false(model):
            assert all(lp.decoded_token is None for lp in logprobs.values())
-def test_bad_words(model):
+def test_bad_words(llm):
    """Check that we respect bad words."""
-    output = model.generate(PROMPT, SamplingParams(temperature=0))
+    output = llm.generate(PROMPT, SamplingParams(temperature=0))
    split_text = output[0].outputs[0].text.split()
    bad_words_1 = " ".join(split_text[:2])
    params = SamplingParams(temperature=0, bad_words=[bad_words_1])
-    output = model.generate(PROMPT, params)
+    output = llm.generate(PROMPT, params)
    new_text = output[0].outputs[0].text
    assert bad_words_1 not in new_text
    bad_words_2 = new_text.split()[-1]
    params = SamplingParams(temperature=0,
                            bad_words=[bad_words_1, bad_words_2])
-    output = model.generate(PROMPT, params)
+    output = llm.generate(PROMPT, params)
    new_text = output[0].outputs[0].text
    assert bad_words_1 not in new_text
    assert bad_words_2 not in new_text
-def test_logits_processor(model):
+def test_logits_processor(llm):
    """Check that we reject logits processor."""
    # This sample logits processor gives infinite score to the i-th token,
@@ -151,47 +151,45 @@ def test_logits_processor(model):
        return logits
    with pytest.raises(ValueError):
-        _ = model.generate(PROMPT,
+        _ = llm.generate(PROMPT, SamplingParams(logits_processors=[pick_ith]))
-                           SamplingParams(logits_processors=[pick_ith]))
-def test_allowed_token_ids(model):
+def test_allowed_token_ids(llm):
    """Check that we can use allowed_token_ids."""
    TOKEN_ID = 10
    allowed_token_ids = [TOKEN_ID]
-    output = model.generate(
+    output = llm.generate(PROMPT,
-        PROMPT, SamplingParams(allowed_token_ids=allowed_token_ids))
+                          SamplingParams(allowed_token_ids=allowed_token_ids))
    assert output[0].outputs[0].token_ids[-1] == TOKEN_ID
    # Reject empty allowed_token_ids.
    with pytest.raises(ValueError):
-        _ = model.generate(PROMPT, SamplingParams(allowed_token_ids=[]))
+        _ = llm.generate(PROMPT, SamplingParams(allowed_token_ids=[]))
    # Reject negative token id.
    with pytest.raises(ValueError):
-        _ = model.generate(PROMPT, SamplingParams(allowed_token_ids=[-1]))
+        _ = llm.generate(PROMPT, SamplingParams(allowed_token_ids=[-1]))
    # Reject out of vocabulary.
    with pytest.raises(ValueError):
-        _ = model.generate(PROMPT,
+        _ = llm.generate(PROMPT, SamplingParams(allowed_token_ids=[10000000]))
-                           SamplingParams(allowed_token_ids=[10000000]))
-def test_priority(model):
+def test_priority(llm):
    """Check that we reject requests with priority."""
    # Reject all allowed token ids
    with pytest.raises(ValueError):
-        _ = model.generate(PROMPT, priority=[1])
+        _ = llm.generate(PROMPT, priority=[1])
-def test_seed(model):
+def test_seed(llm):
    """Check that seed impacts randomness."""
-    out_1 = model.generate(PROMPT, SamplingParams(seed=42))
+    out_1 = llm.generate(PROMPT, SamplingParams(seed=42))
-    out_2 = model.generate(PROMPT, SamplingParams(seed=42))
+    out_2 = llm.generate(PROMPT, SamplingParams(seed=42))
-    out_3 = model.generate(PROMPT, SamplingParams(seed=43))
+    out_3 = llm.generate(PROMPT, SamplingParams(seed=43))
    assert out_1[0].outputs[0].text == out_2[0].outputs[0].text
    assert out_1[0].outputs[0].text != out_3[0].outputs[0].text
--- a/tests/v1/spec_decode/test_eagle.py
+++ b/tests/v1/spec_decode/test_eagle.py
@@ -6,6 +6,10 @@ from unittest import mock
 import pytest
 import torch
+from tests.v1.attention.utils import (BatchSpec, _Backend,
+                                      create_common_attn_metadata,
+                                      create_standard_kv_cache_spec,
+                                      get_attention_backend)
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, ModelConfig,
                         ParallelConfig, SchedulerConfig, SpeculativeConfig,
                         VllmConfig)
@@ -64,13 +68,19 @@ def test_prepare_inputs():
    """
    device = torch.device(current_platform.device_type)
-    # a = 4, b = 7, c = 5
+    # q1 = 4, q2 = 7, q3 = 5
    # n1 = 1, n2 = 3, n3 = 2
-    # Cumulative lengths: [0, 4, 11, 16]
+    batch_spec = BatchSpec(
-    cu_target_query_lens = torch.tensor([0, 4, 11, 16],
+        seq_lens=[4, 7, 5],
-                                        dtype=torch.int32,
+        query_lens=[4, 7, 5],
-                                        device=device)
+    )
+    common_attn_metadata = create_common_attn_metadata(
+        batch_spec,
+        block_size=16,
+        device=device,
+    )
    # Rejected tokens per request: [1, 3, 2]
    num_rejected_tokens = torch.tensor([1, 3, 2],
@@ -104,15 +114,13 @@ def test_prepare_inputs():
        ],
        dtype=torch.int32,
        device=device)
+    proposer = _create_proposer("eagle", 1)
-    # n1 + n2 + n3 - a - b -c
+    updated_metadata, token_indices = proposer.prepare_inputs(
-    num_tokens = cu_target_query_lens[-1].item() - num_rejected_tokens.sum(
+        common_attn_metadata, num_rejected_tokens.cpu())
-    ).item()
-    cu_num_tokens, token_indices = EagleProposer.prepare_inputs(
+    assert torch.equal(updated_metadata.query_start_loc,
-        cu_target_query_lens, num_rejected_tokens, num_tokens)
+                       expected_cu_num_tokens)
-    assert torch.equal(cu_num_tokens, expected_cu_num_tokens)
    assert token_indices.shape[0] == expected_cu_num_tokens[-1].item()
    assert torch.equal(token_indices, expected_token_indices)
@@ -209,6 +217,7 @@ def test_propose(num_speculative_tokens):
    seq_len_2 = 3
    total_tokens = seq_len_1 + seq_len_2
    vocab_size = 100
+    seq_lens = [seq_len_1, seq_len_2]
    # Create proposer first so we can use its actual hidden_size
    proposer = _create_proposer("eagle", num_speculative_tokens)
@@ -270,9 +279,16 @@ def test_propose(num_speculative_tokens):
    proposer.attn_layer_names = ["layer.0"]
    # Create input tensors
-    cu_num_tokens = torch.tensor([0, seq_len_1, total_tokens],
+    batch_spec = BatchSpec(
-                                 dtype=torch.int32,
+        seq_lens=seq_lens,
-                                 device=device)
+        query_lens=seq_lens,
+    )
+    common_attn_metadata = create_common_attn_metadata(
+        batch_spec,
+        block_size=16,
+        device=device,
+    )
    target_token_ids = torch.randint(0,
                                     vocab_size, (total_tokens, ),
@@ -284,25 +300,29 @@ def test_propose(num_speculative_tokens):
    target_hidden_states = torch.randn(total_tokens,
                                       hidden_size,
                                       device=device)
-    target_slot_mapping = torch.randint(0,
-                                        100, (total_tokens, ),
-                                        device=device)
    next_token_ids = torch.randint(0,
                                   vocab_size, (batch_size, ),
                                   dtype=torch.int32,
                                   device=device)
-    block_table = torch.randint(0, 10, (batch_size, 10), device=device)
    sampling_metadata = mock.MagicMock()
-    # Call the method under test
+    attn_metadata_builder_cls, _ = get_attention_backend(
+        _Backend.FLASH_ATTN_VLLM_V1)
+    attn_metadata_builder = attn_metadata_builder_cls(
+        kv_cache_spec=create_standard_kv_cache_spec(proposer.vllm_config),
+        vllm_config=proposer.vllm_config,
+        device=device,
+    )
+    # Mock runner for attention metadata building
+    proposer.runner = mock.MagicMock()
+    proposer.runner.attn_metadata_builders = [attn_metadata_builder]
    result = proposer.propose(target_token_ids=target_token_ids,
                              target_positions=target_positions,
                              target_hidden_states=target_hidden_states,
-                              target_slot_mapping=target_slot_mapping,
                              next_token_ids=next_token_ids,
-                              cu_num_tokens=cu_num_tokens,
+                              common_attn_metadata=common_attn_metadata,
-                              block_table=block_table,
                              sampling_metadata=sampling_metadata)
    assert result.shape == (batch_size, num_speculative_tokens)

--- a/tests/v1/test_async_llm_dp.py
+++ b/tests/v1/test_async_llm_dp.py
@@ -93,8 +93,10 @@ async def test_load(output_kind: RequestOutputKind,
        def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
            stats_loggers[engine_index] = self
-        def record(self, scheduler_stats: Optional[SchedulerStats],
+        def record(self,
-                   iteration_stats: Optional[IterationStats]):
+                   scheduler_stats: Optional[SchedulerStats],
+                   iteration_stats: Optional[IterationStats],
+                   engine_idx: int = 0):
            if iteration_stats:
                self.finished_req_count += len(
                    iteration_stats.finished_requests)

--- a/tests/v1/test_external_lb_dp.py
+++ b/tests/v1/test_external_lb_dp.py
@@ -17,7 +17,7 @@ MODEL_NAME = "ibm-research/PowerMoE-3b"
 # Number of data parallel ranks for external LB testing
 DP_SIZE = int(os.getenv("DP_SIZE", "2"))
-# Default tensor parallell size to use
+# Default tensor parallel size to use
 TP_SIZE = int(os.getenv("TP_SIZE", "1"))

--- a/tests/v1/test_hybrid_lb_dp.py
+++ b/tests/v1/test_hybrid_lb_dp.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import asyncio
+import os
+import threading
+import time
+from contextlib import AsyncExitStack
+import openai  # use the official client for correctness check
+import pytest
+import pytest_asyncio
+from tests.utils import RemoteOpenAIServer
+from tests.v1.test_utils import check_request_balancing
+from vllm.platforms import Platform
+# Model served by every server started in this module.
+MODEL_NAME = "ibm-research/PowerMoE-3b"
+# Number of nodes (2 nodes, each with 2 DP ranks by default)
+NUM_NODES = 2
+# Number of data parallel ranks for hybrid LB testing (4 total by default,
+# overridable through the DP_SIZE environment variable)
+DP_SIZE = int(os.environ.get("DP_SIZE", "4"))
+# Default tensor parallel size to use (overridable through TP_SIZE)
+TP_SIZE = int(os.environ.get("TP_SIZE", "1"))
+# DP ranks per node: the total is split evenly across the nodes.
+DP_SIZE_LOCAL = DP_SIZE // NUM_NODES
+class HybridLBServerManager:
+    """Manages hybrid data parallel vLLM server instances where each node
+    runs a single logical API server that balances requests only to the
+    DP engines running on that same node.
+    Entering the context starts one RemoteOpenAIServer per node, each from
+    its own thread, and returns (server, server_args) tuples ordered by node
+    index. Leaving the context stops every server that was started."""
+    def __init__(self,
+                 model_name: str,
+                 dp_size: int,
+                 api_server_count: int,
+                 base_server_args: list,
+                 dp_size_local: int = DP_SIZE_LOCAL,
+                 tp_size: int = TP_SIZE):
+        """Store the topology to launch.
+        Args:
+            model_name: Model passed to every RemoteOpenAIServer.
+            dp_size: Total number of data parallel ranks.
+            api_server_count: Value passed as --api-server-count per node.
+            base_server_args: CLI args shared by all nodes (copied per node).
+            dp_size_local: DP ranks per node; dp_size // dp_size_local nodes
+                are started.
+            tp_size: Tensor parallel size of each DP rank.
+        """
+        self.model_name = model_name
+        self.dp_size = dp_size
+        self.dp_size_local = dp_size_local
+        self.tp_size = tp_size
+        self.api_server_count = api_server_count
+        self.base_server_args = base_server_args
+        self.servers: list[tuple[RemoteOpenAIServer, list[str]]] = []
+        self.server_threads: list[threading.Thread] = []
+        self.num_nodes = dp_size // dp_size_local
+    def __enter__(self) -> list[tuple[RemoteOpenAIServer, list[str]]]:
+        """Start all server instances for hybrid LB mode.
+        Returns:
+            The (server, server_args) tuples, with index i holding node i.
+        Raises:
+            Exception: If any node failed to start. Nodes that did start are
+                stopped first and the first failure is chained as the cause.
+        """
+        # Results are keyed by node id because the starter threads finish in
+        # arbitrary order; appending directly would scramble the node order.
+        started: dict[int, tuple[RemoteOpenAIServer, list[str]]] = {}
+        failures: dict[int, Exception] = {}
+        for node_id in range(self.num_nodes):
+            # Create server args for this specific node
+            server_args = self.base_server_args.copy()
+            # Calculate start rank for this node
+            start_rank = node_id * self.dp_size_local
+            # Add hybrid LB specific arguments
+            server_args.extend([
+                "--data-parallel-size",
+                str(self.dp_size),
+                "--data-parallel-size-local",
+                str(self.dp_size_local),
+                "--data-parallel-start-rank",
+                str(start_rank),
+                "--data-parallel-hybrid-lb",  # Enable hybrid LB mode
+                "--tensor-parallel-size",
+                str(self.tp_size),
+                "--port",
+                str(8000 + node_id),  # Different port for each node
+                "--api-server-count",
+                str(self.api_server_count),
+                "--data-parallel-address",
+                "127.0.0.1",
+                "--data-parallel-rpc-port",
+                "13345",
+            ])
+            # Use a thread to start each server to allow parallel initialization
+            def start_server(node: int, sargs: list[str]):
+                try:
+                    # Calculate GPU devices for this node
+                    gpus_per_node = self.dp_size_local * self.tp_size
+                    gpu_start = node * gpus_per_node
+                    gpu_end = gpu_start + gpus_per_node
+                    # Start the server
+                    server = RemoteOpenAIServer(
+                        self.model_name,
+                        sargs,
+                        auto_port=False,
+                        env_dict={
+                            "CUDA_VISIBLE_DEVICES":
+                            ",".join(
+                                str(Platform.device_id_to_physical_device_id(
+                                    i)) for i in range(gpu_start, gpu_end))
+                        })
+                    server.__enter__()
+                    print(f"Hybrid LB node {node} started successfully with "
+                          f"{self.dp_size_local} local DP ranks and "
+                          f"{self.api_server_count} API servers")
+                    started[node] = (server, sargs)
+                except Exception as e:
+                    print(f"Failed to start hybrid LB node {node}: {e}")
+                    # Re-raising inside the thread would never reach the
+                    # caller of __enter__; hand the error over instead.
+                    failures[node] = e
+            thread = threading.Thread(target=start_server,
+                                      args=(node_id, server_args))
+            thread.start()
+            self.server_threads.append(thread)
+        # Wait for all servers to start
+        for thread in self.server_threads:
+            thread.join()
+        # Publish in node order so that servers[i] really is node i.
+        self.servers.extend(started[node] for node in sorted(started))
+        if len(self.servers) != self.num_nodes:
+            first_error = failures[min(failures)] if failures else None
+            # __exit__ is not invoked when __enter__ raises, so the servers
+            # that did come up must be stopped here to avoid leaking them.
+            self.__exit__(None, None, None)
+            raise Exception("Servers failed to start") from first_error
+        # Give servers additional time to fully initialize and coordinate
+        time.sleep(3)
+        return self.servers
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Stop all server instances.
+        Servers are stopped in reverse order; a failure to stop one server is
+        printed and does not prevent the remaining ones from being stopped."""
+        while self.servers:
+            try:
+                self.servers.pop()[0].__exit__(exc_type, exc_val, exc_tb)
+            except Exception as e:
+                print(f"Error stopping server: {e}")
+@pytest.fixture(scope="module")
+def default_server_args():
+    """Return the CLI arguments shared by every server in this module."""
+    shared_args = []
+    # use half precision for speed and memory savings in CI environment
+    shared_args += ["--dtype", "bfloat16"]
+    shared_args += ["--max-model-len", "2048"]
+    shared_args += ["--max-num-seqs", "128"]
+    shared_args.append("--enforce-eager")
+    return shared_args
+@pytest.fixture(scope="module", params=[1])  # Only 1 API server for now
+def servers(request, default_server_args):
+    """Start the hybrid LB nodes once per module and yield the
+    (server, server_args) tuples; the nodes are stopped on teardown."""
+    manager = HybridLBServerManager(model_name=MODEL_NAME,
+                                    dp_size=DP_SIZE,
+                                    api_server_count=request.param,
+                                    base_server_args=default_server_args,
+                                    dp_size_local=DP_SIZE_LOCAL,
+                                    tp_size=TP_SIZE)
+    with manager as server_list:
+        yield server_list
+@pytest_asyncio.fixture
+async def clients(servers: list[tuple[RemoteOpenAIServer, list[str]]]):
+    """Yield one async client per node, in the same order as ``servers``."""
+    # Each node has its own API endpoint, hence its own client. The exit
+    # stack closes every client once the test is done with them.
+    async with AsyncExitStack() as stack:
+        node_clients = []
+        for node_server, _ in servers:
+            node_client = await stack.enter_async_context(
+                node_server.get_async_client())
+            node_clients.append(node_client)
+        yield node_clients
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [MODEL_NAME],
+)
+async def test_hybrid_lb_completion(clients: list[openai.AsyncOpenAI],
+                                    servers: list[tuple[RemoteOpenAIServer,
+                                                        list[str]]],
+                                    model_name: str) -> None:
+    """Send one request and then two concurrent bursts of completion
+    requests to every node, then run the balancing check on each node."""
+    num_requests_per_node = 25  # Total 50 requests across 2 nodes
+    async def make_request(client: openai.AsyncOpenAI):
+        """Issue one sampled completion and sanity-check the response."""
+        completion = await client.completions.create(
+            prompt="Hello, my name is",
+            model=model_name,
+            temperature=1.0,
+            max_tokens=10)
+        assert completion.id is not None
+        choices = completion.choices
+        assert choices is not None and len(choices) == 1
+        only_choice = choices[0]
+        # With temperature=1.0 the output varies between runs, so only a
+        # minimum text length is required.
+        assert len(only_choice.text) >= 1
+        # The model may stop early, so both 'length' and 'stop' are accepted.
+        assert only_choice.finish_reason in ("length", "stop")
+        # Token counts vary as well; they only need to be positive.
+        usage = completion.usage
+        assert usage.completion_tokens > 0
+        assert usage.prompt_tokens > 0
+        assert usage.total_tokens > 0
+        return completion
+    async def run_burst():
+        """Fire num_requests_per_node concurrent requests at every node."""
+        pending = [
+            make_request(node_client) for node_client in clients
+            for _ in range(num_requests_per_node)
+        ]
+        completions = await asyncio.gather(*pending)
+        assert len(completions) == num_requests_per_node * len(clients)
+        assert all(completion is not None for completion in completions)
+    # Test single request to each node
+    for node_idx, node_client in enumerate(clients):
+        single_result = await make_request(node_client)
+        assert single_result is not None
+        print(f"Hybrid LB node {node_idx} handled single completion "
+              f"request successfully")
+    await asyncio.sleep(0.5)
+    # Send requests to all nodes - each should balance within its local DP ranks
+    await run_burst()
+    await asyncio.sleep(0.5)
+    # Second burst of requests
+    await run_burst()
+    # Recover the API server count from the first node's CLI args for the
+    # summary line, falling back to 1 when the flag is absent.
+    first_node_args = servers[0][1]
+    count_flag = '--api-server-count'
+    api_server_count = 1
+    if count_flag in first_node_args:
+        flag_value = first_node_args[first_node_args.index(count_flag) + 1]
+        api_server_count = flag_value or 1
+    print(
+        f"Successfully completed hybrid LB test with {len(clients)} nodes "
+        f"({DP_SIZE_LOCAL} DP ranks each, API server count: {api_server_count})"
+    )
+    # Check request balancing within each node
+    for node_idx, node_entry in enumerate(servers):
+        print(f"Checking request balancing for node {node_idx}")
+        check_request_balancing(node_entry[0], DP_SIZE_LOCAL)
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [MODEL_NAME],
+)
+async def test_hybrid_lb_completion_streaming(clients: list[
+    openai.AsyncOpenAI], servers: list[tuple[RemoteOpenAIServer, list[str]]],
+                                              model_name: str) -> None:
+    """Check on every node that a streamed completion matches the
+    non-streamed one, singly and in two concurrent bursts, then run the
+    balancing check on each node."""
+    prompt = "What is an LLM?"
+    num_requests_per_node = 25  # Total 50 requests across 2 nodes
+    # Both requests of a pair share these arguments (temperature 0.0) so
+    # that their outputs can be compared for equality.
+    request_kwargs = dict(model=model_name,
+                          prompt=prompt,
+                          max_tokens=5,
+                          temperature=0.0)
+    async def make_streaming_request(client: openai.AsyncOpenAI):
+        """Compare one streamed completion against its non-streamed twin."""
+        # Perform a non-streaming request to get the expected full output
+        reference = await client.completions.create(**request_kwargs)
+        single_output = reference.choices[0].text
+        # Perform the streaming request and drain it completely
+        stream = await client.completions.create(stream=True,
+                                                 **request_kwargs)
+        received = [chunk async for chunk in stream]
+        finish_reason_count = sum(
+            1 for chunk in received
+            if chunk.choices[0].finish_reason is not None)
+        last_chunk = received[-1] if received else None
+        # finish reason should only return in the last block for OpenAI API
+        assert finish_reason_count == 1, (
+            "Finish reason should appear exactly once.")
+        assert last_chunk is not None, (
+            "Stream should have yielded at least one chunk.")
+        assert last_chunk.choices[
+            0].finish_reason == "length", "Finish reason should be 'length'."
+        # Check that the combined text matches the non-streamed version.
+        streamed_text = "".join(chunk.choices[0].text for chunk in received)
+        assert streamed_text == single_output, (
+            "Streamed output should match non-streamed output.")
+        return True  # Indicate success for this request
+    async def run_burst():
+        """Fire num_requests_per_node concurrent request pairs per node."""
+        pending = [
+            make_streaming_request(node_client) for node_client in clients
+            for _ in range(num_requests_per_node)
+        ]
+        outcomes = await asyncio.gather(*pending)
+        assert len(outcomes) == num_requests_per_node * len(clients)
+        assert all(
+            outcomes), "Not all streaming requests completed successfully."
+    # Test single request to each node
+    for node_idx, node_client in enumerate(clients):
+        single_result = await make_streaming_request(node_client)
+        assert single_result is not None
+        print(f"Hybrid LB node {node_idx} handled single streaming "
+              f"request successfully")
+    await asyncio.sleep(0.5)
+    # Send streaming requests to all nodes
+    await run_burst()
+    await asyncio.sleep(0.5)
+    # Second burst of streaming requests
+    await run_burst()
+    # Recover the API server count from the first node's CLI args for the
+    # summary line, falling back to 1 when the flag is absent.
+    first_node_args = servers[0][1]
+    count_flag = '--api-server-count'
+    api_server_count = 1
+    if count_flag in first_node_args:
+        flag_value = first_node_args[first_node_args.index(count_flag) + 1]
+        api_server_count = flag_value or 1
+    print(f"Successfully completed hybrid LB streaming test with "
+          f"{len(clients)} nodes ({DP_SIZE_LOCAL} DP ranks each, "
+          f"API server count: {api_server_count})")
+    # Check request balancing within each node
+    for node_idx, node_entry in enumerate(servers):
+        print(f"Checking streaming request balancing for node {node_idx}")
+        check_request_balancing(node_entry[0], DP_SIZE_LOCAL)
--- a/tests/v1/test_internal_lb_dp.py
+++ b/tests/v1/test_internal_lb_dp.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import asyncio
+import os
+import threading
+import time
+import openai  # use the official client for correctness check
+import pytest
+import pytest_asyncio
+from tests.utils import RemoteOpenAIServer
+from tests.v1.test_utils import check_request_balancing
+from vllm.platforms import Platform
+# Model served by every server started in this module.
+MODEL_NAME = "ibm-research/PowerMoE-3b"
+# Number of nodes to simulate
+NUM_NODES = 2
+# Number of data parallel ranks for multi-node internal LB testing
+# (overridable through the DP_SIZE environment variable)
+DP_SIZE = int(os.environ.get("DP_SIZE", "2"))
+# Default tensor parallel size to use (overridable through TP_SIZE)
+TP_SIZE = int(os.environ.get("TP_SIZE", "1"))
+class MultinodeInternalLBServerManager:
+    """Manages multi-node data parallel vLLM server instances for internal
+    load balancer testing using --headless mode.
+    Entering the context starts one RemoteOpenAIServer per node, each from
+    its own thread, and returns (server, server_args) tuples ordered by the
+    node's first DP rank, so index 0 is always the head node. Leaving the
+    context stops every server that was started."""
+    def __init__(self,
+                 model_name: str,
+                 dp_size: int,
+                 api_server_count: int,
+                 base_server_args: list,
+                 dp_per_node: int = 1,
+                 tp_size: int = TP_SIZE):
+        """Store the topology to launch.
+        Args:
+            model_name: Model passed to every RemoteOpenAIServer.
+            dp_size: Total number of data parallel ranks.
+            api_server_count: Value passed as --api-server-count to the head
+                node.
+            base_server_args: CLI args shared by all nodes (copied per node).
+            dp_per_node: DP ranks hosted by each node.
+            tp_size: Tensor parallel size of each DP rank.
+        """
+        self.model_name = model_name
+        self.dp_size = dp_size
+        self.dp_per_node = dp_per_node
+        self.tp_size = tp_size
+        self.api_server_count = api_server_count
+        self.base_server_args = base_server_args
+        self.servers: list[tuple[RemoteOpenAIServer, list[str]]] = []
+        self.server_threads: list[threading.Thread] = []
+    def __enter__(self) -> list[tuple[RemoteOpenAIServer, list[str]]]:
+        """Start all server instances for multi-node internal LB mode.
+        Returns:
+            The (server, server_args) tuples; index 0 is the head node.
+        Raises:
+            Exception: If any node failed to start. Nodes that did start are
+                stopped first and the first failure is chained as the cause.
+        """
+        # Results are keyed by start rank because the starter threads finish
+        # in arbitrary order; callers rely on servers[0] being the head node.
+        started: dict[int, tuple[RemoteOpenAIServer, list[str]]] = {}
+        failures: dict[int, Exception] = {}
+        for rank in range(0, self.dp_size, self.dp_per_node):
+            # Create server args for this specific rank
+            server_args = self.base_server_args.copy()
+            if rank == 0:
+                # Head node - runs API server and first DP rank
+                server_args.extend([
+                    "--data-parallel-size",
+                    str(self.dp_size),
+                    "--data-parallel-size-local",
+                    str(self.dp_per_node),
+                    "--tensor-parallel-size",
+                    str(self.tp_size),
+                    "--port",
+                    "8000",  # Single endpoint for all requests
+                    "--api-server-count",
+                    str(self.api_server_count),
+                    "--data-parallel-address",
+                    "127.0.0.1",
+                    "--data-parallel-rpc-port",
+                    "13345",
+                ])
+            else:
+                # Secondary nodes - run in headless mode
+                server_args.extend([
+                    "--headless",
+                    "--data-parallel-size",
+                    str(self.dp_size),
+                    "--data-parallel-size-local",
+                    str(self.dp_per_node),
+                    "--data-parallel-start-rank",
+                    str(rank),
+                    "--tensor-parallel-size",
+                    str(self.tp_size),
+                    "--data-parallel-address",
+                    "127.0.0.1",
+                    "--data-parallel-rpc-port",
+                    "13345",
+                ])
+            # Use a thread to start each server to allow parallel initialization
+            def start_server(r: int, sargs: list[str]):
+                gpus_per_node = self.tp_size * self.dp_per_node
+                # Every DP rank occupies tp_size GPUs, so the node whose
+                # first rank is r starts at GPU r * tp_size. Starting at r
+                # would make neighbouring nodes overlap when tp_size > 1.
+                gpu_start = r * self.tp_size
+                gpu_end = gpu_start + gpus_per_node
+                try:
+                    # Start the server
+                    server = RemoteOpenAIServer(
+                        self.model_name,
+                        sargs,
+                        auto_port=False,
+                        env_dict={
+                            "CUDA_VISIBLE_DEVICES":
+                            ",".join(
+                                str(Platform.device_id_to_physical_device_id(
+                                    i)) for i in range(gpu_start, gpu_end))
+                        })
+                    server.__enter__()
+                    if r == 0:
+                        print(
+                            f"Head node (rank {r}) started successfully with "
+                            f"{self.api_server_count} API servers")
+                    else:
+                        print(f"Headless node (rank {r}) started successfully")
+                    started[r] = (server, sargs)
+                except Exception as e:
+                    print(f"Failed to start server rank {r}: {e}")
+                    # Re-raising inside the thread would never reach the
+                    # caller of __enter__; hand the error over instead.
+                    failures[r] = e
+            thread = threading.Thread(target=start_server,
+                                      args=(rank, server_args))
+            thread.start()
+            self.server_threads.append(thread)
+        # Wait for all servers to start
+        for thread in self.server_threads:
+            thread.join()
+        # Publish in rank order so that servers[0] really is the head node.
+        self.servers.extend(started[r] for r in sorted(started))
+        if len(self.servers) != self.dp_size // self.dp_per_node:
+            first_error = failures[min(failures)] if failures else None
+            # __exit__ is not invoked when __enter__ raises, so the servers
+            # that did come up must be stopped here to avoid leaking them.
+            self.__exit__(None, None, None)
+            raise Exception("Servers failed to start") from first_error
+        # Give servers additional time to fully initialize and coordinate
+        time.sleep(3)
+        return self.servers
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Stop all server instances.
+        Servers are stopped in reverse order; a failure to stop one server is
+        printed and does not prevent the remaining ones from being stopped."""
+        while self.servers:
+            try:
+                self.servers.pop()[0].__exit__(exc_type, exc_val, exc_tb)
+            except Exception as e:
+                print(f"Error stopping server: {e}")
+class APIOnlyServerManager:
+    """Manages API-only server (Node 0) and headless engines server (Node 1)
+    for testing separated API server and engine configuration.
+    Entering the context starts both servers from separate threads and
+    returns [(api_server, args), (engines_server, args)] in that fixed
+    order. Leaving the context stops every server that was started."""
+    def __init__(self,
+                 model_name: str,
+                 dp_size: int,
+                 api_server_count: int,
+                 base_server_args: list,
+                 tp_size: int = TP_SIZE):
+        """Store the topology to launch.
+        Args:
+            model_name: Model passed to both RemoteOpenAIServer instances.
+            dp_size: Total number of data parallel ranks, all of which are
+                placed on the headless engines server.
+            api_server_count: Value passed as --api-server-count to the
+                API-only server.
+            base_server_args: CLI args shared by both servers (copied).
+            tp_size: Tensor parallel size of each DP rank.
+        """
+        self.model_name = model_name
+        self.dp_size = dp_size
+        self.tp_size = tp_size
+        self.api_server_count = api_server_count
+        self.base_server_args = base_server_args
+        self.servers: list[tuple[RemoteOpenAIServer, list[str]]] = []
+        self.server_threads: list[threading.Thread] = []
+    def __enter__(self) -> list[tuple[RemoteOpenAIServer, list[str]]]:
+        """Start API-only server and headless engines server.
+        Returns:
+            A two-element list: the API-only server first, the headless
+            engines server second.
+        Raises:
+            Exception: If either server failed to start. A server that did
+                start is stopped first and the first failure is chained as
+                the cause.
+        """
+        # Slot 0 is the API-only server, slot 1 the engines server. Keying
+        # the results keeps that order regardless of which thread finishes
+        # first; callers rely on servers[0] being the API server.
+        api_slot, engines_slot = 0, 1
+        started: dict[int, tuple[RemoteOpenAIServer, list[str]]] = {}
+        failures: dict[int, Exception] = {}
+        # Start API-only server (Node 0) - no engines, only API server
+        api_server_args = self.base_server_args.copy()
+        api_server_args.extend([
+            "--data-parallel-size",
+            str(self.dp_size),
+            "--data-parallel-size-local",
+            "0",  # No engines on this node
+            "--tensor-parallel-size",
+            str(self.tp_size),
+            "--port",
+            "8000",
+            "--api-server-count",
+            str(self.api_server_count),
+            "--data-parallel-address",
+            "127.0.0.1",
+            "--data-parallel-rpc-port",
+            "13345",
+        ])
+        # Start headless engines server (Node 1) - all engines, no API server
+        engines_server_args = self.base_server_args.copy()
+        engines_server_args.extend([
+            "--headless",
+            "--data-parallel-size",
+            str(self.dp_size),
+            "--data-parallel-size-local",
+            str(self.dp_size),  # All engines on this node
+            "--tensor-parallel-size",
+            str(self.tp_size),
+            "--data-parallel-address",
+            "127.0.0.1",
+            "--data-parallel-rpc-port",
+            "13345",
+        ])
+        # Use threads to start both servers in parallel
+        def start_api_server():
+            try:
+                server = RemoteOpenAIServer(
+                    self.model_name,
+                    api_server_args,
+                    auto_port=False,
+                    env_dict={})  # No GPUs needed for API-only server
+                server.__enter__()
+                print(f"API-only server started successfully with "
+                      f"{self.api_server_count} API servers")
+                started[api_slot] = (server, api_server_args)
+            except Exception as e:
+                print(f"Failed to start API-only server: {e}")
+                # Re-raising inside the thread would never reach the caller
+                # of __enter__; hand the error over instead.
+                failures[api_slot] = e
+        def start_engines_server():
+            try:
+                server = RemoteOpenAIServer(
+                    self.model_name,
+                    engines_server_args,
+                    auto_port=False,
+                    env_dict={
+                        "CUDA_VISIBLE_DEVICES":
+                        ",".join(
+                            str(Platform.device_id_to_physical_device_id(i))
+                            for i in range(self.dp_size * self.tp_size))
+                    })
+                server.__enter__()
+                print(f"Headless engines server started successfully with "
+                      f"{self.dp_size} engines")
+                started[engines_slot] = (server, engines_server_args)
+            except Exception as e:
+                print(f"Failed to start headless engines server: {e}")
+                failures[engines_slot] = e
+        # Start API server first
+        api_thread = threading.Thread(target=start_api_server)
+        api_thread.start()
+        self.server_threads.append(api_thread)
+        # Start engines server second
+        engines_thread = threading.Thread(target=start_engines_server)
+        engines_thread.start()
+        self.server_threads.append(engines_thread)
+        # Wait for both servers to start
+        for thread in self.server_threads:
+            thread.join()
+        self.servers.extend(started[slot] for slot in sorted(started))
+        if len(self.servers) != 2:
+            first_error = failures[min(failures)] if failures else None
+            # __exit__ is not invoked when __enter__ raises, so a server
+            # that did come up must be stopped here to avoid leaking it.
+            self.__exit__(None, None, None)
+            raise Exception("Both servers failed to start") from first_error
+        # Give servers additional time to fully initialize and coordinate
+        time.sleep(3)
+        return self.servers
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Stop both server instances.
+        Servers are stopped in reverse order; a failure to stop one server is
+        printed and does not prevent the other one from being stopped."""
+        while self.servers:
+            try:
+                self.servers.pop()[0].__exit__(exc_type, exc_val, exc_tb)
+            except Exception as e:
+                print(f"Error stopping server: {e}")
+@pytest.fixture(scope="module")
+def default_server_args():
+    """Return the CLI arguments shared by every server in this module."""
+    shared_args = []
+    # use half precision for speed and memory savings in CI environment
+    shared_args += ["--dtype", "bfloat16"]
+    shared_args += ["--max-model-len", "2048"]
+    shared_args += ["--max-num-seqs", "128"]
+    shared_args.append("--enforce-eager")
+    return shared_args
+@pytest.fixture(scope="module", params=[1, 4])
+def servers(request, default_server_args):
+    """Start the head node and headless nodes once per module and API
+    server count, and yield the (server, server_args) tuples."""
+    manager = MultinodeInternalLBServerManager(
+        model_name=MODEL_NAME,
+        dp_size=DP_SIZE,
+        api_server_count=request.param,
+        base_server_args=default_server_args,
+        dp_per_node=DP_SIZE // NUM_NODES,
+        tp_size=TP_SIZE)
+    with manager as server_list:
+        yield server_list
+@pytest.fixture(scope="module", params=[1, 4])
+def api_only_servers(request, default_server_args):
+    """Fixture for API-only server + headless engines configuration."""
+    manager = APIOnlyServerManager(model_name=MODEL_NAME,
+                                   dp_size=DP_SIZE,
+                                   api_server_count=request.param,
+                                   base_server_args=default_server_args,
+                                   tp_size=TP_SIZE)
+    with manager as server_list:
+        yield server_list
+@pytest_asyncio.fixture
+async def client(servers: list[tuple[RemoteOpenAIServer, list[str]]]):
+    """Yield an async client bound to the first server in ``servers``."""
+    # For internal LB, we only connect to the head node (rank 0)
+    # which provides the single API endpoint
+    head_server, _ = servers[0]
+    async with head_server.get_async_client() as async_client:
+        yield async_client
+@pytest_asyncio.fixture
+async def api_only_client(api_only_servers: list[tuple[RemoteOpenAIServer,
+                                                       list[str]]]):
+    """Client fixture for API-only server configuration."""
+    # Connect to the API-only server (first server in the list)
+    api_server, _ = api_only_servers[0]
+    async with api_server.get_async_client() as async_client:
+        yield async_client
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [MODEL_NAME],
+)
+async def test_multinode_dp_completion(client: openai.AsyncOpenAI,
+                                       servers: list[tuple[RemoteOpenAIServer,
+                                                           list[str]]],
+                                       model_name: str) -> None:
+    """Send one request and then two concurrent bursts of completion
+    requests to the head node, then run the balancing check on it."""
+    num_requests = 50
+    async def make_request():
+        """Issue one sampled completion and sanity-check the response."""
+        completion = await client.completions.create(
+            prompt="Hello, my name is",
+            model=model_name,
+            temperature=1.0,
+            max_tokens=10)
+        assert completion.id is not None
+        choices = completion.choices
+        assert choices is not None and len(choices) == 1
+        only_choice = choices[0]
+        # With temperature=1.0 the output varies between runs, so only a
+        # minimum text length is required.
+        assert len(only_choice.text) >= 1
+        # The model may stop early, so both 'length' and 'stop' are accepted.
+        assert only_choice.finish_reason in ("length", "stop")
+        # Token counts vary as well; they only need to be positive.
+        usage = completion.usage
+        assert usage.completion_tokens > 0
+        assert usage.prompt_tokens > 0
+        assert usage.total_tokens > 0
+        return completion
+    async def run_burst():
+        """Fire num_requests concurrent requests at the head node."""
+        pending = [make_request() for _ in range(num_requests)]
+        completions = await asyncio.gather(*pending)
+        assert len(completions) == num_requests
+        assert all(completion is not None for completion in completions)
+    # Test single request
+    single_result = await make_request()
+    assert single_result is not None
+    print(
+        "Multi-node internal LB handled single completion request successfully"
+    )
+    await asyncio.sleep(0.5)
+    # Send multiple requests - internal LB should distribute across DP ranks
+    await run_burst()
+    await asyncio.sleep(0.5)
+    # Second burst of requests
+    await run_burst()
+    # Recover the API server count from the first server's CLI args for the
+    # summary line, falling back to 1 when the flag is absent.
+    head_server, head_args = servers[0]
+    count_flag = '--api-server-count'
+    api_server_count = 1
+    if count_flag in head_args:
+        flag_value = head_args[head_args.index(count_flag) + 1]
+        api_server_count = flag_value or 1
+    print(f"Successfully completed multi-node internal LB test with "
+          f"{len(servers)} DP ranks (API server count: {api_server_count})")
+    # Check request balancing via Prometheus metrics
+    check_request_balancing(head_server, DP_SIZE)
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [MODEL_NAME],
+)
+async def test_multinode_dp_completion_streaming(client: openai.AsyncOpenAI,
+                                                 servers: list[
+                                                     tuple[RemoteOpenAIServer,
+                                                           list[str]]],
+                                                 model_name: str) -> None:
+    """Check that a streamed completion matches the non-streamed one,
+    singly and in two concurrent bursts, then run the balancing check on
+    the head node."""
+    prompt = "What is an LLM?"
+    num_requests = 50
+    # Both requests of a pair share these arguments (temperature 0.0) so
+    # that their outputs can be compared for equality.
+    request_kwargs = dict(model=model_name,
+                          prompt=prompt,
+                          max_tokens=5,
+                          temperature=0.0)
+    async def make_streaming_request():
+        """Compare one streamed completion against its non-streamed twin."""
+        # Perform a non-streaming request to get the expected full output
+        reference = await client.completions.create(**request_kwargs)
+        single_output = reference.choices[0].text
+        # Perform the streaming request and drain it completely
+        stream = await client.completions.create(stream=True,
+                                                 **request_kwargs)
+        received = [chunk async for chunk in stream]
+        finish_reason_count = sum(
+            1 for chunk in received
+            if chunk.choices[0].finish_reason is not None)
+        last_chunk = received[-1] if received else None
+        # finish reason should only return in the last block for OpenAI API
+        assert finish_reason_count == 1, (
+            "Finish reason should appear exactly once.")
+        assert last_chunk is not None, (
+            "Stream should have yielded at least one chunk.")
+        assert last_chunk.choices[
+            0].finish_reason == "length", "Finish reason should be 'length'."
+        # Check that the combined text matches the non-streamed version.
+        streamed_text = "".join(chunk.choices[0].text for chunk in received)
+        assert streamed_text == single_output, (
+            "Streamed output should match non-streamed output.")
+        return True  # Indicate success for this request
+    async def run_burst():
+        """Fire num_requests concurrent request pairs at the head node."""
+        pending = [make_streaming_request() for _ in range(num_requests)]
+        outcomes = await asyncio.gather(*pending)
+        assert len(outcomes) == num_requests
+        assert all(
+            outcomes), "Not all streaming requests completed successfully."
+    # Test single streaming request
+    single_result = await make_streaming_request()
+    assert single_result is not None
+    print(
+        "Multi-node internal LB handled single streaming request successfully")
+    await asyncio.sleep(0.5)
+    # Send multiple streaming requests - internal LB should distribute across
+    # DP ranks
+    await run_burst()
+    await asyncio.sleep(0.5)
+    # Second burst of streaming requests
+    await run_burst()
+    # Recover the API server count from the first server's CLI args for the
+    # summary line, falling back to 1 when the flag is absent.
+    head_server, head_args = servers[0]
+    count_flag = '--api-server-count'
+    api_server_count = 1
+    if count_flag in head_args:
+        flag_value = head_args[head_args.index(count_flag) + 1]
+        api_server_count = flag_value or 1
+    print(f"Successfully completed multi-node internal LB streaming test with "
+          f"{len(servers)} DP ranks (API server count: {api_server_count})")
+    # Check request balancing via Prometheus metrics
+    check_request_balancing(head_server, DP_SIZE)
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [MODEL_NAME],
+)
+async def test_api_only_multinode_dp_completion(
+        api_only_client: openai.AsyncOpenAI,
+        api_only_servers: list[tuple[RemoteOpenAIServer,
+                                     list[str]]], model_name: str) -> None:
+    """Test API-only server with all engines on separate headless server."""
+    async def make_request():
+        completion = await api_only_client.completions.create(
+            model=model_name,
+            prompt="Hello, my name is",
+            max_tokens=10,
+            temperature=1.0)
+        assert completion.id is not None
+        assert completion.choices is not None and len(completion.choices) == 1
+        choice = completion.choices[0]
+        # The exact number of tokens can vary slightly with temperature=1.0,
+        # so we check for a reasonable minimum length.
+        assert len(choice.text) >= 1
+        # Finish reason might not always be 'length' if the model finishes
+        # early or due to other reasons, especially with high temperature.
+        # So, we'll accept 'length' or 'stop'.
+        assert choice.finish_reason in ("length", "stop")
+        # Token counts can also vary, so we check they are positive.
+        assert completion.usage.completion_tokens > 0
+        assert completion.usage.prompt_tokens > 0
+        assert completion.usage.total_tokens > 0
+        return completion
+    # Test single request
+    result = await make_request()
+    assert result is not None
+    print("API-only server handled single completion request successfully")
+    await asyncio.sleep(0.5)
+    # Send multiple requests - should be distributed across engines on
+    # headless server
+    num_requests = 50
+    all_tasks = [make_request() for _ in range(num_requests)]
+    results = await asyncio.gather(*all_tasks)
+    assert len(results) == num_requests
+    assert all(completion is not None for completion in results)
+    await asyncio.sleep(0.5)
+    # Second burst of requests
+    all_tasks = [make_request() for _ in range(num_requests)]
+    results = await asyncio.gather(*all_tasks)
+    assert len(results) == num_requests
+    assert all(completion is not None for completion in results)
+    _, api_server_args = api_only_servers[0]
+    api_server_count = (
+        api_server_args.count('--api-server-count')
+        and api_server_args[api_server_args.index('--api-server-count') + 1]
+        or 1)
+    print(f"Successfully completed API-only multi-node test with {DP_SIZE} "
+          f"engines on headless server (API server count: {api_server_count})")
+    # Check request balancing via Prometheus metrics
+    api_server = api_only_servers[0][0]
+    check_request_balancing(api_server, DP_SIZE)
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [MODEL_NAME],
+)
+async def test_api_only_multinode_dp_completion_streaming(
+        api_only_client: openai.AsyncOpenAI,
+        api_only_servers: list[tuple[RemoteOpenAIServer,
+                                     list[str]]], model_name: str) -> None:
+    """Test API-only server streaming with all engines on separate
+    headless server."""
+    prompt = "What is an LLM?"
+    async def make_streaming_request():
+        # Perform a non-streaming request to get the expected full output
+        single_completion = await api_only_client.completions.create(
+            model=model_name,
+            prompt=prompt,
+            max_tokens=5,
+            temperature=0.0,
+        )
+        single_output = single_completion.choices[0].text
+        # Perform the streaming request
+        stream = await api_only_client.completions.create(model=model_name,
+                                                          prompt=prompt,
+                                                          max_tokens=5,
+                                                          temperature=0.0,
+                                                          stream=True)
+        chunks: list[str] = []
+        finish_reason_count = 0
+        last_chunk = None
+        async for chunk in stream:
+            chunks.append(chunk.choices[0].text)
+            if chunk.choices[0].finish_reason is not None:
+                finish_reason_count += 1
+            last_chunk = chunk  # Keep track of the last chunk
+        # finish reason should only return in the last block for OpenAI API
+        assert finish_reason_count == 1, (
+            "Finish reason should appear exactly once.")
+        assert last_chunk is not None, (
+            "Stream should have yielded at least one chunk.")
+        assert last_chunk.choices[
+            0].finish_reason == "length", "Finish reason should be 'length'."
+        # Check that the combined text matches the non-streamed version.
+        assert "".join(
+            chunks
+        ) == single_output, "Streamed output should match non-streamed output."
+        return True  # Indicate success for this request
+    # Test single streaming request
+    result = await make_streaming_request()
+    assert result is not None
+    print("API-only server handled single streaming request successfully")
+    await asyncio.sleep(0.5)
+    # Send multiple streaming requests - should be distributed across engines
+    num_requests = 50
+    all_tasks = [make_streaming_request() for _ in range(num_requests)]
+    results = await asyncio.gather(*all_tasks)
+    assert len(results) == num_requests
+    assert all(results), "Not all streaming requests completed successfully."
+    await asyncio.sleep(0.5)
+    # Second burst of streaming requests
+    all_tasks = [make_streaming_request() for _ in range(num_requests)]
+    results = await asyncio.gather(*all_tasks)
+    assert len(results) == num_requests
+    assert all(results), "Not all streaming requests completed successfully."
+    _, api_server_args = api_only_servers[0]
+    api_server_count = (
+        api_server_args.count('--api-server-count')
+        and api_server_args[api_server_args.index('--api-server-count') + 1]
+        or 1)
+    print(f"Successfully completed API-only streaming test with {DP_SIZE} "
+          f"engines on headless server (API server count: {api_server_count})")
+    # Check request balancing via Prometheus metrics
+    api_server = api_only_servers[0][0]
+    check_request_balancing(api_server, DP_SIZE)