Merge tag 'v0.18.0' into v0.18.0-ori

3fb4b5fa · zhuwenwen · bcf25339 · 89138b21 · 3fb4b5fa · 3fb4b5fa
Commit 3fb4b5fa authored Mar 23, 2026 by zhuwenwen
8 changed files
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -10,6 +10,12 @@ import pytest
 import pytest_asyncio
 from openai import OpenAI

+from tests.entrypoints.openai.utils import (
+    accumulate_streaming_response,
+    verify_chat_response,
+    verify_harmony_messages,
+)
+from tests.utils import RemoteOpenAIServer
 from vllm._aiter_ops import is_aiter_found_and_supported
 from vllm.config import MultiModalConfig
 from vllm.entrypoints.openai.chat_completion.protocol import (
@@ -21,8 +27,14 @@ from vllm.entrypoints.openai.engine.protocol import (
    ErrorResponse,
    RequestResponseMetadata,
 )
-from vllm.entrypoints.openai.models.serving import BaseModelPath, OpenAIServingModels
+from vllm.entrypoints.openai.models.serving import (
+    BaseModelPath,
+    OpenAIModelRegistry,
+    OpenAIServingModels,
+)
 from vllm.entrypoints.openai.parser.harmony_utils import get_encoding
+from vllm.entrypoints.serve.render.serving import OpenAIServingRender
+from vllm.exceptions import VLLMValidationError
 from vllm.inputs import TokensPrompt
 from vllm.outputs import CompletionOutput, RequestOutput
 from vllm.renderers.hf import HfRenderer
@@ -33,13 +45,6 @@ from vllm.tokenizers.registry import tokenizer_args_from_config
 from vllm.tool_parsers import ToolParserManager
 from vllm.v1.engine.async_llm import AsyncLLM

-from ...utils import RemoteOpenAIServer
-from .utils import (
-    accumulate_streaming_response,
-    verify_chat_response,
-    verify_harmony_messages,
-)
-
 GPT_OSS_MODEL_NAME = "openai/gpt-oss-20b"
 GPT_OSS_SPECULATOR_NAME = "RedHatAI/gpt-oss-20b-speculator.eagle3"

@@ -126,7 +131,7 @@ def gptoss_speculative_server(default_server_args: list[str]):
    if is_aiter_found_and_supported():
        env_dict = {"VLLM_ROCM_USE_AITER": "1"}
    with RemoteOpenAIServer(
-        GPT_OSS_MODEL_NAME, server_args, env_dict=env_dict
+        GPT_OSS_MODEL_NAME, server_args, env_dict=env_dict, max_wait_seconds=480
    ) as remote_server:
        yield remote_server

@@ -520,38 +525,67 @@ class MockModelConfig:
    multimodal_config = MultiModalConfig()
    hf_config = MockHFConfig()
    logits_processors: list[str] | None = None
-    logits_processor_pattern = None
    diff_sampling_param: dict | None = None
    allowed_local_media_path: str = ""
    allowed_media_domains: list[str] | None = None
    encoder_config = None
    generation_config: str = "auto"
+    override_generation_config: dict[str, Any] = field(default_factory=dict)
    media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
    skip_tokenizer_init: bool = False
    is_encoder_decoder: bool = False
+    is_multimodal_model: bool = False

    def get_diff_sampling_param(self):
        return self.diff_sampling_param or {}


+@dataclass
+class MockParallelConfig:
+    _api_process_rank: int = 0
+
+
+@dataclass
+class MockVllmConfig:
+    model_config: MockModelConfig
+    parallel_config: MockParallelConfig
+
+
 def _build_renderer(model_config: MockModelConfig):
    _, tokenizer_name, _, kwargs = tokenizer_args_from_config(model_config)

-    return HfRenderer(
-        model_config,
+    return HfRenderer.from_config(
+        MockVllmConfig(model_config, parallel_config=MockParallelConfig()),
        tokenizer_kwargs={**kwargs, "tokenizer_name": tokenizer_name},
    )


+def _build_serving_render(
+    engine, model_registry: OpenAIModelRegistry
+) -> OpenAIServingRender:
+    return OpenAIServingRender(
+        model_config=engine.model_config,
+        renderer=engine.renderer,
+        io_processor=engine.io_processor,
+        model_registry=model_registry,
+        request_logger=None,
+        chat_template=CHAT_TEMPLATE,
+        chat_template_content_format="auto",
+    )
+
+
 def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
    models = OpenAIServingModels(
        engine_client=engine,
        base_model_paths=BASE_MODEL_PATHS,
    )
+    openai_serving_render = _build_serving_render(engine, models.registry)
+
    serving_chat = OpenAIServingChat(
        engine,
        models,
        response_role="assistant",
+        openai_serving_render=openai_serving_render,
        chat_template=CHAT_TEMPLATE,
        chat_template_content_format="auto",
        request_logger=None,
@@ -572,10 +606,13 @@ async def _async_serving_chat_init():
    engine = MockEngine()

    models = OpenAIServingModels(engine, BASE_MODEL_PATHS)
+    openai_serving_render = _build_serving_render(engine, models.registry)
+
    serving_completion = OpenAIServingChat(
        engine,
        models,
        response_role="assistant",
+        openai_serving_render=openai_serving_render,
        chat_template=CHAT_TEMPLATE,
        chat_template_content_format="auto",
        request_logger=None,
@@ -645,12 +682,10 @@ async def test_serving_chat_should_set_correct_max_tokens():

    assert mock_engine.generate.call_args.args[1].max_tokens == 10

-    # Setting server's max_tokens in the generation_config.json
-    # lower than context_window - prompt_tokens
+    # Model author's generation_config.json sets max_tokens (auto, no override)
+    # — should act as a fallback only, not a ceiling
    mock_model_config = MockModelConfig()
-    mock_model_config.diff_sampling_param = {
-        "max_tokens": 10  # Setting server-side max_tokens limit
-    }
+    mock_model_config.diff_sampling_param = {"max_tokens": 10}

    # Reinitialize the engine with new settings
    mock_engine = MagicMock(spec=AsyncLLM)
@@ -674,13 +709,14 @@ async def test_serving_chat_should_set_correct_max_tokens():

    assert mock_engine.generate.call_args.args[1].max_tokens == 10

-    # Test Case 2: Request's max_tokens set higher than server accepts
+    # Test Case 2: Request's max_tokens is set higher than the generation_config
+    # default, so the request-provided max_tokens takes precedence
    req.max_tokens = 15

    with suppress(Exception):
        await serving_chat.create_chat_completion(req)

-    assert mock_engine.generate.call_args.args[1].max_tokens == 10
+    assert mock_engine.generate.call_args.args[1].max_tokens == 15

    # Test Case 3: Request's max_tokens set lower than server accepts
    req.max_tokens = 5
@@ -690,12 +726,52 @@ async def test_serving_chat_should_set_correct_max_tokens():

    assert mock_engine.generate.call_args.args[1].max_tokens == 5

+    # User explicitly sets max_tokens via --override-generation-config
+    # — should act as a ceiling
+    mock_model_config = MockModelConfig()
+    mock_model_config.diff_sampling_param = {"max_tokens": 10}
+    mock_model_config.override_generation_config = {"max_new_tokens": 10}
+
+    mock_engine = MagicMock(spec=AsyncLLM)
+    mock_engine.errored = False
+    mock_engine.model_config = mock_model_config
+    mock_engine.input_processor = MagicMock()
+    mock_engine.io_processor = MagicMock()
+    mock_engine.renderer = _build_renderer(mock_engine.model_config)
+
+    serving_chat = _build_serving_chat(mock_engine)
+
+    # Test Case 3.1: No max_tokens — uses override as default
+    req = ChatCompletionRequest(
+        model=MODEL_NAME,
+        messages=[{"role": "user", "content": "what is 1+1?"}],
+    )
+
+    with suppress(Exception):
+        await serving_chat.create_chat_completion(req)
+
+    assert mock_engine.generate.call_args.args[1].max_tokens == 10
+
+    # Test Case 3.2: Request max_tokens higher — capped by user ceiling from override
+    req.max_tokens = 15
+
+    with suppress(Exception):
+        await serving_chat.create_chat_completion(req)
+
+    assert mock_engine.generate.call_args.args[1].max_tokens == 10
+
+    # Test Case 3.3: Request max_tokens lower — respected
+    req.max_tokens = 5
+
+    with suppress(Exception):
+        await serving_chat.create_chat_completion(req)
+
+    assert mock_engine.generate.call_args.args[1].max_tokens == 5
+
    # Setting server's max_tokens in the generation_config.json
    # higher than context_window - prompt_tokens
    mock_model_config = MockModelConfig()
-    mock_model_config.diff_sampling_param = {
-        "max_tokens": 200  # Setting server-side max_tokens limit
-    }
+    mock_model_config.diff_sampling_param = {"max_tokens": 200}

    # Reinitialize the engine with new settings
    mock_engine = MagicMock(spec=AsyncLLM)
@@ -749,8 +825,10 @@ async def test_serving_chat_mistral_token_ids_prompt_is_validated():
    mock_engine.io_processor = MagicMock()

    mock_tokenizer = MagicMock(spec=MistralTokenizer)
-    mock_renderer = MistralRenderer(mock_engine.model_config, tokenizer_kwargs={})
-    mock_renderer._tokenizer = mock_tokenizer
+    mock_renderer = MistralRenderer(
+        MockVllmConfig(mock_engine.model_config, parallel_config=MockParallelConfig()),
+        tokenizer=mock_tokenizer,
+    )
    # Force the Mistral chat template renderer to return token IDs.
    # Choose a prompt length that is < max_model_len, but large enough that
    # adding max_tokens should exceed the model context window.
@@ -770,9 +848,8 @@ async def test_serving_chat_mistral_token_ids_prompt_is_validated():
        max_tokens=10,
    )

-    resp = await serving_chat.create_chat_completion(req)
-    assert isinstance(resp, ErrorResponse)
-    assert "context length is only" in resp.error.message
+    with pytest.raises(VLLMValidationError):
+        await serving_chat.create_chat_completion(req)


 @pytest.mark.asyncio
@@ -788,8 +865,10 @@ async def test_serving_chat_mistral_token_ids_prompt_too_long_is_rejected():
    mock_engine.io_processor = MagicMock()

    mock_tokenizer = MagicMock(spec=MistralTokenizer)
-    mock_renderer = MistralRenderer(mock_engine.model_config, tokenizer_kwargs={})
-    mock_renderer._tokenizer = mock_tokenizer
+    mock_renderer = MistralRenderer(
+        MockVllmConfig(mock_engine.model_config, parallel_config=MockParallelConfig()),
+        tokenizer=mock_tokenizer,
+    )
    # prompt_token_ids length == max_model_len should be rejected for
    # completion-like requests (ChatCompletionRequest).
    mock_renderer.render_messages_async = AsyncMock(
@@ -810,9 +889,8 @@ async def test_serving_chat_mistral_token_ids_prompt_too_long_is_rejected():
        max_tokens=1,
    )

-    resp = await serving_chat.create_chat_completion(req)
-    assert isinstance(resp, ErrorResponse)
-    assert "context length is only" in resp.error.message
+    with pytest.raises(VLLMValidationError):
+        await serving_chat.create_chat_completion(req)


 @pytest.mark.asyncio
@@ -1127,7 +1205,9 @@ class TestServingChatWithHarmony:

        # Test the Harmony messages for the first turn's input
        req = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
-        input_messages, _ = serving_chat._make_request_with_harmony(req)
+        input_messages, _ = (
+            serving_chat.openai_serving_render._make_request_with_harmony(req)
+        )
        verify_harmony_messages(
            input_messages,
            [
@@ -1154,7 +1234,9 @@ class TestServingChatWithHarmony:

        # Test the Harmony messages for the second turn's input
        req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
-        input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
+        input_messages_2, _ = (
+            serving_chat.openai_serving_render._make_request_with_harmony(req_2)
+        )
        verify_harmony_messages(
            input_messages_2,
            [
@@ -1175,7 +1257,9 @@ class TestServingChatWithHarmony:

        # Test the Harmony messages for the first turn's input
        req = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
-        input_messages, _ = serving_chat._make_request_with_harmony(req)
+        input_messages, _ = (
+            serving_chat.openai_serving_render._make_request_with_harmony(req)
+        )
        verify_harmony_messages(
            input_messages,
            [
@@ -1219,7 +1303,9 @@ class TestServingChatWithHarmony:

        # Test the Harmony messages for the second turn's input
        req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
-        input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
+        input_messages_2, _ = (
+            serving_chat.openai_serving_render._make_request_with_harmony(req_2)
+        )
        verify_harmony_messages(
            input_messages_2,
            [
@@ -1256,7 +1342,9 @@ class TestServingChatWithHarmony:

        # Test the Harmony messages for the first turn's input
        req = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
-        input_messages, _ = serving_chat._make_request_with_harmony(req)
+        input_messages, _ = (
+            serving_chat.openai_serving_render._make_request_with_harmony(req)
+        )
        verify_harmony_messages(
            input_messages,
            [
@@ -1300,7 +1388,9 @@ class TestServingChatWithHarmony:

        # Test the Harmony messages for the second turn's input
        req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
-        input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
+        input_messages_2, _ = (
+            serving_chat.openai_serving_render._make_request_with_harmony(req_2)
+        )
        verify_harmony_messages(
            input_messages_2,
            [
@@ -1337,7 +1427,9 @@ class TestServingChatWithHarmony:

        # Test the Harmony messages for the first turn's input
        req = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
-        input_messages, _ = serving_chat._make_request_with_harmony(req)
+        input_messages, _ = (
+            serving_chat.openai_serving_render._make_request_with_harmony(req)
+        )
        verify_harmony_messages(
            input_messages,
            [
@@ -1381,7 +1473,9 @@ class TestServingChatWithHarmony:

        # Test the Harmony messages for the second turn's input
        req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
-        input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
+        input_messages_2, _ = (
+            serving_chat.openai_serving_render._make_request_with_harmony(req_2)
+        )
        verify_harmony_messages(
            input_messages_2,
            [
@@ -1431,7 +1525,9 @@ class TestServingChatWithHarmony:

        # Test the Harmony messages for the third turn's input
        req_3 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
-        input_messages_3, _ = serving_chat._make_request_with_harmony(req_3)
+        input_messages_3, _ = (
+            serving_chat.openai_serving_render._make_request_with_harmony(req_3)
+        )
        verify_harmony_messages(
            input_messages_3,
            [
@@ -1494,7 +1590,9 @@ class TestServingChatWithHarmony:

        # Test the Harmony messages for the fourth turn's input
        req_4 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
-        input_messages_4, _ = serving_chat._make_request_with_harmony(req_4)
+        input_messages_4, _ = (
+            serving_chat.openai_serving_render._make_request_with_harmony(req_4)
+        )
        verify_harmony_messages(
            input_messages_4,
            [
@@ -1543,7 +1641,9 @@ class TestServingChatWithHarmony:
            },
        ]
        req = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
-        input_messages, _ = serving_chat._make_request_with_harmony(req)
+        input_messages, _ = (
+            serving_chat.openai_serving_render._make_request_with_harmony(req)
+        )

        verify_harmony_messages(
            input_messages,
@@ -1574,7 +1674,9 @@ class TestServingChatWithHarmony:
            },
        ]
        req = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
-        input_messages, _ = serving_chat._make_request_with_harmony(req)
+        input_messages, _ = (
+            serving_chat.openai_serving_render._make_request_with_harmony(req)
+        )

        verify_harmony_messages(
            input_messages,
@@ -1603,7 +1705,9 @@ class TestServingChatWithHarmony:
            },
        ]
        req = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
-        input_messages, _ = serving_chat._make_request_with_harmony(req)
+        input_messages, _ = (
+            serving_chat.openai_serving_render._make_request_with_harmony(req)
+        )

        verify_harmony_messages(
            input_messages,
@@ -1634,11 +1738,14 @@ async def test_tool_choice_validation_without_parser():
        engine_client=mock_engine,
        base_model_paths=BASE_MODEL_PATHS,
    )
+    openai_serving_render = _build_serving_render(mock_engine, models.registry)
+
    # Create serving_chat without tool_parser (enable_auto_tools=False)
    serving_chat = OpenAIServingChat(
        mock_engine,
        models,
        response_role="assistant",
+        openai_serving_render=openai_serving_render,
        chat_template=CHAT_TEMPLATE,
        chat_template_content_format="auto",
        request_logger=None,

--- a/tests/entrypoints/openai/test_serving_chat_stream_harmony.py
+++ b/tests/entrypoints/openai/test_serving_chat_stream_harmony.py
@@ -180,20 +180,13 @@ class TestExtractHarmonyStreamingDelta:

        assert delta_message.tool_calls[0].index == 1

-    @pytest.mark.parametrize(
-        "channel,recipient",
-        [
-            ("commentary", None),
-            ("commentary", "browser.search"),
-        ],
-    )
-    def test_returns_tool_call_preambles(self, channel, recipient):
-        """Test that invalid tool recipient on commentary is treated as content."""
+    def test_returns_preambles_as_content(self):
+        """Test that commentary with no recipient (preamble) is user content."""
        parser = MockStreamableParser()
        delta_text = "some text"

        token_states = [
-            TokenState(channel=channel, recipient=recipient, text=delta_text)
+            TokenState(channel="commentary", recipient=None, text=delta_text)
        ]

        delta_message, tools_streamed = extract_harmony_streaming_delta(
@@ -211,6 +204,7 @@ class TestExtractHarmonyStreamingDelta:
        [
            (None, None),
            ("unknown_channel", None),
+            ("commentary", "browser.search"),
        ],
    )
    def test_returns_none_for_invalid_inputs(self, channel, recipient):

--- a/vllm/model_executor/layers/quantization/kernels/__init__.py
+++ b/vllm/model_executor/layers/quantization/kernels/__init__.py
--- a/tests/entrypoints/openai/test_render.py
+++ b/tests/entrypoints/openai/test_render.py
@@ -7,7 +7,7 @@ import httpx
 import pytest
 import pytest_asyncio

-from ...utils import RemoteOpenAIServer
+from tests.utils import RemoteLaunchRenderServer

 MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"

@@ -16,7 +16,7 @@ MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
 def server():
    args: list[str] = []

-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+    with RemoteLaunchRenderServer(MODEL_NAME, args) as remote_server:
        yield remote_server


@@ -43,23 +43,20 @@ async def test_completion_render_basic(client):
    assert response.status_code == 200
    data = response.json()

-    # Verify response structure
+    # Verify response structure - list of GenerateRequest
    assert isinstance(data, list)
    assert len(data) > 0

-    # Verify first prompt
+    # Verify first prompt is a GenerateRequest
    first_prompt = data[0]
-    assert "prompt_token_ids" in first_prompt
-    assert "prompt" in first_prompt
-    assert isinstance(first_prompt["prompt_token_ids"], list)
-    assert len(first_prompt["prompt_token_ids"]) > 0
-    assert isinstance(first_prompt["prompt"], str)
-
-    # Verify prompt text is preserved
-    assert (
-        "When should a chat-completions handler return an empty string?"
-        in first_prompt["prompt"]
-    )
+    assert "token_ids" in first_prompt
+    assert "sampling_params" in first_prompt
+    assert "model" in first_prompt
+    assert "request_id" in first_prompt
+    assert isinstance(first_prompt["token_ids"], list)
+    assert len(first_prompt["token_ids"]) > 0
+    assert first_prompt["model"] == MODEL_NAME
+    assert first_prompt["request_id"].startswith("cmpl-")


 @pytest.mark.asyncio
@@ -84,36 +81,15 @@ async def test_chat_completion_render_basic(client):
    assert response.status_code == 200
    data = response.json()

-    # Verify response structure - should be [conversation, engine_prompts]
-    assert isinstance(data, list)
-    assert len(data) == 2
-
-    conversation, engine_prompts = data
-
-    # Verify conversation
-    assert isinstance(conversation, list)
-    assert len(conversation) > 0
-    assert conversation[0]["role"] == "user"
-    assert "empty string" in conversation[0]["content"]
-
-    # Verify engine_prompts
-    assert isinstance(engine_prompts, list)
-    assert len(engine_prompts) > 0
+    # Verify response structure - should be a GenerateRequest
+    assert isinstance(data, dict)
+    assert "token_ids" in data
+    assert isinstance(data["token_ids"], list)
+    assert len(data["token_ids"]) > 0

-    first_prompt = engine_prompts[0]
-    assert "prompt_token_ids" in first_prompt
-    assert "prompt" in first_prompt
-    assert isinstance(first_prompt["prompt_token_ids"], list)
-    assert len(first_prompt["prompt_token_ids"]) > 0
-
-    # Verify chat template was applied (should have instruction markers)
-    assert "[INST]" in first_prompt["prompt"]
-    assert "[/INST]" in first_prompt["prompt"]
-
-    # Verify token IDs are correctly preserved as integers
-    token_ids = first_prompt["prompt_token_ids"]
+    # Verify token IDs are integers and BOS token is present
+    token_ids = data["token_ids"]
    assert all(isinstance(tid, int) for tid in token_ids)
-    # Verify BOS token (usually 1 for LLaMA models)
    assert token_ids[0] == 1


@@ -131,15 +107,18 @@ async def test_completion_render_multiple_prompts(client):
    assert response.status_code == 200
    data = response.json()

-    # Should return two prompts
+    # Should return two GenerateRequest items
    assert isinstance(data, list)
    assert len(data) == 2

-    # Verify both prompts have required fields
+    # Verify both prompts have GenerateRequest fields
    for prompt in data:
-        assert "prompt_token_ids" in prompt
-        assert "prompt" in prompt
-        assert len(prompt["prompt_token_ids"]) > 0
+        assert "token_ids" in prompt
+        assert "sampling_params" in prompt
+        assert "model" in prompt
+        assert "request_id" in prompt
+        assert len(prompt["token_ids"]) > 0
+        assert prompt["request_id"].startswith("cmpl-")


 @pytest.mark.asyncio
@@ -160,17 +139,49 @@ async def test_chat_completion_render_multi_turn(client):
    assert response.status_code == 200
    data = response.json()

-    conversation, engine_prompts = data
+    # Verify tokenization occurred
+    assert isinstance(data, dict)
+    assert "token_ids" in data
+    assert isinstance(data["token_ids"], list)
+    assert len(data["token_ids"]) > 0

-    # Verify all messages preserved
-    assert len(conversation) == 3
-    assert conversation[0]["role"] == "user"
-    assert conversation[1]["role"] == "assistant"
-    assert conversation[2]["role"] == "user"

-    # Verify tokenization occurred
-    assert len(engine_prompts) > 0
-    assert len(engine_prompts[0]["prompt_token_ids"]) > 0
+@pytest.mark.asyncio
+async def test_chat_completion_render_with_stream_true(client):
+    """Render accepts stream params but still returns JSON (non-streamed)."""
+
+    response = await client.post(
+        "/v1/chat/completions/render",
+        json={
+            "model": MODEL_NAME,
+            "stream": True,
+            "stream_options": {
+                "include_usage": True,
+                "continuous_usage_stats": True,
+            },
+            "messages": [
+                {
+                    "role": "user",
+                    "content": "Stream options should be accepted by /render.",
+                }
+            ],
+        },
+    )
+
+    assert response.status_code == 200
+    assert response.headers.get("content-type", "").startswith("application/json")
+
+    data = response.json()
+    assert isinstance(data, dict)
+    assert "token_ids" in data
+    assert isinstance(data["token_ids"], list)
+    assert len(data["token_ids"]) > 0
+
+    # /render should preserve stream fields on the returned token-in request.
+    assert data.get("stream") is True
+    assert isinstance(data.get("stream_options"), dict)
+    assert data["stream_options"].get("include_usage") is True
+    assert data["stream_options"].get("continuous_usage_stats") is True


 @pytest.mark.asyncio
@@ -224,3 +235,31 @@ async def test_completion_render_no_generation(client):
    assert response.status_code == 200
    # Render should be fast (< 1 second) since no generation
    assert elapsed < 1.0
+
+
+@pytest.mark.asyncio
+async def test_chat_completion_render_with_sampling_params(client):
+    """Verify sampling params are correctly returned by /render."""
+    response = await client.post(
+        "/v1/chat/completions/render",
+        json={
+            "model": MODEL_NAME,
+            "messages": [{"role": "user", "content": "Test sampling params"}],
+            "temperature": 0.123,
+            "top_p": 0.456,
+            "frequency_penalty": 1.1,
+        },
+    )
+
+    assert response.status_code == 200
+    data = response.json()
+
+    assert "sampling_params" in data
+    sampling_params = data["sampling_params"]
+
+    assert sampling_params.get("temperature") == 0.123
+    assert sampling_params.get("top_p") == 0.456
+    assert sampling_params.get("frequency_penalty") == 1.1
+
+    # Check that internal fields are not present
+    assert "_all_stop_token_ids" not in sampling_params
--- a/tests/entrypoints/openai/cpu/test_render_multimodal.py
+++ b/tests/entrypoints/openai/cpu/test_render_multimodal.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""Multimodal tests for the /render endpoints that expose prompt preprocessing."""
+
+import httpx
+import pytest
+import pytest_asyncio
+
+from tests.utils import RemoteOpenAIServer
+from vllm.multimodal.utils import encode_image_url
+
+VISION_MODEL_NAME = "Qwen/Qwen3-VL-2B-Instruct"
+
+
+@pytest.fixture(scope="module")
+def vision_server():
+    """Vision-capable server used for multimodal /render tests."""
+
+    args = [
+        "--enforce-eager",
+        "--max-model-len",
+        "100",
+        "--max-num-seqs",
+        "1",
+        "--limit-mm-per-prompt.image",
+        "1",
+        "--limit-mm-per-prompt.video",
+        "0",
+    ]
+
+    env_overrides: dict[str, str] = {}
+
+    with RemoteOpenAIServer(
+        VISION_MODEL_NAME,
+        args,
+        env_dict=env_overrides,
+    ) as remote_server:
+        yield remote_server
+
+
+@pytest_asyncio.fixture
+async def vision_client(vision_server):
+    async with httpx.AsyncClient(
+        base_url=vision_server.url_for(""), timeout=60.0
+    ) as http_client:
+        yield http_client
+
+
+@pytest.mark.asyncio
+async def test_chat_completion_render_with_base64_image_url(
+    vision_client,
+    local_asset_server,
+):
+    """Render a multimodal chat request and verify tokens are returned."""
+
+    image = local_asset_server.get_image_asset("RGBA_comp.png")
+    data_url = encode_image_url(image, format="PNG")
+
+    assert data_url.startswith("data:image/")
+    assert ";base64," in data_url
+
+    response = await vision_client.post(
+        "/v1/chat/completions/render",
+        json={
+            "model": VISION_MODEL_NAME,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "image_url", "image_url": {"url": data_url}},
+                        {"type": "text", "text": "What's in this image?"},
+                    ],
+                }
+            ],
+        },
+    )
+
+    assert response.status_code == 200
+
+    data = response.json()
+    assert isinstance(data, dict)
+    assert "token_ids" in data
+    assert isinstance(data["token_ids"], list)
+    assert len(data["token_ids"]) > 0
+
+    # Verify multimodal features are populated
+    assert "features" in data
+    features = data["features"]
+    assert features is not None
+
+    # mm_hashes: should have an "image" key with a list of hash strings
+    assert "mm_hashes" in features
+    assert "image" in features["mm_hashes"]
+    image_hashes = features["mm_hashes"]["image"]
+    assert isinstance(image_hashes, list)
+    assert len(image_hashes) > 0
+    assert all(isinstance(h, str) for h in image_hashes)
+
+    # mm_placeholders: should have an "image" key with offset/length dicts
+    assert "mm_placeholders" in features
+    assert "image" in features["mm_placeholders"]
+    image_placeholders = features["mm_placeholders"]["image"]
+    assert isinstance(image_placeholders, list)
+    assert len(image_placeholders) > 0
+    for p in image_placeholders:
+        assert "offset" in p
+        assert "length" in p
+        assert isinstance(p["offset"], int)
+        assert isinstance(p["length"], int)
+        assert p["length"] > 0
+
+
+@pytest.mark.asyncio
+async def test_tokenize_matches_render_for_multimodal_input(
+    vision_client,
+    local_asset_server,
+):
+    """`/tokenize` should match `/v1/chat/completions/render` token output."""
+
+    image = local_asset_server.get_image_asset("RGBA_comp.png")
+    data_url = encode_image_url(image, format="PNG")
+
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image_url", "image_url": {"url": data_url}},
+                {"type": "text", "text": "What's in this image?"},
+            ],
+        }
+    ]
+
+    render_response = await vision_client.post(
+        "/v1/chat/completions/render",
+        json={
+            "model": VISION_MODEL_NAME,
+            "messages": messages,
+        },
+    )
+    assert render_response.status_code == 200
+    render_data = render_response.json()
+
+    tokenize_response = await vision_client.post(
+        "/tokenize",
+        json={
+            "model": VISION_MODEL_NAME,
+            "messages": messages,
+        },
+    )
+    assert tokenize_response.status_code == 200
+    tokenize_data = tokenize_response.json()
+
+    assert tokenize_data["tokens"] == render_data["token_ids"]
+    assert tokenize_data["count"] == len(render_data["token_ids"])
--- a/tests/entrypoints/openai/parser/test_harmony_utils.py
+++ b/tests/entrypoints/openai/parser/test_harmony_utils.py
@@ -2,31 +2,32 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import pytest
-from openai.types.responses import ResponseFunctionToolCall, ResponseReasoningItem
-from openai.types.responses.response_output_item import McpCall
-from openai_harmony import Author, Message, Role, TextContent
+from openai_harmony import Message, Role

 from tests.entrypoints.openai.utils import verify_harmony_messages
 from vllm.entrypoints.openai.parser.harmony_utils import (
    auto_drop_analysis_messages,
    get_encoding,
+    get_system_message,
    has_custom_tools,
    parse_chat_input_to_harmony_message,
    parse_chat_output,
-    parse_input_to_harmony_message,
-    parse_output_message,
+)
+from vllm.entrypoints.openai.responses.harmony import (
+    response_input_to_harmony,
+    response_previous_input_to_harmony,
 )


 class TestCommonParseInputToHarmonyMessage:
    """
    Tests for scenarios that are common to both Chat Completion
-    parse_chat_input_to_harmony_message and Responsees API
-    parse_input_to_harmony_message functions.
+    parse_chat_input_to_harmony_message and Responses API
+    response_previous_input_to_harmony functions.
    """

    @pytest.fixture(
-        params=[parse_chat_input_to_harmony_message, parse_input_to_harmony_message]
+        params=[parse_chat_input_to_harmony_message, response_previous_input_to_harmony]
    )
    def parse_function(self, request):
        return request.param
@@ -211,81 +212,6 @@ class TestCommonParseInputToHarmonyMessage:
        assert messages[0].content[1].text == "actual text"


-class TestParseInputToHarmonyMessage:
-    """
-    Tests for scenarios that are specific to the Responses API
-    parse_input_to_harmony_message function.
-    """
-
-    def test_message_with_empty_content(self):
-        """Test parsing message with empty string content."""
-        chat_msg = {
-            "role": "user",
-            "content": "",
-        }
-
-        messages = parse_input_to_harmony_message(chat_msg)
-
-        assert len(messages) == 1
-        assert messages[0].content[0].text == ""
-
-    def test_tool_message_with_string_content(self):
-        """Test parsing tool message with string content."""
-        chat_msg = {
-            "role": "tool",
-            "name": "get_weather",
-            "content": "The weather in San Francisco is sunny, 72°F",
-        }
-
-        messages = parse_input_to_harmony_message(chat_msg)
-
-        assert len(messages) == 1
-        assert messages[0].author.role == Role.TOOL
-        assert messages[0].author.name == "functions.get_weather"
-        assert (
-            messages[0].content[0].text == "The weather in San Francisco is sunny, 72°F"
-        )
-        assert messages[0].channel == "commentary"
-
-    def test_tool_message_with_array_content(self):
-        """Test parsing tool message with array content."""
-        chat_msg = {
-            "role": "tool",
-            "name": "search_results",
-            "content": [
-                {"type": "text", "text": "Result 1: "},
-                {"type": "text", "text": "Result 2: "},
-                {
-                    "type": "image",
-                    "url": "http://example.com/img.png",
-                },  # Should be ignored
-                {"type": "text", "text": "Result 3"},
-            ],
-        }
-
-        messages = parse_input_to_harmony_message(chat_msg)
-
-        assert len(messages) == 1
-        assert messages[0].author.role == Role.TOOL
-        assert messages[0].author.name == "functions.search_results"
-        assert messages[0].content[0].text == "Result 1: Result 2: Result 3"
-
-    def test_tool_message_with_empty_content(self):
-        """Test parsing tool message with None content."""
-        chat_msg = {
-            "role": "tool",
-            "name": "empty_tool",
-            "content": None,
-        }
-
-        messages = parse_input_to_harmony_message(chat_msg)
-
-        assert len(messages) == 1
-        assert messages[0].author.role == Role.TOOL
-        assert messages[0].author.name == "functions.empty_tool"
-        assert messages[0].content[0].text == ""
-
-
 class TestParseChatInputToHarmonyMessage:
    """
    Tests for scenarios that are specific to the Chat Completion API
@@ -840,192 +766,47 @@ class TestParseChatOutput:
        assert reasoning == "I've thought hard about this."
        assert final_content == "The answer is 4."

+    def test_parse_chat_output_commentary_with_recipient_excluded(self) -> None:
+        """Commentary with a recipient (tool call) should not appear in
+        final_content — those are handled separately by the tool parser.

-class TestParseOutputMessage:
-    """Tests for parse_output_message function."""
-
-    def test_commentary_with_no_recipient_creates_reasoning(self):
-        """Test that commentary with recipient=None (preambles) creates reasoning items.
-
-        Per Harmony format, commentary channel can contain preambles to calling
-        multiple functions - explanatory text with no recipient.
+        The first message is a preamble (visible), the second is a tool
+        call (excluded). Only the preamble should appear in final_content.
        """
-        message = Message.from_role_and_content(
-            Role.ASSISTANT, "I will now search for the weather information."
-        )
-        message = message.with_channel("commentary")
-        # recipient is None by default, representing a preamble
-
-        output_items = parse_output_message(message)
-
-        assert len(output_items) == 1
-        assert isinstance(output_items[0], ResponseReasoningItem)
-        assert output_items[0].type == "reasoning"
-        assert (
-            output_items[0].content[0].text
-            == "I will now search for the weather information."
-        )
-        assert output_items[0].content[0].type == "reasoning_text"
-
-    def test_commentary_with_function_recipient_creates_function_call(self):
-        """Test commentary with recipient='functions.X' creates function calls."""
-        message = Message.from_role_and_content(
-            Role.ASSISTANT, '{"location": "San Francisco", "units": "celsius"}'
-        )
-        message = message.with_channel("commentary")
-        message = message.with_recipient("functions.get_weather")
-
-        output_items = parse_output_message(message)
-
-        assert len(output_items) == 1
-        assert isinstance(output_items[0], ResponseFunctionToolCall)
-        assert output_items[0].type == "function_call"
-        assert output_items[0].name == "get_weather"
-        assert (
-            output_items[0].arguments
-            == '{"location": "San Francisco", "units": "celsius"}'
-        )
-        assert output_items[0].call_id.startswith("call_")
-        assert output_items[0].id.startswith("fc_")
-
-    def test_commentary_with_python_recipient_creates_reasoning(self):
-        """Test that commentary with recipient='python' creates reasoning items."""
-        message = Message.from_role_and_content(
-            Role.ASSISTANT, "import numpy as np\nprint(np.array([1, 2, 3]))"
-        )
-        message = message.with_channel("commentary")
-        message = message.with_recipient("python")
-
-        output_items = parse_output_message(message)
-
-        assert len(output_items) == 1
-        assert isinstance(output_items[0], ResponseReasoningItem)
-        assert output_items[0].type == "reasoning"
-        assert (
-            output_items[0].content[0].text
-            == "import numpy as np\nprint(np.array([1, 2, 3]))"
-        )
-
-    def test_commentary_with_browser_recipient_creates_reasoning(self):
-        """Test that commentary with recipient='browser' creates reasoning items."""
-        message = Message.from_role_and_content(
-            Role.ASSISTANT, "Navigating to the specified URL"
-        )
-        message = message.with_channel("commentary")
-        message = message.with_recipient("browser")
-
-        output_items = parse_output_message(message)
-
-        assert len(output_items) == 1
-        assert isinstance(output_items[0], ResponseReasoningItem)
-        assert output_items[0].type == "reasoning"
-        assert output_items[0].content[0].text == "Navigating to the specified URL"
-
-    def test_commentary_with_container_recipient_creates_reasoning(self):
-        """Test that commentary with recipient='container' creates reasoning items."""
-        message = Message.from_role_and_content(
-            Role.ASSISTANT, "Running command in container"
-        )
-        message = message.with_channel("commentary")
-        message = message.with_recipient("container")
-
-        output_items = parse_output_message(message)
-
-        assert len(output_items) == 1
-        assert isinstance(output_items[0], ResponseReasoningItem)
-        assert output_items[0].type == "reasoning"
-        assert output_items[0].content[0].text == "Running command in container"
-
-    def test_commentary_with_empty_content_and_no_recipient(self):
-        """Test edge case: empty commentary with recipient=None."""
-        message = Message.from_role_and_content(Role.ASSISTANT, "")
-        message = message.with_channel("commentary")
-
-        output_items = parse_output_message(message)
-
-        assert len(output_items) == 1
-        assert isinstance(output_items[0], ResponseReasoningItem)
-        assert output_items[0].content[0].text == ""
-
-    def test_commentary_with_multiple_contents_and_no_recipient(self):
-        """Test multiple content items in commentary with no recipient."""
-        contents = [
-            TextContent(text="Step 1: Analyze the request"),
-            TextContent(text="Step 2: Prepare to call functions"),
-        ]
-        message = Message.from_role_and_contents(Role.ASSISTANT, contents)
-        message = message.with_channel("commentary")
-
-        output_items = parse_output_message(message)
-
-        assert len(output_items) == 2
-        assert all(isinstance(item, ResponseReasoningItem) for item in output_items)
-        assert output_items[0].content[0].text == "Step 1: Analyze the request"
-        assert output_items[1].content[0].text == "Step 2: Prepare to call functions"
-
-    def test_commentary_with_multiple_function_calls(self):
-        """Test multiple function calls in commentary channel."""
-        contents = [
-            TextContent(text='{"location": "San Francisco"}'),
-            TextContent(text='{"location": "New York"}'),
-        ]
-        message = Message.from_role_and_contents(Role.ASSISTANT, contents)
-        message = message.with_channel("commentary")
-        message = message.with_recipient("functions.get_weather")
-
-        output_items = parse_output_message(message)
-
-        assert len(output_items) == 2
-        assert all(isinstance(item, ResponseFunctionToolCall) for item in output_items)
-        assert output_items[0].name == "get_weather"
-        assert output_items[1].name == "get_weather"
-        assert output_items[0].arguments == '{"location": "San Francisco"}'
-        assert output_items[1].arguments == '{"location": "New York"}'
-
-    def test_commentary_with_unknown_recipient_creates_mcp_call(self):
-        """Test that commentary with unknown recipient creates MCP call."""
-        message = Message.from_role_and_content(Role.ASSISTANT, '{"arg": "value"}')
-        message = message.with_channel("commentary")
-        message = message.with_recipient("custom_tool")
-
-        output_items = parse_output_message(message)
-
-        assert len(output_items) == 1
-        assert isinstance(output_items[0], McpCall)
-        assert output_items[0].type == "mcp_call"
-        assert output_items[0].name == "custom_tool"
-        assert output_items[0].server_label == "custom_tool"
-
-    def test_analysis_channel_creates_reasoning(self):
-        """Test that analysis channel creates reasoning items."""
-        message = Message.from_role_and_content(
-            Role.ASSISTANT, "Analyzing the problem step by step..."
-        )
-        message = message.with_channel("analysis")
-
-        output_items = parse_output_message(message)
-
-        assert len(output_items) == 1
-        assert isinstance(output_items[0], ResponseReasoningItem)
-        assert output_items[0].type == "reasoning"
-        assert (
-            output_items[0].content[0].text == "Analyzing the problem step by step..."
+        harmony_str = (
+            "<|channel|>commentary"
+            "<|message|>Let me check the weather.<|end|>"
+            "<|start|>assistant to=functions.get_weather"
+            "<|channel|>commentary"
+            '<|message|>{"location": "SF"}<|end|>'
        )
+        token_ids = get_encoding().encode(harmony_str, allowed_special="all")
+        reasoning, final_content, _ = parse_chat_output(token_ids)
+        assert reasoning is None
+        assert final_content == "Let me check the weather."

-    def test_non_assistant_message_returns_empty(self):
-        """Test that non-assistant messages return empty list.
+    def test_parse_chat_output_interrupted_preamble(self) -> None:
+        """Partial/interrupted preamble (commentary without recipient) should
+        appear in final_content, not reasoning."""
+        harmony_str = "<|channel|>commentary<|message|>I'll search for that"
+        token_ids = get_encoding().encode(harmony_str, allowed_special="all")
+        reasoning, final_content, _ = parse_chat_output(token_ids)
+        assert reasoning is None
+        assert final_content == "I'll search for that"

-        Per the implementation, tool messages to assistant (e.g., search results)
-        are not included in final output to align with OpenAI behavior.
-        """
-        message = Message.from_author_and_content(
-            Author.new(Role.TOOL, "functions.get_weather"),
-            "The weather is sunny, 72°F",
+    def test_parse_chat_output_preamble_then_final(self) -> None:
+        """Preamble followed by a final message should both appear in
+        final_content, joined by newline."""
+        harmony_str = (
+            "<|channel|>commentary"
+            "<|message|>Let me look that up.<|end|>"
+            "<|start|>assistant<|channel|>final"
+            "<|message|>The answer is 42.<|end|>"
        )
-
-        output_items = parse_output_message(message)
-
-        assert len(output_items) == 0
+        token_ids = get_encoding().encode(harmony_str, allowed_special="all")
+        reasoning, final_content, _ = parse_chat_output(token_ids)
+        assert reasoning is None
+        assert final_content == "Let me look that up.\nThe answer is 42."


 def test_has_custom_tools() -> None:
@@ -1037,165 +818,113 @@ def test_has_custom_tools() -> None:
    )


-def test_parse_mcp_call_basic() -> None:
-    """Test that MCP calls are parsed with correct type and server_label."""
-    message = Message.from_role_and_content(Role.ASSISTANT, '{"path": "/tmp"}')
-    message = message.with_recipient("filesystem")
-    message = message.with_channel("commentary")
+class TestGetSystemMessage:
+    """Tests for get_system_message channel configuration."""

-    output_items = parse_output_message(message)
+    def test_commentary_channel_present_without_custom_tools(self) -> None:
+        """With custom tools disabled, "commentary" is still a valid channel."""
+        system_message = get_system_message(with_custom_tools=False)
+        channel_config = system_message.content[0].channel_config
+        assert "commentary" in channel_config.valid_channels

-    assert len(output_items) == 1
-    assert isinstance(output_items[0], McpCall)
-    assert output_items[0].type == "mcp_call"
-    assert output_items[0].name == "filesystem"
-    assert output_items[0].server_label == "filesystem"
-    assert output_items[0].arguments == '{"path": "/tmp"}'
-    assert output_items[0].status == "completed"
+    def test_commentary_channel_present_with_custom_tools(self) -> None:
+        """With custom tools enabled, "commentary" is a valid channel."""
+        system_message = get_system_message(with_custom_tools=True)
+        channel_config = system_message.content[0].channel_config
+        assert "commentary" in channel_config.valid_channels

+    def test_all_standard_channels_present(self) -> None:
+        """Each standard Harmony channel is valid with and without custom tools."""
+        standard_channels = ("analysis", "commentary", "final")
+        for with_tools in (True, False):
+            first_content = get_system_message(with_custom_tools=with_tools).content[0]
+            allowed = first_content.channel_config.valid_channels
+            for channel in standard_channels:
+                assert channel in allowed, (
+                    f"{channel} missing when with_custom_tools={with_tools}"
+                )

-def test_parse_mcp_call_dotted_recipient() -> None:
-    """Test that dotted recipients extract the tool name correctly."""
-    message = Message.from_role_and_content(Role.ASSISTANT, '{"cmd": "ls"}')
-    message = message.with_recipient("repo_browser.list")
-    message = message.with_channel("commentary")

-    output_items = parse_output_message(message)
+class TestResponseInputToHarmonyReasoningItem:
+    """Tests for response_input_to_harmony handling of reasoning input items.

-    assert len(output_items) == 1
-    assert isinstance(output_items[0], McpCall)
-    assert output_items[0].name == "list"
-    assert output_items[0].server_label == "repo_browser"
+    Per the OpenAI spec, ResponseReasoningItem.content is
+    Optional[List[Content]] = None. Clients like langchain-openai may omit
+    this field when constructing multi-turn input from previous responses.

+    Reasoning items with content are converted to Harmony messages on the
+    'analysis' channel. All content items are concatenated. Items without
+    content return None (skipped by the caller).
+    """

-def test_mcp_vs_function_call() -> None:
-    """Test that function calls are not parsed as MCP calls."""
-    func_message = Message.from_role_and_content(Role.ASSISTANT, '{"arg": "value"}')
-    func_message = func_message.with_recipient("functions.my_tool")
-    func_message = func_message.with_channel("commentary")
+    def test_reasoning_with_single_content(self):
+        """Test reasoning item with a single content entry."""
+        item = {
+            "type": "reasoning",
+            "id": "rs_123",
+            "content": [{"type": "reasoning_text", "text": "Thinking step by step"}],
+        }

-    func_items = parse_output_message(func_message)
+        msg = response_input_to_harmony(item, prev_responses=[])

-    assert len(func_items) == 1
-    assert not isinstance(func_items[0], McpCall)
-    assert func_items[0].type == "function_call"
+        assert msg is not None
+        assert msg.author.role == Role.ASSISTANT
+        assert msg.content[0].text == "Thinking step by step"
+        assert msg.channel == "analysis"

+    def test_reasoning_with_multiple_content_items(self):
+        """Multiple content entries are newline-joined into one message."""
+        steps = [
+            "First, let me analyze",
+            "Second, I should consider",
+            "Finally, the answer is",
+        ]
+        item = {
+            "type": "reasoning",
+            "id": "rs_123",
+            "content": [{"type": "reasoning_text", "text": step} for step in steps],
+        }
+
+        msg = response_input_to_harmony(item, prev_responses=[])
+
+        assert msg is not None
+        assert msg.author.role == Role.ASSISTANT
+        # All entries end up in the first content part, one per line.
+        assert msg.content[0].text == "\n".join(steps)
+        assert msg.channel == "analysis"
+
+    def test_reasoning_without_content_returns_none(self):
+        """Test reasoning item without content field returns None."""
+        item = {
+            "type": "reasoning",
+            "id": "rs_123",
+            "summary": [{"type": "summary_text", "text": "Thinking about math"}],
+        }
+
+        msg = response_input_to_harmony(item, prev_responses=[])

-def test_mcp_vs_builtin_tools() -> None:
-    """Test that built-in tools (python, container) are not parsed as MCP calls."""
-    # Test python (built-in tool) - should be reasoning, not MCP
-    python_message = Message.from_role_and_content(Role.ASSISTANT, "print('hello')")
-    python_message = python_message.with_recipient("python")
-    python_message = python_message.with_channel("commentary")
+        assert msg is None

-    python_items = parse_output_message(python_message)
+    def test_reasoning_with_none_content_returns_none(self):
+        """Test reasoning item with content=None returns None."""
+        item = {
+            "type": "reasoning",
+            "id": "rs_123",
+            "content": None,
+            "summary": [{"type": "summary_text", "text": "Thinking about math"}],
+        }

-    assert len(python_items) == 1
-    assert not isinstance(python_items[0], McpCall)
-    assert python_items[0].type == "reasoning"
+        msg = response_input_to_harmony(item, prev_responses=[])
+
+        assert msg is None
+
+    def test_reasoning_with_empty_content_returns_none(self):
+        """Test reasoning item with empty content list returns None."""
+        item = {
+            "type": "reasoning",
+            "id": "rs_123",
+            "content": [],
+        }

+        msg = response_input_to_harmony(item, prev_responses=[])

-def test_parse_remaining_state_commentary_channel() -> None:
-    """Test parse_remaining_state with commentary channel and various recipients."""
-    from unittest.mock import Mock
-
-    from vllm.entrypoints.openai.parser.harmony_utils import parse_remaining_state
-
-    # Test 1: functions.* recipient → should return function tool call
-    parser_func = Mock()
-    parser_func.current_content = '{"arg": "value"}'
-    parser_func.current_role = Role.ASSISTANT
-    parser_func.current_channel = "commentary"
-    parser_func.current_recipient = "functions.my_tool"
-
-    func_items = parse_remaining_state(parser_func)
-
-    assert len(func_items) == 1
-    assert not isinstance(func_items[0], McpCall)
-    assert func_items[0].type == "function_call"
-    assert func_items[0].name == "my_tool"
-    assert func_items[0].status == "in_progress"
-
-    # Test 2: MCP tool (not builtin) → should return MCP call
-    parser_mcp = Mock()
-    parser_mcp.current_content = '{"path": "/tmp"}'
-    parser_mcp.current_role = Role.ASSISTANT
-    parser_mcp.current_channel = "commentary"
-    parser_mcp.current_recipient = "filesystem"
-
-    mcp_items = parse_remaining_state(parser_mcp)
-
-    assert len(mcp_items) == 1
-    assert isinstance(mcp_items[0], McpCall)
-    assert mcp_items[0].type == "mcp_call"
-    assert mcp_items[0].name == "filesystem"
-    assert mcp_items[0].server_label == "filesystem"
-    assert mcp_items[0].status == "in_progress"
-
-    # Test 3: Built-in tool (python)
-    # should NOT return MCP call, falls through to reasoning
-    parser_builtin = Mock()
-    parser_builtin.current_content = "print('hello')"
-    parser_builtin.current_role = Role.ASSISTANT
-    parser_builtin.current_channel = "commentary"
-    parser_builtin.current_recipient = "python"
-
-    builtin_items = parse_remaining_state(parser_builtin)
-
-    # Should fall through to reasoning logic
-    assert len(builtin_items) == 1
-    assert not isinstance(builtin_items[0], McpCall)
-    assert builtin_items[0].type == "reasoning"
-
-
-def test_parse_remaining_state_analysis_channel() -> None:
-    """Test parse_remaining_state with analysis channel and various recipients."""
-    from unittest.mock import Mock
-
-    from vllm.entrypoints.openai.parser.harmony_utils import parse_remaining_state
-
-    # Test 1: functions.* recipient → should return function tool call
-    parser_func = Mock()
-    parser_func.current_content = '{"arg": "value"}'
-    parser_func.current_role = Role.ASSISTANT
-    parser_func.current_channel = "analysis"
-    parser_func.current_recipient = "functions.my_tool"
-
-    func_items = parse_remaining_state(parser_func)
-
-    assert len(func_items) == 1
-    assert not isinstance(func_items[0], McpCall)
-    assert func_items[0].type == "function_call"
-    assert func_items[0].name == "my_tool"
-    assert func_items[0].status == "in_progress"
-
-    # Test 2: MCP tool (not builtin) → should return MCP call
-    parser_mcp = Mock()
-    parser_mcp.current_content = '{"query": "test"}'
-    parser_mcp.current_role = Role.ASSISTANT
-    parser_mcp.current_channel = "analysis"
-    parser_mcp.current_recipient = "database"
-
-    mcp_items = parse_remaining_state(parser_mcp)
-
-    assert len(mcp_items) == 1
-    assert isinstance(mcp_items[0], McpCall)
-    assert mcp_items[0].type == "mcp_call"
-    assert mcp_items[0].name == "database"
-    assert mcp_items[0].server_label == "database"
-    assert mcp_items[0].status == "in_progress"
-
-    # Test 3: Built-in tool (container)
-    # should NOT return MCP call, falls through to reasoning
-    parser_builtin = Mock()
-    parser_builtin.current_content = "docker run"
-    parser_builtin.current_role = Role.ASSISTANT
-    parser_builtin.current_channel = "analysis"
-    parser_builtin.current_recipient = "container"
-
-    builtin_items = parse_remaining_state(parser_builtin)
-
-    # Should fall through to reasoning logic
-    assert len(builtin_items) == 1
-    assert not isinstance(builtin_items[0], McpCall)
-    assert builtin_items[0].type == "reasoning"
+        assert msg is None
--- a/tests/entrypoints/openai/responses/conftest.py
+++ b/tests/entrypoints/openai/responses/conftest.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+
+import json
+import logging
+from collections.abc import Callable
+from typing import Any
+
 import pytest

+logger = logging.getLogger(__name__)
+
+# Environment variable overrides for the tests in this package.
+# NOTE(review): the consumers are not visible in this chunk — presumably the
+# server fixtures merge this into their environment; confirm against callers.
+BASE_TEST_ENV = {
+    # The day vLLM said "hello world" on arxiv 🚀
+    "VLLM_SYSTEM_START_DATE": "2023-09-12",
+}
+# Default attempt budget for retry_for_tool_call and retry_streaming_for.
+DEFAULT_MAX_RETRIES = 3
+

 @pytest.fixture
 def pairs_of_event_types() -> dict[str, str]:
@@ -24,7 +39,325 @@ def pairs_of_event_types() -> dict[str, str]:
        "response.mcp_call.completed": "response.mcp_call.in_progress",
        "response.function_call_arguments.done": "response.function_call_arguments.delta", # noqa: E501
        "response.code_interpreter_call_code.done": "response.code_interpreter_call_code.delta", # noqa: E501
+        "response.code_interpreter_call.completed": "response.code_interpreter_call.in_progress", # noqa: E501
        "response.web_search_call.completed": "response.web_search_call.in_progress",
    }
    # fmt: on
    return event_pairs
+
+
+async def retry_for_tool_call(
+    client,
+    *,
+    model: str,
+    expected_tool_type: str,
+    max_retries: int = DEFAULT_MAX_RETRIES,
+    **create_kwargs: Any,
+):
+    """Call ``client.responses.create`` until a response has the wanted tool call.
+
+    The request is repeated up to *max_retries* times and the loop stops at
+    the first response whose ``output`` contains an item of
+    *expected_tool_type*.
+
+    Args:
+        client: Object exposing an awaitable ``responses.create``.
+        model: Model name forwarded to ``responses.create``.
+        expected_tool_type: ``type`` value an output item must have for the
+            response to count as a match. Items without a ``type``
+            attribute are compared as ``None``.
+        max_retries: Maximum number of requests to send; must be at least 1.
+        **create_kwargs: Extra keyword arguments forwarded unchanged to
+            ``responses.create``.
+
+    Returns:
+        The first matching response, or the **last** response if none match
+        so the caller's assertions fire with a clear diagnostic.
+
+    Raises:
+        ValueError: If *max_retries* is less than 1, because no request
+            would be sent and there would be no response to return.
+    """
+    if max_retries < 1:
+        raise ValueError(f"max_retries must be at least 1, got {max_retries}")
+
+    response = None
+    for _ in range(max_retries):
+        response = await client.responses.create(model=model, **create_kwargs)
+        if any(
+            getattr(item, "type", None) == expected_tool_type
+            for item in response.output
+        ):
+            break
+    # Either the matching response or, after exhausting the budget, the last one.
+    return response
+
+
+async def retry_streaming_for(
+    client,
+    *,
+    model: str,
+    validate_events: Callable[[list], bool],
+    max_retries: int = DEFAULT_MAX_RETRIES,
+    **create_kwargs: Any,
+) -> list:
+    """Stream ``client.responses.create`` until the events pass validation.
+
+    A streaming request (``stream=True``) is issued up to *max_retries*
+    times and each stream is drained completely into a list of events.
+    The first list for which *validate_events* returns ``True`` is returned
+    immediately. If no attempt passes, the events of the final attempt are
+    returned; if no attempt is made at all (``max_retries`` of zero or
+    less) the result is an empty list.
+    """
+    collected: list = []
+    for _ in range(max_retries):
+        stream = await client.responses.create(
+            model=model, stream=True, **create_kwargs
+        )
+        # Consume the whole stream before asking the validator.
+        collected = [event async for event in stream]
+        if validate_events(collected):
+            break
+    return collected
+
+
+def has_output_type(response, type_name: str) -> bool:
+    """Report whether ``response.output`` holds an item typed *type_name*.
+
+    Items lacking a ``type`` attribute are compared as if their type were
+    ``None``.
+    """
+    for output_item in response.output:
+        if getattr(output_item, "type", None) == type_name:
+            return True
+    return False
+
+
+def events_contain_type(events: list, type_substring: str) -> bool:
+    """Return True if any event's ``type`` contains *type_substring*.
+
+    An event without a ``type`` attribute is treated as having an empty
+    type string. An event whose ``type`` is not a string (for example
+    ``None``) never matches, rather than raising ``TypeError`` from the
+    substring test.
+    """
+    for event in events:
+        event_type = getattr(event, "type", "")
+        if isinstance(event_type, str) and type_substring in event_type:
+            return True
+    return False
+
+
+def _validate_event_pairing(events: list, pairs_of_event_types: dict[str, str]) -> None:
+    """Check that start/end streaming events nest like balanced brackets.
+
+    *pairs_of_event_types* maps each closing event type to the opening event
+    type it terminates. Openers are pushed onto a stack and every closer
+    must find its own opener on top of that stack; once all events have been
+    processed the stack has to be empty. Event types that appear on neither
+    side of the mapping are ignored.
+
+    Raises:
+        AssertionError: On a closer without its opener on top of the stack,
+            or when openers remain unclosed at the end.
+    """
+    openers = set(pairs_of_event_types.values())
+
+    open_stack: list[str] = []
+    for event in events:
+        kind = event.type
+
+        if kind in pairs_of_event_types:
+            # Closing event: its opener must be the innermost open one.
+            expected_start = pairs_of_event_types[kind]
+            top = open_stack[-1] if open_stack else "<empty>"
+            assert open_stack and top == expected_start, (
+                f"Stack mismatch for {kind}: expected {expected_start}, got {top}"
+            )
+            open_stack.pop()
+            continue
+
+        if kind not in openers:
+            # Passthrough event (e.g. response.in_progress,
+            # web_search_call.searching, code_interpreter_call.interpreting).
+            continue
+
+        # A run of identical deltas occupies a single stack slot.
+        repeats_top_delta = (
+            kind.endswith("delta") and open_stack and open_stack[-1] == kind
+        )
+        if not repeats_top_delta:
+            open_stack.append(kind)
+
+    assert len(open_stack) == 0, f"Unclosed events on stack: {open_stack}"
+
+
+def _validate_event_ordering(events: list) -> None:
+    """Check the position and multiplicity of the response envelope events.
+
+    The stream must contain at least two events, begin with
+    ``response.created`` and finish with ``response.completed``, each of
+    which may occur only once. ``response.in_progress`` is optional, but
+    when it occurs it must do so exactly once, as the second event.
+
+    Raises:
+        AssertionError: If any of the above does not hold.
+    """
+    assert len(events) >= 2, f"Expected at least 2 events, got {len(events)}"
+
+    # Envelope boundaries: created opens the stream, completed closes it.
+    first_type = events[0].type
+    assert first_type == "response.created", (
+        f"First event must be response.created, got {first_type}"
+    )
+    last_type = events[-1].type
+    assert last_type == "response.completed", (
+        f"Last event must be response.completed, got {last_type}"
+    )
+
+    event_types = [event.type for event in events]
+
+    # response.in_progress is optional; if present it sits at index 1 only.
+    in_progress_indices = [
+        index
+        for index, event_type in enumerate(event_types)
+        if event_type == "response.in_progress"
+    ]
+    assert not in_progress_indices or in_progress_indices == [1], (
+        f"response.in_progress must be the second event, "
+        f"found at indices {in_progress_indices}"
+    )
+
+    # Neither boundary event may be repeated anywhere in the stream.
+    created_count = event_types.count("response.created")
+    completed_count = event_types.count("response.completed")
+    assert created_count == 1, (
+        f"Expected exactly 1 response.created, got {created_count}"
+    )
+    assert completed_count == 1, (
+        f"Expected exactly 1 response.completed, got {completed_count}"
+    )
+
+
+def _validate_field_consistency(events: list) -> None:
+    """Validate item_id, output_index, and content_index consistency.
+
+    Tracks the active output item established by ``output_item.added``
+    and verifies that all subsequent events for that item carry matching
+    identifiers until ``output_item.done`` closes it.
+
+    Checks made directly in this function:
+
+    * every ``response.output_item.added`` has an ``item`` with a truthy
+      ``id``, and its ``output_index`` (when present) never decreases;
+    * every ``response.output_item.done`` has an ``item``; its id and
+      ``output_index`` must equal the active ones whenever both sides are
+      known;
+    * on any other item-level event, a ``content_index`` must equal the one
+      recorded by the latest ``content_part.added`` /
+      ``reasoning_part.added`` whenever both are not ``None``.
+
+    Item-level events are additionally passed to ``_assert_item_fields``
+    together with the active item id and output index.
+
+    Args:
+        events: Streamed events in arrival order. Each must expose ``type``;
+            all other fields are read with ``getattr`` and may be absent.
+
+    Raises:
+        AssertionError: If any of the checks above fails.
+    """
+    # Envelope events are skipped entirely by the loop below.
+    _SESSION_EVENTS = {
+        "response.created",
+        "response.in_progress",
+        "response.completed",
+    }
+
+    # State of the currently open output item; None while no item is open.
+    active_item_id: str | None = None
+    active_output_index: int | None = None
+    # Highest output_index seen so far; -1 means none seen yet.
+    last_output_index: int = -1
+    active_content_index: int | None = None
+
+    for event in events:
+        etype = event.type
+
+        if etype in _SESSION_EVENTS:
+            continue
+
+        # --- output_item.added: opens a new item ------------------
+        if etype == "response.output_item.added":
+            item = getattr(event, "item", None)
+            output_index = getattr(event, "output_index", None)
+
+            assert item is not None, "output_item.added must have an item"
+            item_id = getattr(item, "id", None)
+            assert item_id, "output_item.added item must have an id"
+
+            # output_index must be non-decreasing across items
+            if output_index is not None:
+                assert output_index >= last_output_index, (
+                    f"output_index went backwards: {output_index} < {last_output_index}"
+                )
+                last_output_index = output_index
+
+            # The new item becomes active; any previous content index is dropped.
+            active_item_id = item_id
+            active_output_index = output_index
+            active_content_index = None
+            continue
+
+        # --- output_item.done: closes the active item -------------
+        if etype == "response.output_item.done":
+            item = getattr(event, "item", None)
+            output_index = getattr(event, "output_index", None)
+
+            assert item is not None, "output_item.done must have an item"
+            done_item_id = getattr(item, "id", None)
+
+            # Comparisons are skipped when either side is missing.
+            if active_item_id is not None and done_item_id:
+                assert done_item_id == active_item_id, (
+                    f"output_item.done item.id mismatch: "
+                    f"expected {active_item_id}, got {done_item_id}"
+                )
+            if active_output_index is not None and output_index is not None:
+                assert output_index == active_output_index, (
+                    f"output_item.done output_index mismatch: "
+                    f"expected {active_output_index}, got {output_index}"
+                )
+
+            # No item is active until the next output_item.added.
+            active_item_id = None
+            active_output_index = None
+            active_content_index = None
+            continue
+
+        # --- content_part / reasoning_part added: sets content_index
+        if etype in (
+            "response.content_part.added",
+            "response.reasoning_part.added",
+        ):
+            _assert_item_fields(event, etype, active_item_id, active_output_index)
+            active_content_index = getattr(event, "content_index", None)
+            continue
+
+        # --- all other item-level events --------------------------
+        _assert_item_fields(event, etype, active_item_id, active_output_index)
+
+        # content_index (only meaningful on events that carry it)
+        content_index = getattr(event, "content_index", None)
+        if content_index is not None and active_content_index is not None:
+            assert content_index == active_content_index, (
+                f"{etype} content_index mismatch: "
+                f"expected {active_content_index}, got {content_index}"
+            )
+
+
+def _assert_item_fields(
+    event,
+    etype: str,
+    active_item_id: str | None,
+    active_output_index: int | None,
+) -> None:
+    """Assert that *event* agrees with the open item's id and output index.
+
+    Each field is compared only when both the active value and the value
+    read from *event* are not ``None``.  *etype* is used solely to label
+    the assertion message.
+    """
+    seen_item_id = getattr(event, "item_id", None)
+    seen_output_index = getattr(event, "output_index", None)
+
+    id_is_comparable = not (active_item_id is None or seen_item_id is None)
+    if id_is_comparable:
+        assert seen_item_id == active_item_id, (
+            f"{etype} item_id mismatch: expected {active_item_id}, got {seen_item_id}"
+        )
+
+    if active_output_index is None or seen_output_index is None:
+        return
+    assert seen_output_index == active_output_index, (
+        f"{etype} output_index mismatch: "
+        f"expected {active_output_index}, got {seen_output_index}"
+    )
+
+
+def validate_streaming_event_stack(
+    events: list, pairs_of_event_types: dict[str, str]
+) -> None:
+    """Run every streaming-event validation pass over *events*.
+
+    The passes run in this order, and the first failing assertion
+    propagates to the caller:
+
+    * **Pairing** — start/end events are properly nested, matched with
+      a stack driven by *pairs_of_event_types*.
+    * **Ordering** — the envelope events (``created``, ``in_progress``,
+      ``completed``) sit at the correct positions.
+    * **Field consistency** — ``item_id``, ``output_index`` and
+      ``content_index`` agree across the related events of each output
+      item's lifecycle.
+    """
+    # 1. start/end nesting
+    _validate_event_pairing(events, pairs_of_event_types)
+    # 2. envelope event positions
+    _validate_event_ordering(events)
+    # 3. per-item identifier agreement
+    _validate_field_consistency(events)
+
+
+def log_response_diagnostics(
+    response,
+    *,
+    label: str = "Response Diagnostics",
+) -> dict[str, Any]:
+    """Extract and log diagnostic info from a Responses API response.
+
+    Logs reasoning, tool-call attempts, MCP items, and output types so
+    that CI output (``pytest -s`` or ``--log-cli-level=INFO``) gives
+    full visibility into model behaviour even on passing runs.
+
+    Args:
+        response: Object exposing ``output`` (iterable of output items)
+            and ``output_text``.  ``output_messages`` is read when
+            present; its entries are accessed with ``.get``.
+        label: Heading used in the logged banner.
+
+    Returns:
+        The extracted data, so callers can make additional assertions
+        if needed.
+    """
+    output_items = response.output
+
+    # getattr's default only applies when the attribute is missing; also
+    # fall back to an empty list when ``content`` is present but None, so
+    # this diagnostics helper does not raise on such an item.
+    reasoning_texts = [
+        text
+        for item in output_items
+        if getattr(item, "type", None) == "reasoning"
+        for content in (getattr(item, "content", None) or [])
+        if (text := getattr(content, "text", None))
+    ]
+
+    # ``output_messages`` may be absent or None; treat both as "no messages".
+    output_messages = getattr(response, "output_messages", None) or []
+    tool_call_attempts = [
+        {
+            "recipient": msg.get("recipient"),
+            "channel": msg.get("channel"),
+        }
+        for msg in output_messages
+        if (msg.get("recipient") or "").startswith("python")
+    ]
+
+    mcp_items = [
+        {
+            "name": getattr(item, "name", None),
+            "status": getattr(item, "status", None),
+        }
+        for item in output_items
+        if getattr(item, "type", None) == "mcp_call"
+    ]
+
+    output_types = [getattr(o, "type", None) for o in output_items]
+
+    diagnostics = {
+        "model_attempted_tool_calls": bool(tool_call_attempts),
+        "tool_call_attempts": tool_call_attempts,
+        "mcp_items": mcp_items,
+        "reasoning": reasoning_texts,
+        "output_text": response.output_text,
+        "output_types": output_types,
+    }
+
+    # default=str keeps the dump from failing on non-JSON-native values.
+    logger.info(
+        "\n====== %s ======\n%s\n==============================",
+        label,
+        json.dumps(diagnostics, indent=2, default=str),
+    )
+
+    return diagnostics
--- a/tests/entrypoints/openai/responses/test_errors.py
+++ b/tests/entrypoints/openai/responses/test_errors.py
@@ -6,7 +6,6 @@ from unittest.mock import MagicMock

 import pytest

-from vllm.entrypoints.openai.engine.protocol import ErrorResponse
 from vllm.entrypoints.openai.engine.serving import GenerationError, OpenAIServing


@@ -38,32 +37,6 @@ async def test_raise_if_error_raises_generation_error():
    serving._raise_if_error(None, "test-request-id")  # should not raise


-@pytest.mark.asyncio
-async def test_convert_generation_error_to_response():
-    """test _convert_generation_error_to_response creates proper ErrorResponse"""
-    mock_engine = MagicMock()
-    mock_engine.model_config = MagicMock()
-    mock_engine.model_config.max_model_len = 100
-    mock_models = MagicMock()
-
-    serving = OpenAIServing(
-        engine_client=mock_engine,
-        models=mock_models,
-        request_logger=None,
-    )
-
-    # create a GenerationError
-    gen_error = GenerationError("Internal server error")
-
-    # convert to ErrorResponse
-    error_response = serving._convert_generation_error_to_response(gen_error)
-
-    assert isinstance(error_response, ErrorResponse)
-    assert error_response.error.type == "InternalServerError"
-    assert error_response.error.message == "Internal server error"
-    assert error_response.error.code == HTTPStatus.INTERNAL_SERVER_ERROR
-
-
 @pytest.mark.asyncio
 async def test_convert_generation_error_to_streaming_response():
    """test _convert_generation_error_to_streaming_response output"""