fix(frontend): vllm processor works with stream_interval > 1 (#6816)

Signed-off-by: Graham King <grahamk@nvidia.com>

fix(frontend): vllm processor works with stream_interval > 1 (#6816)
Signed-off-by: Graham King <grahamk@nvidia.com>
4e1bd700 · Graham King · GitHub · 35f99f93 · 4e1bd700 · 4e1bd700
Unverified Commit 4e1bd700 authored Mar 04, 2026 by Graham King Committed by GitHub Mar 04, 2026
7 changed files
--- a/components/src/dynamo/frontend/prepost.py
+++ b/components/src/dynamo/frontend/prepost.py
@@ -8,8 +8,13 @@ from collections.abc import Sequence
 from dataclasses import dataclass
 from typing import Any
+from vllm.entrypoints.chat_utils import make_tool_call_id
 from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
-from vllm.entrypoints.openai.engine.protocol import DeltaMessage, DeltaToolCall
+from vllm.entrypoints.openai.engine.protocol import (
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+)
 from vllm.reasoning import ReasoningParser
 from vllm.renderers import ChatParams
 from vllm.sampling_params import SamplingParams
@@ -255,6 +260,11 @@ class StreamingPostProcessor:
        self.previous_token_ids: list[int] = []
        self.reasoning_is_done = False
        self.in_progress_tool_calls: dict[int, DeltaToolCall] = {}
+        # Buffer for post-reasoning tool text when </think> and <tool_call>
+        # arrive in the same chunk.  The streaming tool parser cannot handle
+        # this correctly, so we accumulate text here and fall back to the
+        # non-streaming extract_tool_calls() once the buffer is complete.
+        self._tool_text_buffer: str | None = None
    @staticmethod
    def _merge_tool_call(
@@ -290,6 +300,102 @@ class StreamingPostProcessor:
            stripped = stripped.replace(marker, "")
        return stripped.strip() == ""
+    def _should_parse_tools(self) -> bool:
+        """Tell whether tool-call parsing applies to the current request.
+
+        Returns:
+            False when no tool parser is configured or when the request sets
+            ``tool_choice`` to ``"none"``; True otherwise.
+        """
+        if self.tool_parser is None:
+            return False
+        return self.request_for_sampling.tool_choice != "none"
+    @staticmethod
+    def _compose_delta_message(
+        reasoning: str | None, content: str | None
+    ) -> DeltaMessage | None:
+        """Wrap reasoning/content text in a DeltaMessage.
+
+        Args:
+            reasoning: Reasoning text for this chunk, if any.
+            content: Visible content text for this chunk, if any.
+
+        Returns:
+            The constructed message, or None when both its ``reasoning`` and
+            ``content`` are empty or None (nothing worth emitting).
+        """
+        message = DeltaMessage(reasoning=reasoning, content=content)
+        if message.reasoning or message.content:
+            return message
+        return None
+    def _add_tool_call_from_extracted(self, index: int, tool_call: Any) -> None:
+        """Store one extracted tool call in ``in_progress_tool_calls``.
+
+        Args:
+            index: Slot under which the call is stored.
+            tool_call: Extracted call; its ``id``, ``function.name`` and
+                ``function.arguments`` attributes are read.
+        """
+        # Keep the parser-supplied id when there is one; otherwise mint one.
+        call_id = tool_call.id or make_tool_call_id()
+        function_part = DeltaFunctionCall(
+            name=tool_call.function.name,
+            arguments=tool_call.function.arguments,
+        )
+        extracted_delta = DeltaToolCall(
+            index=index,
+            type="function",
+            id=call_id,
+            function=function_part,
+        )
+        # Merge with whatever is already pending at this index (possibly nothing).
+        self.in_progress_tool_calls[index] = self._merge_tool_call(
+            self.in_progress_tool_calls.get(index), extracted_delta
+        )
+    def _extract_tool_calls_from_text(
+        self, text: str, *, saved_reasoning: str | None = None
+    ) -> DeltaMessage | None:
+        """Run the non-streaming tool parser over a complete piece of text.
+
+        Args:
+            text: Text handed to ``tool_parser.extract_tool_calls``.
+            saved_reasoning: Reasoning text to carry along in the returned
+                message.
+
+        Returns:
+            A message holding ``saved_reasoning`` and, only when no tool call
+            was found, the parser's leftover content; None when there is
+            nothing to emit. Found tool calls are not returned: they are
+            stored in ``in_progress_tool_calls``.
+        """
+        parser = self.tool_parser
+        if parser is None:
+            return self._compose_delta_message(saved_reasoning, None)
+        result = parser.extract_tool_calls(text, self.request_for_sampling)
+        if not result.tools_called:
+            return self._compose_delta_message(
+                saved_reasoning, result.content or None
+            )
+        for position, call in enumerate(result.tool_calls):
+            self._add_tool_call_from_extracted(position, call)
+        return self._compose_delta_message(saved_reasoning, None)
+    def _extract_tool_calls_streaming(
+        self,
+        *,
+        current_text: str,
+        delta_text: str,
+        delta_token_ids: list[int],
+        current_token_ids: list[int],
+    ) -> DeltaMessage | None:
+        """Forward one chunk to the streaming tool parser.
+
+        The "previous" text and token ids and the request are taken from the
+        instance; the "current" and "delta" values come from the caller.
+
+        Returns:
+            Whatever ``tool_parser.extract_tool_calls_streaming`` returns, or
+            None when no tool parser is configured.
+        """
+        parser = self.tool_parser
+        if parser is None:
+            return None
+        streaming_args = {
+            "previous_text": self.previous_text,
+            "current_text": current_text,
+            "delta_text": delta_text,
+            "previous_token_ids": self.previous_token_ids,
+            "current_token_ids": current_token_ids,
+            "delta_token_ids": delta_token_ids,
+            "request": self.request_for_sampling,
+        }
+        return parser.extract_tool_calls_streaming(**streaming_args)
+    def _merge_streaming_tool_calls(self, tool_calls: list[DeltaToolCall]) -> None:
+        """Fold each streamed tool-call delta into the pending call at its index.
+
+        Args:
+            tool_calls: Deltas from the streaming parser; each one's ``index``
+                selects the entry of ``in_progress_tool_calls`` it merges into.
+        """
+        pending = self.in_progress_tool_calls
+        for fragment in tool_calls:
+            slot = fragment.index
+            pending[slot] = self._merge_tool_call(pending.get(slot), fragment)
+    def _dump_in_progress_tool_calls(self) -> list[dict[str, Any]]:
+        """Serialize the pending tool calls, ordered by tool-call index.
+
+        The inline code this helper replaces iterated the pending calls in
+        sorted index order; sorting here keeps the emitted ``tool_calls`` list
+        independent of the order in which indices were first inserted.
+
+        Returns:
+            One ``model_dump(exclude_none=True)`` dict per pending tool call,
+            in ascending index order. The pending calls are not cleared.
+        """
+        pending = self.in_progress_tool_calls
+        return [
+            pending[index].model_dump(exclude_none=True)
+            for index in sorted(pending)
+        ]
+    def _emit_tool_calls_choice(self, output: Any) -> dict[str, Any]:
+        """Build a choice carrying every pending tool call, then reset them.
+
+        Args:
+            output: Engine output whose ``index``, ``finish_reason`` and
+                ``logprobs`` are copied into the choice.
+
+        Returns:
+            A choice dict whose delta has role ``"assistant"`` and the
+            serialized pending tool calls.
+        """
+        tool_delta = {
+            "role": "assistant",
+            "tool_calls": self._dump_in_progress_tool_calls(),
+        }
+        emitted = self._build_choice(output, tool_delta)
+        # The pending calls now live in the choice; clear so they are not re-sent.
+        self.in_progress_tool_calls.clear()
+        return emitted
+    @staticmethod
+    def _build_choice(output: Any, delta: dict[str, Any]) -> dict[str, Any]:
+        """Assemble the choice dict for one streamed chunk.
+
+        Args:
+            output: Engine output supplying ``index``, ``finish_reason`` and
+                ``logprobs``.
+            delta: Delta payload, stored as given (not copied).
+
+        Returns:
+            Dict with the keys ``index``, ``delta``, ``finish_reason`` and
+            ``logprobs``, in that order.
+        """
+        choice: dict[str, Any] = {"index": output.index}
+        choice["delta"] = delta
+        choice["finish_reason"] = output.finish_reason
+        choice["logprobs"] = output.logprobs
+        return choice
    def process_output(self, output: Any) -> dict[str, Any] | None:
        delta_token_ids = list(output.token_ids or [])
        # vLLM output_processor already applies stop-token/stop-string trimming
@@ -306,19 +412,36 @@ class StreamingPostProcessor:
                delta = {}
            else:
                return None
-            return {
+            return self._build_choice(output, delta)
-                "index": output.index,
-                "delta": delta,
-                "finish_reason": output.finish_reason,
-                "logprobs": output.logprobs,
-            }
        current_text = self.previous_text + delta_text
        current_token_ids = self.previous_token_ids + delta_token_ids
        delta_message: DeltaMessage | None = DeltaMessage(content=delta_text)
-        if not self.reasoning_is_done and self.reasoning_parser:
+        # ------------------------------------------------------------------
+        # Drain the tool-text buffer (populated when </think> and <tool_call>
+        # arrived in the same chunk).  The streaming tool parser cannot
+        # handle that transition correctly, so we accumulate text here and
+        # use the non-streaming extract_tool_calls() once complete.
+        # ------------------------------------------------------------------
+        if self._tool_text_buffer is not None:
+            self._tool_text_buffer += delta_text
+            tool_call_end = getattr(self.tool_parser, "tool_call_end_token", None)
+            buffer_complete = (
+                tool_call_end and tool_call_end in self._tool_text_buffer
+            ) or output.finish_reason
+            if buffer_complete:
+                buffered_text = self._tool_text_buffer
+                self._tool_text_buffer = None
+                delta_message = self._extract_tool_calls_from_text(buffered_text)
+            else:
+                # Still accumulating; emit nothing for this chunk.
+                self.previous_text = current_text
+                self.previous_token_ids = current_token_ids
+                return None
+        elif not self.reasoning_is_done and self.reasoning_parser:
            delta_message = self.reasoning_parser.extract_reasoning_streaming(
                self.previous_text,
                current_text,
@@ -328,68 +451,96 @@ class StreamingPostProcessor:
                delta_token_ids,
            )
-        should_parse_tools = (
+            # When reasoning ends in this chunk, reset accumulated state.
-            self.tool_parser is not None
+            # If there is post-reasoning content (e.g. <tool_call> markup),
-            and self.request_for_sampling.tool_choice != "none"
+            # buffer it for non-streaming extraction rather than feeding it
-        )
+            # to the streaming tool parser which cannot handle the combined
-        if should_parse_tools:
+            # reasoning-end + tool-start in a single chunk.
-            no_prev_reasoning = (
+            if self.reasoning_parser.is_reasoning_end_streaming(
-                delta_message and delta_message.content and not delta_message.reasoning
+                current_token_ids, delta_token_ids
-            )
+            ):
-            if self.reasoning_is_done or no_prev_reasoning:
+                self.reasoning_is_done = True
-                delta_message = self.tool_parser.extract_tool_calls_streaming(
+                saved_reasoning = delta_message.reasoning if delta_message else None
-                    previous_text=self.previous_text,
+                post_content = (delta_message.content if delta_message else None) or ""
+                self.previous_text = ""
+                self.previous_token_ids = []
+                current_text = ""
+                current_token_ids = []
+                tool_call_start = getattr(
+                    self.tool_parser, "tool_call_start_token", None
+                )
+                if post_content and tool_call_start and tool_call_start in post_content:
+                    # Tool call markup present — buffer for non-streaming
+                    # extraction (streaming parser can't handle the combined
+                    # reasoning-end + tool-start in a single chunk).
+                    self._tool_text_buffer = post_content
+                    if output.finish_reason:
+                        # If finish_reason is already set, this is the final
+                        # chunk; parse buffered text now instead of waiting for
+                        # a later call that will never happen.
+                        buffered_text = self._tool_text_buffer
+                        self._tool_text_buffer = None
+                        delta_message = self._extract_tool_calls_from_text(
+                            buffered_text,
+                            saved_reasoning=saved_reasoning,
+                        )
+                    else:
+                        delta_message = self._compose_delta_message(
+                            saved_reasoning,
+                            None,
+                        )
+                else:
+                    # Plain content (or no content) after reasoning end.
+                    delta_message = self._compose_delta_message(
+                        reasoning=saved_reasoning,
+                        content=post_content if post_content else None,
+                    )
+            elif (
+                delta_message
+                and delta_message.content
+                and not delta_message.reasoning
+                and self._should_parse_tools()
+            ):
+                # Reasoning parser returned content (not reasoning).
+                # The model may have skipped reasoning and gone straight
+                # to tool calls (e.g. Mistral [TOOL_CALLS] without
+                # [THINK]...[/THINK]).  Let the tool parser decide.
+                delta_message = self._extract_tool_calls_streaming(
                    current_text=current_text,
                    delta_text=delta_text,
-                    previous_token_ids=self.previous_token_ids,
                    current_token_ids=current_token_ids,
                    delta_token_ids=delta_token_ids,
-                    request=self.request_for_sampling,
                )
+        else:
-        if (
+            if self._should_parse_tools():
-            not self.reasoning_is_done
+                no_prev_reasoning = (
-            and self.reasoning_parser
+                    delta_message
-            and self.reasoning_parser.is_reasoning_end_streaming(
+                    and delta_message.content
-                current_token_ids, delta_token_ids
+                    and not delta_message.reasoning
-            )
+                )
-        ):
+                if self.reasoning_is_done or no_prev_reasoning:
-            self.reasoning_is_done = True
+                    delta_message = self._extract_tool_calls_streaming(
-            self.previous_text = ""
+                        current_text=current_text,
-            self.previous_token_ids = []
+                        delta_text=delta_text,
-            current_text = ""
+                        current_token_ids=current_token_ids,
-            current_token_ids = []
+                        delta_token_ids=delta_token_ids,
+                    )
        choice = None
        if delta_message is None:
            if self.in_progress_tool_calls:
-                choice = {
+                choice = self._emit_tool_calls_choice(output)
-                    "index": output.index,
-                    "delta": {
-                        "role": "assistant",
-                        "tool_calls": [
-                            tool_call.model_dump(exclude_none=True)
-                            for _, tool_call in sorted(
-                                self.in_progress_tool_calls.items()
-                            )
-                        ],
-                    },
-                    "finish_reason": output.finish_reason,
-                    "logprobs": output.logprobs,
-                }
-                self.in_progress_tool_calls.clear()
            elif output.finish_reason:
-                choice = {
+                choice = self._build_choice(output, {})
-                    "index": output.index,
-                    "delta": {},
-                    "finish_reason": output.finish_reason,
-                    "logprobs": output.logprobs,
-                }
        elif delta_message.tool_calls:
-            for tool_delta in delta_message.tool_calls:
+            self._merge_streaming_tool_calls(delta_message.tool_calls)
-                existing = self.in_progress_tool_calls.get(tool_delta.index)
+            if output.finish_reason and self.in_progress_tool_calls:
-                merged = self._merge_tool_call(existing, tool_delta)
+                # Tool calls and finish_reason arrived in the same chunk.
-                self.in_progress_tool_calls[tool_delta.index] = merged
+                # Emit now — there will be no subsequent process_output call
+                # to drain the buffer.
+                choice = self._emit_tool_calls_choice(output)
        elif delta_message.content or delta_message.reasoning:
            delta: dict[str, Any] = {"role": "assistant"}
            content = delta_message.content
@@ -400,39 +551,14 @@ class StreamingPostProcessor:
            if delta_message.reasoning:
                delta["reasoning_content"] = delta_message.reasoning
            if self.in_progress_tool_calls:
-                delta["tool_calls"] = [
+                delta["tool_calls"] = self._dump_in_progress_tool_calls()
-                    tool_call.model_dump(exclude_none=True)
-                    for _, tool_call in sorted(self.in_progress_tool_calls.items())
-                ]
                self.in_progress_tool_calls.clear()
            if len(delta) > 1:
-                choice = {
+                choice = self._build_choice(output, delta)
-                    "index": output.index,
-                    "delta": delta,
-                    "finish_reason": output.finish_reason,
-                    "logprobs": output.logprobs,
-                }
        elif self.in_progress_tool_calls:
-            choice = {
+            choice = self._emit_tool_calls_choice(output)
-                "index": output.index,
-                "delta": {
-                    "role": "assistant",
-                    "tool_calls": [
-                        tool_call.model_dump(exclude_none=True)
-                        for _, tool_call in sorted(self.in_progress_tool_calls.items())
-                    ],
-                },
-                "finish_reason": output.finish_reason,
-                "logprobs": output.logprobs,
-            }
-            self.in_progress_tool_calls.clear()
        elif output.finish_reason:
-            choice = {
+            choice = self._build_choice(output, {})
-                "index": output.index,
-                "delta": {},
-                "finish_reason": output.finish_reason,
-                "logprobs": output.logprobs,
-            }
        self.previous_text = current_text
        self.previous_token_ids = current_token_ids

--- a/components/src/dynamo/frontend/vllm_processor.py
+++ b/components/src/dynamo/frontend/vllm_processor.py
@@ -114,6 +114,7 @@ def _init_worker(
 ) -> None:
    """Initialize a worker process with its own VllmConfig and InputProcessor."""
    global _w_input_processor, _w_tokenizer, _w_tool_parser_class
+    global _w_reasoning_parser_class
    model_config = ModelConfig(
        model=model_path,

--- a/container/deps/requirements.test.txt
+++ b/container/deps/requirements.test.txt
@@ -18,6 +18,7 @@ kr8s==0.20.13
 kubernetes==32.0.1
 kubernetes_asyncio==32.0.0
 matplotlib==3.10.7
+mistral-common==1.9.1
 # For NATS object store verification in router tests
 nats-py==2.12.0
 pmdarima==2.1.1

--- a/tests/frontend/__init__.py
+++ b/tests/frontend/__init__.py
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
--- a/tests/frontend/common.py
+++ b/tests/frontend/common.py
+#  SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#  SPDX-License-Identifier: Apache-2.0
+import importlib
+def check_module_available(module_name: str) -> bool:
+    """Return True if *module_name* can be located and imported.
+
+    For tests / pre-commit: lets test modules guard optional heavy imports.
+
+    Args:
+        module_name: Absolute module name; may be dotted.
+
+    Returns:
+        False when the module (or, for a dotted name, its parent package)
+        cannot be found or fails with ImportError on import; True otherwise.
+    """
+    # A bare ``import importlib`` does not guarantee that the ``util``
+    # submodule is loaded, so import it explicitly.
+    import importlib.util
+
+    try:
+        # find_spec imports parent packages of a dotted name and raises
+        # ModuleNotFoundError when one is missing, so it belongs in the try.
+        if importlib.util.find_spec(module_name) is None:
+            return False
+        importlib.import_module(module_name)
+    except ImportError:
+        return False
+    return True
--- a/tests/frontend/test_prepost.py
+++ b/tests/frontend/test_prepost.py
+#  SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#  SPDX-License-Identifier: Apache-2.0
+"""Unit test for StreamingPostProcessor with Qwen3 reasoning + Hermes tool calling."""
+# mypy seems to be running both sides of the HAS_VLLM if statement
+# mypy: ignore-errors
+import json
+import pytest
+from .common import check_module_available
+# Probe for vLLM once at import time; the vLLM-dependent imports below are
+# only attempted when the probe succeeds.
+HAS_VLLM = check_module_available("vllm")
+if HAS_VLLM:
+    from vllm.entrypoints.openai.chat_completion.protocol import (
+        ChatCompletionRequest,
+        ChatCompletionToolsParam,
+    )
+    from vllm.entrypoints.openai.engine.protocol import FunctionDefinition
+    from vllm.outputs import CompletionOutput
+    from vllm.reasoning.qwen3_reasoning_parser import Qwen3ReasoningParser
+    from vllm.sampling_params import SamplingParams
+    from vllm.tool_parsers.hermes_tool_parser import Hermes2ProToolParser
+    from dynamo.frontend.prepost import StreamingPostProcessor
+else:
+    # Fake some types so that `pre-commit` passes
+    # This module builds CompletionOutput(...) fixtures at import time, so the
+    # stand-in accepts and ignores any arguments (``self`` arrives in *args).
+    class CompletionOutput:
+        def __init__(*args, **kwargs):
+            pass
+# Markers applied to every test in this module; the skipif keeps the module
+# collectable when vLLM is not installed.
+pytestmark = [
+    pytest.mark.vllm,
+    pytest.mark.gpu_0,  # "Hardware"
+    pytest.mark.pre_merge,  # "Lifecycle"
+    pytest.mark.unit,  # "Test Type"
+    pytest.mark.skipif(not HAS_VLLM, reason="requires vllm"),
+]
+# ---------------------------------------------------------------------------
+# Mock tokenizer mimicking CachedQwen2TokenizerFast for Qwen3-0.6B
+# ---------------------------------------------------------------------------
+class MockQwen3Tokenizer:
+    """Tiny stand-in tokenizer that knows only a fixed set of marker tokens.
+
+    ``encode`` handles a string only when it is exactly one known token;
+    ``decode`` renders unknown ids as ``<unk:ID>`` placeholders.
+    """
+    def __init__(self):
+        token_ids = {
+            "<|endoftext|>": 151643,
+            "<|im_start|>": 151644,
+            "<|im_end|>": 151645,
+            "<|object_ref_start|>": 151646,
+            "<|object_ref_end|>": 151647,
+            "<|box_start|>": 151648,
+            "<|box_end|>": 151649,
+            "<|quad_start|>": 151650,
+            "<|quad_end|>": 151651,
+            "<|vision_start|>": 151652,
+            "<|vision_end|>": 151653,
+            "<|vision_pad|>": 151654,
+            "<|image_pad|>": 151655,
+            "<|video_pad|>": 151656,
+            "<tool_call>": 151657,
+            "</tool_call>": 151658,
+            "<tool_response>": 151665,
+            "</tool_response>": 151666,
+            "<think>": 151667,
+            "</think>": 151668,
+        }
+        self._vocab = token_ids
+        self._id_to_token = {}
+        for token, token_id in token_ids.items():
+            self._id_to_token[token_id] = token
+        # The special tokens are exactly the ``<|...|>`` entries of the
+        # vocabulary, kept in vocabulary order.
+        self.all_special_tokens = [
+            token for token in token_ids if token.startswith("<|")
+        ]
+    def get_vocab(self):
+        """Return a copy of the token -> id mapping."""
+        return self._vocab.copy()
+    def encode(self, text, add_special_tokens=False):
+        """Return the single id for *text*; raise ValueError if it is unknown.
+
+        ``add_special_tokens`` is accepted but not used.
+        """
+        if text not in self._vocab:
+            raise ValueError(f"Cannot encode unknown text: {text!r}")
+        return [self._vocab[text]]
+    def decode(self, token_ids):
+        """Concatenate the tokens for *token_ids*, marking unknown ids."""
+        pieces = []
+        for tid in token_ids:
+            pieces.append(self._id_to_token.get(tid, f"<unk:{tid}>"))
+        return "".join(pieces)
+# ---------------------------------------------------------------------------
+# Test data: stream_interval=1 (one token per output)
+# ---------------------------------------------------------------------------
+OUTPUTS_INTERVAL_1 = [
+    CompletionOutput(
+        index=0,
+        text="<think>",
+        token_ids=[151667],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text="\n",
+        token_ids=[198],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text="Okay",
+        token_ids=[32313],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text=",",
+        token_ids=[11],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text=" the",
+        token_ids=[279],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text=" user",
+        token_ids=[1196],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text=" is",
+        token_ids=[374],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text=" asking",
+        token_ids=[10161],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text=" for",
+        token_ids=[369],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text=" the",
+        token_ids=[279],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text=" titles",
+        token_ids=[15311],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text=" of",
+        token_ids=[315],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text=" some",
+        token_ids=[1045],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text=" James",
+        token_ids=[7801],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text=" Joyce",
+        token_ids=[53626],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text=" books",
+        token_ids=[6467],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text=" and",
+        token_ids=[323],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text=" wants",
+        token_ids=[6801],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text=" me",
+        token_ids=[752],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text=" to",
+        token_ids=[311],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text=" use",
+        token_ids=[990],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text=" the",
+        token_ids=[279],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text=" provided",
+        token_ids=[3897],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text=" tool",
+        token_ids=[5392],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text=".\n",
+        token_ids=[624],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text="</think>",
+        token_ids=[151668],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text="\n\n",
+        token_ids=[271],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text="<tool_call>",
+        token_ids=[151657],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text="\n",
+        token_ids=[198],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text='{"',
+        token_ids=[4913],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text="name",
+        token_ids=[606],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text='":',
+        token_ids=[788],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text=' "',
+        token_ids=[330],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text="search",
+        token_ids=[1836],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text="_g",
+        token_ids=[1889],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text="utenberg",
+        token_ids=[44433],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text="_books",
+        token_ids=[73084],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text='",',
+        token_ids=[497],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text=' "',
+        token_ids=[330],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text="arguments",
+        token_ids=[16370],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text='":',
+        token_ids=[788],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text=' {"',
+        token_ids=[5212],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text="search",
+        token_ids=[1836],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text="_terms",
+        token_ids=[37498],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text='":',
+        token_ids=[788],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text=' ["',
+        token_ids=[4383],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text="James",
+        token_ids=[28084],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text=" Joyce",
+        token_ids=[53626],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text='",',
+        token_ids=[497],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text=' "',
+        token_ids=[330],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text="Project",
+        token_ids=[7849],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text=" Gutenberg",
+        token_ids=[51586],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text='"]',
+        token_ids=[1341],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text="}}\n",
+        token_ids=[11248],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text="</tool_call>",
+        token_ids=[151658],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=None,
+        stop_reason=None,
+    ),
+    CompletionOutput(
+        index=0,
+        text="",
+        token_ids=[151645],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason="stop",
+        stop_reason=None,
+    ),
+]
+# ---------------------------------------------------------------------------
+# Test data: stream_interval=20 (multiple tokens per output)
+# The critical difference: </think>, \n\n, <tool_call>, and the start of the
+# JSON tool-call body can all arrive in a single CompletionOutput chunk.
+# ---------------------------------------------------------------------------
+# Each row below is (text, token_ids, finish_reason).  Every other
+# CompletionOutput field is identical for all chunks and is supplied by the
+# comprehension, so only the values that vary are spelled out.
+# fmt: off
+OUTPUTS_INTERVAL_20 = [
+    CompletionOutput(
+        index=0,
+        text=chunk_text,
+        token_ids=chunk_token_ids,
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=chunk_finish_reason,
+        stop_reason=None,
+    )
+    for chunk_text, chunk_token_ids, chunk_finish_reason in (
+        ("<think>", [151667], None),
+        (
+            "\nOkay, the user is asking for the titles of some James Joyce books and wants me to use",
+            [198, 32313, 11, 279, 1196, 374, 10161, 369, 279, 15311,
+             315, 1045, 7801, 53626, 6467, 323, 6801, 752, 311, 990],
+            None,
+        ),
+        (
+            " the provided tool. Let me check the available functions. There's a search_gutenberg_books function that",
+            [279, 3897, 5392, 13, 6771, 752, 1779, 279, 2500, 5746,
+             13, 2619, 594, 264, 2711, 1889, 44433, 73084, 729, 429],
+            None,
+        ),
+        (
+            ' takes an array of search terms. The user mentioned "James Joyce books," so I need to use',
+            [4990, 458, 1334, 315, 2711, 3793, 13, 576, 1196, 9733,
+             330, 28084, 53626, 6467, 1335, 773, 358, 1184, 311, 990],
+            None,
+        ),
+        (
+            " the search terms related to that. I should make sure to list the relevant terms. Let me think",
+            [279, 2711, 3793, 5435, 311, 429, 13, 358, 1265, 1281,
+             2704, 311, 1140, 279, 9760, 3793, 13, 6771, 752, 1744],
+            None,
+        ),
+        (
+            '... "James Joyce" and "Project Gutenberg" might be the keywords here. So I\'ll structure',
+            [1112, 330, 28084, 53626, 1, 323, 330, 7849, 51586, 1,
+             2578, 387, 279, 20844, 1588, 13, 2055, 358, 3278, 5944],
+            None,
+        ),
+        (
+            ' the search terms as ["James Joyce", "Project Gutenberg"] to find the books. That should cover',
+            [279, 2711, 3793, 438, 4383, 28084, 53626, 497, 330, 7849,
+             51586, 1341, 311, 1477, 279, 6467, 13, 2938, 1265, 3421],
+            None,
+        ),
+        (
+            # </think>, <tool_call> and the start of the JSON body share one chunk.
+            ' the user\'s request.\n</think>\n\n<tool_call>\n{"name": "search_gutenberg_books", "arguments',
+            [279, 1196, 594, 1681, 624, 151668, 271, 151657, 198, 4913,
+             606, 788, 330, 1836, 1889, 44433, 73084, 497, 330, 16370],
+            None,
+        ),
+        (
+            '": {"search_terms": ["James Joyce", "Project Gutenberg"]}}\n</tool_call>',
+            [788, 5212, 1836, 37498, 788, 4383, 28084, 53626, 497, 330,
+             7849, 51586, 1341, 11248, 151658, 151645],
+            "stop",
+        ),
+    )
+]
+# fmt: on
+# ---------------------------------------------------------------------------
+# Test data: stream_interval=20, reasoning + plain content (no tool calls).
+# The critical difference from OUTPUTS_INTERVAL_20: the last chunk contains
+# </think>, the response content, AND finish_reason=stop all in one
+# CompletionOutput.  There is no <tool_call> markup at all.
+# ---------------------------------------------------------------------------
+# Each row below is (text, token_ids, finish_reason).  Every other
+# CompletionOutput field is identical for all chunks and is supplied by the
+# comprehension, so only the values that vary are spelled out.
+# fmt: off
+OUTPUTS_NO_TOOL_CALL = [
+    CompletionOutput(
+        index=0,
+        text=chunk_text,
+        token_ids=chunk_token_ids,
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=chunk_finish_reason,
+        stop_reason=None,
+    )
+    for chunk_text, chunk_token_ids, chunk_finish_reason in (
+        ("<think>", [151667], None),
+        (
+            "\nOkay, I need to find out the capital of Tuvalu. Let me start by recalling what",
+            [198, 32313, 11, 358, 1184, 311, 1477, 700, 279, 6722,
+             315, 28649, 25510, 13, 6771, 752, 1191, 553, 88646, 1128],
+            None,
+        ),
+        (
+            " I know. Tuvalu is a small island nation in the Pacific Ocean. I remember studying geography in",
+            [358, 1414, 13, 28649, 25510, 374, 264, 2613, 12922, 6995,
+             304, 279, 16462, 21575, 13, 358, 6099, 20956, 53142, 304],
+            None,
+        ),
+        (
+            " school, so probably there's some information there.\n\nWait, Tuvalu's capital is probably called H",
+            [2906, 11, 773, 4658, 1052, 594, 1045, 1995, 1052, 382,
+             14190, 11, 28649, 25510, 594, 6722, 374, 4658, 2598, 472],
+            None,
+        ),
+        (
+            "aka at the bottom of the list. But let me think again. When I was learning about islands",
+            [13334, 518, 279, 5622, 315, 279, 1140, 13, 1988, 1077,
+             752, 1744, 1549, 13, 3197, 358, 572, 6832, 911, 29000],
+            None,
+        ),
+        (
+            ", I remember that some countries have capital cities named after animals or other things. Haka sounds familiar",
+            [11, 358, 6099, 429, 1045, 5837, 614, 6722, 9720, 6941,
+             1283, 9898, 476, 1008, 2513, 13, 472, 13334, 10362, 11285],
+            None,
+        ),
+        (
+            ' from some pictures or maybe the name "Haka" relates to the island. \n\nI should check',
+            [504, 1045, 9185, 476, 7196, 279, 829, 330, 39, 13334,
+             1, 35616, 311, 279, 12922, 13, 4710, 40, 1265, 1779],
+            None,
+        ),
+        (
+            " if there's another name for the capital. Maybe there's another city too. But looking at the",
+            [421, 1052, 594, 2441, 829, 369, 279, 6722, 13, 10696,
+             1052, 594, 2441, 3283, 2238, 13, 1988, 3330, 518, 279],
+            None,
+        ),
+        (
+            " options, the capital is definitely Haka. I don't think there's another one like that.",
+            [2606, 11, 279, 6722, 374, 8491, 472, 13334, 13, 358,
+             1513, 944, 1744, 1052, 594, 2441, 825, 1075, 429, 13],
+            None,
+        ),
+        (
+            " Let me make sure there's no other possible answer in the list that I'm missing. The user",
+            [6771, 752, 1281, 2704, 1052, 594, 902, 1008, 3204, 4226,
+             304, 279, 1140, 429, 358, 2776, 7402, 13, 576, 1196],
+            None,
+        ),
+        (
+            " provided the options, and the correct one is Haka. So I'm confident that's it.\n",
+            [3897, 279, 2606, 11, 323, 279, 4396, 825, 374, 472,
+             13334, 13, 2055, 358, 2776, 16506, 429, 594, 432, 624],
+            None,
+        ),
+        (
+            # </think>, the answer text and finish_reason="stop" share one chunk.
+            "</think>\n\nThe capital of Tuvalu is **Haka**.",
+            [151668, 271, 785, 6722, 315, 28649, 25510, 374, 3070, 39,
+             13334, 334, 13, 151645],
+            "stop",
+        ),
+    )
+]
+# fmt: on
+# Prompt token IDs handed to StreamingPostProcessor as ``prompt_token_ids``.
+# NOTE(review): presumably the tokenized chat-template rendering of the
+# request built by the ``request_for_sampling`` fixture -- confirm.
+# Laid out ten IDs per row (190 IDs in total).
+# fmt: off
+PROMPT_TOKEN_IDS = [
+    151644, 8948, 198, 2, 13852, 271, 2610, 1231, 1618, 825,
+    476, 803, 5746, 311, 7789, 448, 279, 1196, 3239, 382,
+    2610, 525, 3897, 448, 729, 32628, 2878, 366, 15918, 1472,
+    15918, 29, 11874, 9492, 510, 27, 15918, 397, 4913, 1313,
+    788, 330, 1688, 497, 330, 1688, 788, 5212, 606, 788,
+    330, 1836, 1889, 44433, 73084, 497, 330, 4684, 788, 330,
+    5890, 369, 6467, 304, 279, 5787, 51586, 6733, 497, 330,
+    13786, 788, 5212, 1313, 788, 330, 1700, 497, 330, 13193,
+    788, 5212, 1836, 37498, 788, 5212, 1313, 788, 330, 1653,
+    497, 330, 3615, 788, 5212, 1313, 788, 330, 917, 14345,
+    330, 4684, 788, 330, 852, 315, 2711, 3793, 311, 1477,
+    6467, 9207, 2137, 330, 6279, 788, 4383, 1836, 37498, 1341,
+    3417, 532, 522, 15918, 1339, 2461, 1817, 729, 1618, 11,
+    470, 264, 2951, 1633, 448, 729, 829, 323, 5977, 2878,
+    220, 151657, 151658, 11874, 9492, 510, 151657, 198, 4913, 606,
+    788, 366, 1688, 11494, 8066, 330, 16370, 788, 366, 2116,
+    56080, 40432, 31296, 151658, 151645, 198, 151644, 872, 198, 3838,
+    525, 279, 15311, 315, 1045, 7801, 53626, 6467, 30, 5443,
+    279, 5392, 311, 2711, 13, 151645, 198, 151644, 77091, 198,
+]
+# fmt: on
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+@pytest.fixture
+def tokenizer():
+    """Provide a newly constructed MockQwen3Tokenizer."""
+    mock_tokenizer = MockQwen3Tokenizer()
+    return mock_tokenizer
+@pytest.fixture
+def request_for_sampling():
+    """Construct a ChatCompletionRequest matching the test spec.
+    The request holds one user message and a single function tool,
+    ``search_gutenberg_books``, and is built with ``model_construct``.
+    """
+    user_message = {
+        "content": "What are the titles of some James Joyce books? "
+        "Use the tool to search.",
+        "role": "user",
+    }
+    # JSON schema of the tool's only (required) argument.
+    tool_parameters = {
+        "type": "object",
+        "properties": {
+            "search_terms": {
+                "type": "array",
+                "items": {"type": "string"},
+                "description": "List of search terms to find books",
+            }
+        },
+        "required": ["search_terms"],
+    }
+    search_tool = ChatCompletionToolsParam(
+        type="function",
+        function=FunctionDefinition(
+            name="search_gutenberg_books",
+            description="Search for books in the Project Gutenberg library",
+            parameters=tool_parameters,
+        ),
+    )
+    return ChatCompletionRequest.model_construct(
+        messages=[user_message],
+        model="Qwen/Qwen3-0.6B",
+        tools=[search_tool],
+        tool_choice="auto",
+        include_reasoning=True,
+        stream=False,
+        n=1,
+        frequency_penalty=0.0,
+        presence_penalty=0.0,
+        temperature=None,
+        top_p=None,
+        skip_special_tokens=False,
+        chat_template_kwargs=None,
+        reasoning_effort=None,
+        parallel_tool_calls=True,
+    )
+@pytest.fixture
+def sampling_params():
+    """Build the SamplingParams passed to the post-processor under test."""
+    settings = {
+        "n": 1,
+        "presence_penalty": 0.0,
+        "frequency_penalty": 0.0,
+        "repetition_penalty": 1.0,
+        "temperature": 0.6,
+        "top_p": 0.95,
+        "top_k": 20,
+        "min_p": 0.0,
+        "seed": None,
+        "stop": [],
+        "stop_token_ids": [],
+        "include_stop_str_in_output": False,
+        "ignore_eos": False,
+        "max_tokens": 100000,
+        "min_tokens": 0,
+        "logprobs": None,
+        "prompt_logprobs": None,
+        "skip_special_tokens": False,
+        "spaces_between_special_tokens": True,
+        "truncate_prompt_tokens": None,
+    }
+    return SamplingParams(**settings)
+@pytest.fixture
+def processor(tokenizer, request_for_sampling, sampling_params):
+    """Build a StreamingPostProcessor from the other fixtures.
+    It is wired with a Hermes2ProToolParser created from ``tokenizer`` and
+    with Qwen3ReasoningParser as the reasoning parser class.
+    """
+    return StreamingPostProcessor(
+        tokenizer=tokenizer,
+        request_for_sampling=request_for_sampling,
+        sampling_params=sampling_params,
+        prompt_token_ids=PROMPT_TOKEN_IDS,
+        tool_parser=Hermes2ProToolParser(tokenizer),
+        reasoning_parser_class=Qwen3ReasoningParser,
+        chat_template_kwargs={"reasoning_effort": None},
+    )
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+def _collect_results(processor, outputs):
+    """Feed every output to ``processor.process_output``, in order.
+    Returns the list of results that were not ``None``.
+    """
+    return [
+        emitted
+        for chunk in outputs
+        if (emitted := processor.process_output(chunk)) is not None
+    ]
+def _collect_reasoning(results):
+    """Concatenate every non-None ``reasoning_content`` from the results' deltas.
+    Results without a ``delta`` key contribute nothing; order is preserved.
+    """
+    fragments = (r.get("delta", {}).get("reasoning_content") for r in results)
+    return "".join(piece for piece in fragments if piece is not None)
+def _collect_tool_calls(results):
+    """Merge all streamed tool_call deltas into complete tool calls.
+    Deltas are grouped by their ``index``.  The first delta seen for an index
+    seeds the entry; later deltas fill in a still-missing ``id``, ``type`` or
+    function ``name`` and append their ``arguments`` fragment.
+    A ``function`` or ``arguments`` value of ``None`` is treated like a
+    missing key, so a name-only first delta cannot poison the later string
+    concatenation of argument fragments.
+    Returns a list of dicts ordered by index, each with 'id', 'type',
+    'function' (with 'name' and 'arguments').
+    """
+    merged: dict[int, dict] = {}
+    for r in results:
+        for tc in r.get("delta", {}).get("tool_calls") or ():
+            idx = tc["index"]
+            # Normalise explicit None to "nothing here".
+            fn = tc.get("function") or {}
+            fragment = fn.get("arguments") or ""
+            existing = merged.get(idx)
+            if existing is None:
+                merged[idx] = {
+                    "id": tc.get("id"),
+                    "type": tc.get("type"),
+                    "function": {"name": fn.get("name"), "arguments": fragment},
+                }
+                continue
+            # Scalar fields: first truthy value wins.
+            if tc.get("id") and not existing["id"]:
+                existing["id"] = tc["id"]
+            if tc.get("type") and not existing["type"]:
+                existing["type"] = tc["type"]
+            if fn.get("name") and not existing["function"]["name"]:
+                existing["function"]["name"] = fn["name"]
+            # Argument text is streamed in pieces and must be concatenated.
+            existing["function"]["arguments"] += fragment
+    return [merged[k] for k in sorted(merged)]
+# ---------------------------------------------------------------------------
+# Test
+# ---------------------------------------------------------------------------
+@pytest.mark.vllm
+def test_stream_interval_1(processor):
+    """stream_interval=1: one token per chunk. Baseline that works."""
+    results = _collect_results(processor, OUTPUTS_INTERVAL_1)
+    # -- reasoning text must be reproduced exactly --------------------------
+    assert _collect_reasoning(results) == (
+        "\nOkay, the user is asking for the titles of some James Joyce"
+        " books and wants me to use the provided tool.\n"
+    )
+    # -- exactly one, fully populated tool call -----------------------------
+    tool_calls = _collect_tool_calls(results)
+    assert len(tool_calls) == 1
+    call = tool_calls[0]
+    called_function = call["function"]
+    assert called_function["name"] == "search_gutenberg_books"
+    assert json.loads(called_function["arguments"]) == {
+        "search_terms": ["James Joyce", "Project Gutenberg"],
+    }
+    assert call["id"] is not None and call["id"].startswith("chatcmpl-tool-")
+    assert call["type"] == "function"
+    # -- finish reason ------------------------------------------------------
+    assert "stop" in [r["finish_reason"] for r in results if r.get("finish_reason")]
+    deltas = [r.get("delta", {}) for r in results]
+    # -- once content has started, no delta may carry reasoning_content -----
+    content_started = False
+    for delta in deltas:
+        content_started = content_started or delta.get("content") is not None
+        if content_started:
+            assert (
+                delta.get("reasoning_content") is None
+            ), "reasoning_content appeared after regular content started"
+    # -- every non-empty delta is attributed to the assistant ---------------
+    for delta in deltas:
+        if delta:
+            assert delta.get("role") == "assistant"
+@pytest.mark.vllm
+def test_stream_interval_20(tokenizer, request_for_sampling, sampling_params):
+    """stream_interval=20: multiple tokens per chunk.
+    When </think>, <tool_call>, and the start of the JSON body arrive in a
+    single CompletionOutput, the tool parser must still extract the tool call
+    correctly instead of leaking raw tool-call markup into ``content``.
+    """
+    # Fresh processor — the tool parser is stateful.
+    tool_parser = Hermes2ProToolParser(tokenizer)
+    proc = StreamingPostProcessor(
+        tokenizer=tokenizer,
+        request_for_sampling=request_for_sampling,
+        sampling_params=sampling_params,
+        prompt_token_ids=PROMPT_TOKEN_IDS,
+        tool_parser=tool_parser,
+        reasoning_parser_class=Qwen3ReasoningParser,
+        chat_template_kwargs={"reasoning_effort": None},
+    )
+    results = _collect_results(proc, OUTPUTS_INTERVAL_20)
+    reasoning = _collect_reasoning(results)
+    tool_calls = _collect_tool_calls(results)
+    # -- reasoning_content should contain the full think block ---------------
+    assert "the user is asking for the titles of some James Joyce books" in reasoning
+    assert "the user's request.\n" in reasoning
+    # -- tool calls must be parsed, not leaked as content -------------------
+    assert len(tool_calls) == 1, (
+        f"Expected 1 tool call but got {len(tool_calls)}. "
+        "Tool-call markup was likely emitted as plain content instead."
+    )
+    tc = tool_calls[0]
+    assert tc["function"]["name"] == "search_gutenberg_books"
+    assert json.loads(tc["function"]["arguments"]) == {
+        "search_terms": ["James Joyce", "Project Gutenberg"],
+    }
+    assert tc["id"] is not None and tc["id"].startswith("chatcmpl-tool-")
+    assert tc["type"] == "function"
+    # -- no <tool_call> markup should appear in content ---------------------
+    # ``or ""`` (rather than a .get default) also covers deltas that carry an
+    # explicit ``content: None``, which would otherwise make join() raise.
+    all_content = "".join(r.get("delta", {}).get("content") or "" for r in results)
+    assert (
+        "<tool_call>" not in all_content
+    ), f"Raw <tool_call> markup leaked into content: {all_content!r}"
+    assert "</tool_call>" not in all_content
+    # -- finish reason ------------------------------------------------------
+    finish_reasons = [r["finish_reason"] for r in results if r.get("finish_reason")]
+    assert "stop" in finish_reasons
+@pytest.mark.vllm
+def test_stream_interval_20_reasoning_and_tool_finish_same_chunk(
+    tokenizer, request_for_sampling, sampling_params
+):
+    """Regression: final chunk contains reasoning end + tool call + finish.
+    When </think>, <tool_call>... </tool_call>, and finish_reason=stop arrive
+    in one CompletionOutput, the tool call must still be emitted.
+    """
+    tool_parser = Hermes2ProToolParser(tokenizer)
+    proc = StreamingPostProcessor(
+        tokenizer=tokenizer,
+        request_for_sampling=request_for_sampling,
+        sampling_params=sampling_params,
+        prompt_token_ids=PROMPT_TOKEN_IDS,
+        tool_parser=tool_parser,
+        reasoning_parser_class=Qwen3ReasoningParser,
+        chat_template_kwargs={"reasoning_effort": None},
+    )
+    # Fuse the last two chunks of OUTPUTS_INTERVAL_20 so that the end of
+    # reasoning, the whole tool call and the finish all land in one output.
+    penultimate = OUTPUTS_INTERVAL_20[-2]
+    final = OUTPUTS_INTERVAL_20[-1]
+    merged_final = CompletionOutput(
+        index=0,
+        text=(penultimate.text or "") + (final.text or ""),
+        token_ids=list(penultimate.token_ids) + list(final.token_ids),
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason="stop",
+        stop_reason=None,
+    )
+    outputs = [*OUTPUTS_INTERVAL_20[:-2], merged_final]
+    results = _collect_results(proc, outputs)
+    reasoning = _collect_reasoning(results)
+    tool_calls = _collect_tool_calls(results)
+    # -- reasoning must still include the tail of the think block -----------
+    assert "the user's request.\n" in reasoning
+    # -- the tool call must be emitted --------------------------------------
+    assert len(tool_calls) == 1
+    tc = tool_calls[0]
+    assert tc["function"]["name"] == "search_gutenberg_books"
+    assert json.loads(tc["function"]["arguments"]) == {
+        "search_terms": ["James Joyce", "Project Gutenberg"],
+    }
+    # -- no <tool_call> markup should appear in content ---------------------
+    # ``or ""`` (rather than a .get default) also covers deltas that carry an
+    # explicit ``content: None``, which would otherwise make join() raise.
+    all_content = "".join(r.get("delta", {}).get("content") or "" for r in results)
+    assert "<tool_call>" not in all_content
+    assert "</tool_call>" not in all_content
+    # -- finish reason ------------------------------------------------------
+    finish_reasons = [r["finish_reason"] for r in results if r.get("finish_reason")]
+    assert "stop" in finish_reasons
+@pytest.mark.vllm
+def test_stream_terminal_single_chunk(tokenizer, request_for_sampling, sampling_params):
+    """Regression: the whole generation arrives as one CompletionOutput.
+    The closing </think>, the complete <tool_call>…</tool_call> block and
+    finish_reason="stop" are delivered together in a single chunk.  This
+    exercises the terminal single-chunk buffer-drain path in the
+    post-processor.
+    """
+    proc = StreamingPostProcessor(
+        tokenizer=tokenizer,
+        request_for_sampling=request_for_sampling,
+        sampling_params=sampling_params,
+        prompt_token_ids=PROMPT_TOKEN_IDS,
+        tool_parser=Hermes2ProToolParser(tokenizer),
+        reasoning_parser_class=Qwen3ReasoningParser,
+        chat_template_kwargs={"reasoning_effort": None},
+    )
+    # Fold every chunk of OUTPUTS_INTERVAL_20 (text and token IDs) into one
+    # terminal chunk carrying finish_reason="stop".
+    text_parts = []
+    merged_token_ids = []
+    for chunk in OUTPUTS_INTERVAL_20:
+        text_parts.append(chunk.text or "")
+        merged_token_ids.extend(chunk.token_ids)
+    single_chunk = CompletionOutput(
+        index=0,
+        text="".join(text_parts),
+        token_ids=merged_token_ids,
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason="stop",
+        stop_reason=None,
+    )
+    results = _collect_results(proc, [single_chunk])
+    reasoning = _collect_reasoning(results)
+    tool_calls = _collect_tool_calls(results)
+    # Reasoning: a fragment from the start and one from the end of the think
+    # block must both have been emitted as reasoning_content.
+    for fragment in (
+        "the user is asking for the titles of some James Joyce books",
+        "the user's request.\n",
+    ):
+        assert fragment in reasoning
+    # Tool call: exactly one, parsed rather than leaked as content.
+    assert len(tool_calls) == 1, (
+        f"Expected 1 tool call but got {len(tool_calls)}. "
+        "Tool-call markup was likely emitted as plain content instead."
+    )
+    (call,) = tool_calls
+    function = call["function"]
+    assert function["name"] == "search_gutenberg_books"
+    assert json.loads(function["arguments"]) == {
+        "search_terms": ["James Joyce", "Project Gutenberg"],
+    }
+    # Content: no raw <tool_call> markup may appear.
+    content_parts = [r.get("delta", {}).get("content", "") for r in results]
+    all_content = "".join(content_parts)
+    assert (
+        "<tool_call>" not in all_content
+    ), f"Raw <tool_call> markup leaked into content: {all_content!r}"
+    assert "</tool_call>" not in all_content
+    # Finish reason: at least one result must report "stop".
+    assert any(r.get("finish_reason") == "stop" for r in results)
+@pytest.mark.vllm
+def test_no_tool_call(tokenizer, request_for_sampling, sampling_params):
+    """Reasoning followed by plain content, with no tool call.
+    Guards against a regression in which post-reasoning text that shared a
+    chunk with </think> (and finish_reason=stop) was unconditionally
+    buffered for tool-call extraction and, with no tool call present, was
+    never emitted.  The content must still be emitted.
+    """
+    proc = StreamingPostProcessor(
+        tokenizer=tokenizer,
+        request_for_sampling=request_for_sampling,
+        sampling_params=sampling_params,
+        prompt_token_ids=PROMPT_TOKEN_IDS,
+        tool_parser=Hermes2ProToolParser(tokenizer),
+        reasoning_parser_class=Qwen3ReasoningParser,
+        chat_template_kwargs={"reasoning_effort": None},
+    )
+    results = _collect_results(proc, OUTPUTS_NO_TOOL_CALL)
+    reasoning = _collect_reasoning(results)
+    # Reasoning: fragments from the think block must be present.
+    for fragment in (
+        "I need to find out the capital of Tuvalu",
+        "confident that's it.\n",
+    ):
+        assert fragment in reasoning
+    # Content: the actual response must have been emitted.
+    content_parts = [r.get("delta", {}).get("content", "") for r in results]
+    all_content = "".join(content_parts)
+    assert (
+        "The capital of Tuvalu is **Haka**." in all_content
+    ), f"Post-reasoning content was lost. Got content: {all_content!r}"
+    # Tool calls: none expected.
+    tool_calls = _collect_tool_calls(results)
+    assert len(tool_calls) == 0, f"Expected 0 tool calls but got {len(tool_calls)}"
+    # Finish reason: at least one result must report "stop".
+    assert any(r.get("finish_reason") == "stop" for r in results)
--- a/tests/frontend/test_prepost_mistral.py
+++ b/tests/frontend/test_prepost_mistral.py
+#  SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#  SPDX-License-Identifier: Apache-2.0
+"""Unit test for StreamingPostProcessor with Mistral reasoning + tool calling."""
+# mypy seems to be running both sides of the HAS_VLLM if statement
+# mypy: ignore-errors
+import json
+import pytest
+from .common import check_module_available
+HAS_VLLM = check_module_available("vllm")
+if HAS_VLLM:
+    from mistral_common.tokens.tokenizers.base import SpecialTokens
+    from vllm.entrypoints.openai.chat_completion.protocol import (
+        ChatCompletionRequest,
+        ChatCompletionToolsParam,
+    )
+    from vllm.entrypoints.openai.engine.protocol import FunctionDefinition
+    from vllm.outputs import CompletionOutput
+    from vllm.reasoning.mistral_reasoning_parser import MistralReasoningParser
+    from vllm.sampling_params import SamplingParams
+    from vllm.tokenizers.mistral import MistralTokenizer
+    from vllm.tool_parsers.mistral_tool_parser import MistralToolParser
+    from dynamo.frontend.prepost import StreamingPostProcessor
+else:
+    # Fake some types so that `pre-commit` passes.  Only the two names that
+    # are evaluated at import time (as a base class and in the module-level
+    # CompletionOutput(...) test data) need a stand-in.
+    class MistralTokenizer:
+        pass
+    class CompletionOutput:
+        # Accept and ignore any constructor arguments.  ``self`` is spelled
+        # out explicitly instead of being swallowed by ``*args``.
+        def __init__(self, *args, **kwargs):
+            pass
+# Marks applied to every test in this module; the whole module is skipped
+# when vllm is not importable.
+pytestmark = [
+    pytest.mark.vllm,
+    pytest.mark.gpu_0,  # "Hardware"
+    pytest.mark.pre_merge,  # "Lifecycle"
+    pytest.mark.unit,  # "Test Type"
+    pytest.mark.skipif(not HAS_VLLM, reason="requires vllm"),
+]
+# ---------------------------------------------------------------------------
+# Mock MistralTokenizer
+# ---------------------------------------------------------------------------
+# Token IDs from unit_test_4.txt
+# TOOL_CALLS_TOKEN_ID is the id the mock vocabulary maps "[TOOL_CALLS]" to;
+# it matches the first token of the recorded outputs below.
+TOOL_CALLS_TOKEN_ID = 9
+# Matches the last token id (2) of the recorded outputs, which is the chunk
+# carrying finish_reason="stop".
+EOS_TOKEN_ID = 2
+# Matches the first entry of PROMPT_TOKEN_IDS.
+BOS_TOKEN_ID = 1
+# Arbitrary IDs for think tokens (not present in this test's output, but
+# needed to initialise MistralReasoningParser).
+THINK_START_TOKEN_ID = 7
+THINK_END_TOKEN_ID = 8
+class _InnerTokenizer:
+    """Stand-in for the inner ``tokenizer.tokenizer`` accessed by MistralReasoningParser.
+    Only control-token lookup is implemented.
+    """
+    def get_control_token(self, token):
+        """Return the mock id for the begin/end think token, or None for any other token."""
+        think_token_ids = {
+            SpecialTokens.begin_think: THINK_START_TOKEN_ID,
+            SpecialTokens.end_think: THINK_END_TOKEN_ID,
+        }
+        return think_token_ids.get(token)
+class MockMistralTokenizer(MistralTokenizer):
+    """Lightweight MistralTokenizer subclass for testing.
+    Passes ``isinstance(tok, MistralTokenizer)`` without needing model files.
+    The parent initialiser is never called; only the attributes and methods
+    defined below are set up.
+    """
+    def __new__(cls):
+        # Bypass MistralTokenizer.__init__ (needs model artefacts).
+        return object.__new__(cls)
+    def __init__(self):
+        # NOTE(review): these attribute names are presumably the ones the
+        # vLLM Mistral tool/reasoning parsers read from a real tokenizer;
+        # confirm against the vLLM version in use.
+        self.version = 11
+        # The vocabulary contains only the tool-call control token.
+        self._vocab_dict = {"[TOOL_CALLS]": TOOL_CALLS_TOKEN_ID}
+        # Inner tokenizer that resolves the begin/end think control tokens.
+        self.tokenizer = _InnerTokenizer()
+        self._special_tokens = ["[TOOL_CALLS]"]
+    def __bool__(self):
+        # Needed because MistralReasoningParser does ``if not self.model_tokenizer``
+        # which triggers __len__ → vocab_size on the real MistralTokenizer.
+        return True
+    def get_vocab(self):
+        # Return a copy so callers cannot mutate the mock's vocabulary.
+        return dict(self._vocab_dict)
+    @property
+    def all_special_tokens(self):
+        # Exposes the list stored in __init__ (not a copy).
+        return self._special_tokens
+# ---------------------------------------------------------------------------
+# Test data from unit_test_4.txt (stream_interval=1, Mistral format)
+#
+# Output: [TOOL_CALLS]search_gutenberg_books{"search_terms": ["James Joyce"]}
+# No reasoning tokens at all — the model jumps straight to tool calls.
+# ---------------------------------------------------------------------------
+# One CompletionOutput per generated token.  Each row of the table below is
+# (text, token_id, finish_reason); every other field is identical across
+# chunks, so the objects are built in a single comprehension.
+OUTPUTS_INTERVAL_1 = [
+    CompletionOutput(
+        index=0,
+        text=text,
+        token_ids=[token_id],
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=finish_reason,
+        stop_reason=None,
+    )
+    for text, token_id, finish_reason in (
+        ("[TOOL_CALLS]", 9, None),
+        ("search", 8928, None),
+        ("_g", 11898, None),
+        ("uten", 8318, None),
+        ("berg", 6415, None),
+        ("_", 1095, None),
+        ("books", 32493, None),
+        ("", 32, None),
+        ('{"', 19227, None),
+        ("search", 8928, None),
+        ("_", 1095, None),
+        ("terms", 62244, None),
+        ('":', 2811, None),
+        (' ["', 12161, None),
+        ("James", 31872, None),
+        (" Joyce", 58617, None),
+        ('"]', 4964, None),
+        ("}", 1125, None),
+        # Final chunk: empty text, token id 2, and the stop signal.
+        ("", 2, "stop"),
+    )
+]
+# ---------------------------------------------------------------------------
+# Test data from unit_test_5.txt (stream_interval=20, Mistral format)
+#
+# Only 2 chunks: [TOOL_CALLS] alone, then the entire function name + JSON
+# arguments + EOS in a single CompletionOutput with finish_reason=stop.
+# ---------------------------------------------------------------------------
+# Each row is (text, token_ids, finish_reason); the remaining
+# CompletionOutput fields are the same for both chunks.
+OUTPUTS_INTERVAL_20 = [
+    CompletionOutput(
+        index=0,
+        text=text,
+        token_ids=token_ids,
+        routed_experts=None,
+        cumulative_logprob=None,
+        logprobs=None,
+        finish_reason=finish_reason,
+        stop_reason=None,
+    )
+    for text, token_ids, finish_reason in (
+        # Chunk 1: the tool-call control token on its own.
+        ("[TOOL_CALLS]", [9], None),
+        # Chunk 2: function name, JSON arguments and the final token id 2.
+        (
+            'search_gutenberg_books{"search_terms": ["James Joyce books"]}',
+            [
+                8928,
+                11898,
+                8318,
+                6415,
+                1095,
+                32493,
+                32,
+                19227,
+                8928,
+                1095,
+                62244,
+                2811,
+                12161,
+                31872,
+                58617,
+                12796,
+                4964,
+                1125,
+                2,
+            ],
+            "stop",
+        ),
+    )
+]
+# Token ids of the prompt, passed to StreamingPostProcessor as
+# ``prompt_token_ids`` by the fixture and tests below.  The first entry is 1,
+# the same value as BOS_TOKEN_ID.
+# NOTE(review): presumably the rendered chat prompt (tool schema + user
+# question) for the request built in ``request_for_sampling`` — confirm
+# against the capture this data was taken from.
+PROMPT_TOKEN_IDS = [
+    1,
+    5,
+    1091,
+    19227,
+    4994,
+    2811,
+    1429,
+    5165,
+    1897,
+    1429,
+    5165,
+    2811,
+    16753,
+    2391,
+    2811,
+    1429,
+    8928,
+    11898,
+    8318,
+    6415,
+    1095,
+    32493,
+    1897,
+    1429,
+    14653,
+    2811,
+    1429,
+    8483,
+    1394,
+    12796,
+    1294,
+    1278,
+    13217,
+    111317,
+    6415,
+    11329,
+    1897,
+    1429,
+    26204,
+    2811,
+    16753,
+    4994,
+    2811,
+    1429,
+    6371,
+    1897,
+    1429,
+    48649,
+    2811,
+    16753,
+    8928,
+    1095,
+    62244,
+    2811,
+    16753,
+    4994,
+    2811,
+    1429,
+    5477,
+    1897,
+    1429,
+    11089,
+    2811,
+    16753,
+    4994,
+    2811,
+    1429,
+    3607,
+    50666,
+    1429,
+    14653,
+    2811,
+    1429,
+    2525,
+    1307,
+    6123,
+    6856,
+    1317,
+    3081,
+    12796,
+    1034,
+    47579,
+    1429,
+    15760,
+    2811,
+    12161,
+    8928,
+    1095,
+    62244,
+    4964,
+    2821,
+    27028,
+    6,
+    3,
+    7493,
+    1584,
+    1278,
+    26864,
+    1307,
+    2269,
+    7456,
+    58617,
+    12796,
+    1063,
+    13516,
+    1278,
+    9519,
+    1317,
+    6123,
+    1046,
+    4,
+]
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+@pytest.fixture
+def tokenizer():
+    """Return a fresh MockMistralTokenizer (no model files required)."""
+    return MockMistralTokenizer()
+@pytest.fixture
+def request_for_sampling():
+    """Build the ChatCompletionRequest matching the Mistral test spec.
+    The request holds one user message, a single ``search_gutenberg_books``
+    function tool and ``tool_choice="auto"``.
+    """
+    user_message = {
+        "content": "What are the titles of some James Joyce books? "
+        "Use the tool to search.",
+        "role": "user",
+    }
+    search_terms_schema = {
+        "type": "array",
+        "items": {"type": "string"},
+        "description": "List of search terms to find books",
+    }
+    search_tool = ChatCompletionToolsParam(
+        type="function",
+        function=FunctionDefinition(
+            name="search_gutenberg_books",
+            description="Search for books in the Project Gutenberg library",
+            parameters={
+                "type": "object",
+                "properties": {"search_terms": search_terms_schema},
+                "required": ["search_terms"],
+            },
+        ),
+    )
+    return ChatCompletionRequest.model_construct(
+        messages=[user_message],
+        model="mistralai/Ministral-3-3B-Reasoning-2512",
+        tools=[search_tool],
+        tool_choice="auto",
+        include_reasoning=True,
+        stream=False,
+        n=1,
+        frequency_penalty=0.0,
+        presence_penalty=0.0,
+        temperature=None,
+        top_p=None,
+        skip_special_tokens=True,
+        chat_template_kwargs=None,
+        reasoning_effort=None,
+        parallel_tool_calls=True,
+    )
+@pytest.fixture
+def sampling_params():
+    """Return the SamplingParams handed to StreamingPostProcessor in these tests.
+    No stop strings or stop token ids are configured, and
+    ``skip_special_tokens`` is True.
+    """
+    return SamplingParams(
+        n=1,
+        presence_penalty=0.0,
+        frequency_penalty=0.0,
+        repetition_penalty=1.0,
+        temperature=1.0,
+        top_p=1.0,
+        top_k=0,
+        min_p=0.0,
+        seed=None,
+        stop=[],
+        stop_token_ids=[],
+        include_stop_str_in_output=False,
+        ignore_eos=False,
+        max_tokens=100000,
+        min_tokens=0,
+        logprobs=None,
+        prompt_logprobs=None,
+        skip_special_tokens=True,
+        spaces_between_special_tokens=True,
+        truncate_prompt_tokens=None,
+    )
+@pytest.fixture
+def processor(tokenizer, request_for_sampling, sampling_params):
+    """StreamingPostProcessor wired with MistralToolParser and MistralReasoningParser."""
+    return StreamingPostProcessor(
+        tokenizer=tokenizer,
+        request_for_sampling=request_for_sampling,
+        sampling_params=sampling_params,
+        prompt_token_ids=PROMPT_TOKEN_IDS,
+        tool_parser=MistralToolParser(tokenizer),
+        reasoning_parser_class=MistralReasoningParser,
+        chat_template_kwargs={"reasoning_effort": None},
+    )
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+def _collect_results(processor, outputs):
+    """Feed each output to ``processor.process_output`` and keep the non-None results, in order."""
+    return [
+        emitted
+        for chunk in outputs
+        if (emitted := processor.process_output(chunk)) is not None
+    ]
+def _collect_reasoning(results):
+    """Concatenate every non-None ``delta.reasoning_content`` found in results."""
+    fragments = (r.get("delta", {}).get("reasoning_content") for r in results)
+    return "".join(piece for piece in fragments if piece is not None)
+def _collect_tool_calls(results):
+    """Merge all streamed tool_call deltas into complete tool calls.
+    Deltas are grouped by their ``index``.  The first delta seen for an index
+    seeds the entry; later deltas fill in a missing ``id``/``type``/``name``
+    and append their ``arguments`` fragment.  Returns the merged calls as a
+    list of dicts ordered by index.
+    Explicit ``None`` values for ``delta``, ``function`` or ``arguments`` are
+    treated like missing keys, so a first delta that carries
+    ``arguments=None`` does not break the string accumulation of the
+    fragments that follow.
+    """
+    merged: dict[int, dict] = {}
+    for r in results:
+        tc_list = (r.get("delta") or {}).get("tool_calls")
+        if not tc_list:
+            continue
+        for tc in tc_list:
+            idx = tc["index"]
+            fn = tc.get("function") or {}
+            if idx not in merged:
+                merged[idx] = {
+                    "id": tc.get("id"),
+                    "type": tc.get("type"),
+                    "function": {
+                        "name": fn.get("name"),
+                        # Always start from a string so that += below works.
+                        "arguments": fn.get("arguments") or "",
+                    },
+                }
+            else:
+                existing = merged[idx]
+                if tc.get("id") and not existing["id"]:
+                    existing["id"] = tc["id"]
+                if tc.get("type") and not existing["type"]:
+                    existing["type"] = tc["type"]
+                if fn.get("name") and not existing["function"]["name"]:
+                    existing["function"]["name"] = fn["name"]
+                if fn.get("arguments"):
+                    existing["function"]["arguments"] += fn["arguments"]
+    return [merged[k] for k in sorted(merged)]
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+@pytest.mark.vllm
+def test_mistral_tool_call(processor):
+    """Mistral tool call without reasoning, streamed from OUTPUTS_INTERVAL_1.
+    The model output is:
+        [TOOL_CALLS]search_gutenberg_books{"search_terms": ["James Joyce"]}
+    and contains no [THINK]...[/THINK] reasoning block.
+    The call has to come out as a parsed tool call; the tool-call markup
+    must not leak into plain content.
+    """
+    results = _collect_results(processor, OUTPUTS_INTERVAL_1)
+    tool_calls = _collect_tool_calls(results)
+    # Tool call: exactly one, fully parsed, with id and type filled in.
+    assert len(tool_calls) == 1, (
+        f"Expected 1 tool call but got {len(tool_calls)}. "
+        "Tool-call markup was likely emitted as plain content."
+    )
+    (call,) = tool_calls
+    function = call["function"]
+    assert function["name"] == "search_gutenberg_books"
+    assert json.loads(function["arguments"]) == {
+        "search_terms": ["James Joyce"],
+    }
+    assert call["id"] is not None
+    assert call["type"] == "function"
+    # Reasoning: none expected.
+    reasoning = _collect_reasoning(results)
+    assert reasoning == "", f"Unexpected reasoning content: {reasoning!r}"
+    # Content: the [TOOL_CALLS] marker must not show up.
+    content_parts = [r.get("delta", {}).get("content", "") for r in results]
+    all_content = "".join(content_parts)
+    assert (
+        "[TOOL_CALLS]" not in all_content
+    ), f"Raw [TOOL_CALLS] markup leaked into content: {all_content!r}"
+    # Finish reason: at least one result must report "stop".
+    assert any(r.get("finish_reason") == "stop" for r in results)
+@pytest.mark.vllm
+def test_mistral_tool_call_interval_20(
+    tokenizer, request_for_sampling, sampling_params
+):
+    """stream_interval=20: function name + args + EOS in a single chunk.
+    Only 2 CompletionOutput objects are fed in:
+      1. [TOOL_CALLS] alone
+      2. search_gutenberg_books{"search_terms": ["James Joyce books"]}
+         with finish_reason=stop
+    Because the tool call and finish_reason arrive together, the processor
+    must emit both the parsed tool call and the finish_reason.
+    """
+    proc = StreamingPostProcessor(
+        tokenizer=tokenizer,
+        request_for_sampling=request_for_sampling,
+        sampling_params=sampling_params,
+        prompt_token_ids=PROMPT_TOKEN_IDS,
+        tool_parser=MistralToolParser(tokenizer),
+        reasoning_parser_class=MistralReasoningParser,
+        chat_template_kwargs={"reasoning_effort": None},
+    )
+    results = _collect_results(proc, OUTPUTS_INTERVAL_20)
+    tool_calls = _collect_tool_calls(results)
+    # Tool call: exactly one, fully parsed, with id and type filled in.
+    assert len(tool_calls) == 1, (
+        f"Expected 1 tool call but got {len(tool_calls)}. "
+        "Tool-call markup was likely emitted as plain content."
+    )
+    (call,) = tool_calls
+    function = call["function"]
+    assert function["name"] == "search_gutenberg_books"
+    assert json.loads(function["arguments"]) == {
+        "search_terms": ["James Joyce books"],
+    }
+    assert call["id"] is not None
+    assert call["type"] == "function"
+    # Reasoning: none expected.
+    reasoning = _collect_reasoning(results)
+    assert reasoning == "", f"Unexpected reasoning content: {reasoning!r}"
+    # Content: the [TOOL_CALLS] marker must not show up.
+    content_parts = [r.get("delta", {}).get("content", "") for r in results]
+    all_content = "".join(content_parts)
+    assert (
+        "[TOOL_CALLS]" not in all_content
+    ), f"Raw [TOOL_CALLS] markup leaked into content: {all_content!r}"
+    # Finish reason: at least one result must report "stop".
+    assert any(r.get("finish_reason") == "stop" for r in results)