# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Unit test for StreamingPostProcessor with Qwen3 reasoning + Hermes tool calling."""
# mypy seems to be running both sides of the HAS_VLLM if statement
# mypy: ignore-errors
import json
import pytest
from .common import check_module_available
# vLLM is an optional dependency: probe for it once and gate both the imports
# below and the module-level ``skipif`` mark on the result.
HAS_VLLM = check_module_available("vllm")

if HAS_VLLM:
    from vllm.entrypoints.openai.chat_completion.protocol import (
        ChatCompletionRequest,
        ChatCompletionToolsParam,
    )
    from vllm.entrypoints.openai.engine.protocol import FunctionDefinition
    from vllm.outputs import CompletionOutput
    from vllm.reasoning.qwen3_reasoning_parser import Qwen3ReasoningParser
    from vllm.sampling_params import SamplingParams
    from vllm.tool_parsers.hermes_tool_parser import Hermes2ProToolParser

    from dynamo.frontend.prepost import StreamingPostProcessor
else:
    # Fake some types so that `pre-commit` passes
    class CompletionOutput:
        """Import-time stand-in used when vLLM is not installed.

        The module-level ``OUTPUTS_*`` tables call this constructor while the
        file is being imported, so it must accept (and ignore) any arguments.
        """

        def __init__(self, *args, **kwargs):
            pass
# Marks applied to every test collected from this module.
pytestmark = [
    pytest.mark.vllm,
    pytest.mark.gpu_0,  # "Hardware"
    pytest.mark.pre_merge,  # "Lifecycle"
    pytest.mark.unit,  # "Test Type"
    # Skip the whole module when the optional vllm package is not importable.
    pytest.mark.skipif(not HAS_VLLM, reason="requires vllm"),
]
# ---------------------------------------------------------------------------
# Mock tokenizer mimicking CachedQwen2TokenizerFast for Qwen3-0.6B
# ---------------------------------------------------------------------------
class MockQwen3Tokenizer:
    """Minimal tokenizer mock with the tokens needed for this test.

    Only the chat/vision control tokens plus the tool-call and reasoning
    markers are known. Every key in the vocabulary must be unique: a repeated
    key in a dict literal silently keeps only the last entry, which would make
    the marker tokens impossible to encode or decode.
    """

    def __init__(self):
        # Token string -> token id.
        self._vocab = {
            "<|endoftext|>": 151643,
            "<|im_start|>": 151644,
            "<|im_end|>": 151645,
            "<|object_ref_start|>": 151646,
            "<|object_ref_end|>": 151647,
            "<|box_start|>": 151648,
            "<|box_end|>": 151649,
            "<|quad_start|>": 151650,
            "<|quad_end|>": 151651,
            "<|vision_start|>": 151652,
            "<|vision_end|>": 151653,
            "<|vision_pad|>": 151654,
            "<|image_pad|>": 151655,
            "<|video_pad|>": 151656,
            "<tool_call>": 151657,
            "</tool_call>": 151658,
            "<tool_response>": 151665,
            "</tool_response>": 151666,
            "<think>": 151667,
            "</think>": 151668,
        }
        # Reverse mapping used by ``decode``.
        self._id_to_token = {v: k for k, v in self._vocab.items()}
        self.all_special_tokens = [
            "<|endoftext|>",
            "<|im_start|>",
            "<|im_end|>",
            "<|object_ref_start|>",
            "<|object_ref_end|>",
            "<|box_start|>",
            "<|box_end|>",
            "<|quad_start|>",
            "<|quad_end|>",
            "<|vision_start|>",
            "<|vision_end|>",
            "<|vision_pad|>",
            "<|image_pad|>",
            "<|video_pad|>",
        ]

    def get_vocab(self):
        """Return a copy of the token -> id mapping."""
        return dict(self._vocab)

    def encode(self, text, add_special_tokens=False):
        """Encode *text*, which must be exactly one known token string.

        ``add_special_tokens`` is accepted for signature compatibility and
        ignored.

        Raises:
            ValueError: if *text* is not a key of the mock vocabulary.
        """
        if text in self._vocab:
            return [self._vocab[text]]
        raise ValueError(f"Cannot encode unknown text: {text!r}")

    def decode(self, token_ids):
        """Concatenate the token strings for *token_ids*.

        Ids that are not in the mock vocabulary contribute an empty string.
        """
        return "".join(self._id_to_token.get(tid, "") for tid in token_ids)
# ---------------------------------------------------------------------------
# Test data: stream_interval=1 (one token per output)
# ---------------------------------------------------------------------------
# One CompletionOutput per generated token. Each row below is
# (delta text, token id, finish_reason); all other CompletionOutput fields are
# identical for every chunk. The marker rows carry their literal text
# (``<think>``, ``</think>``, ``<tool_call>``, ``</tool_call>``) so that the
# text stream agrees with the token ids; the final <|im_end|> chunk (151645)
# has empty text and finish_reason="stop".
OUTPUTS_INTERVAL_1 = [
    CompletionOutput(
        index=0,
        text=text,
        token_ids=[token_id],
        routed_experts=None,
        cumulative_logprob=None,
        logprobs=None,
        finish_reason=finish_reason,
        stop_reason=None,
    )
    for text, token_id, finish_reason in [
        # -- reasoning block ------------------------------------------------
        ("<think>", 151667, None),
        ("\n", 198, None),
        ("Okay", 32313, None),
        (",", 11, None),
        (" the", 279, None),
        (" user", 1196, None),
        (" is", 374, None),
        (" asking", 10161, None),
        (" for", 369, None),
        (" the", 279, None),
        (" titles", 15311, None),
        (" of", 315, None),
        (" some", 1045, None),
        (" James", 7801, None),
        (" Joyce", 53626, None),
        (" books", 6467, None),
        (" and", 323, None),
        (" wants", 6801, None),
        (" me", 752, None),
        (" to", 311, None),
        (" use", 990, None),
        (" the", 279, None),
        (" provided", 3897, None),
        (" tool", 5392, None),
        (".\n", 624, None),
        ("</think>", 151668, None),
        ("\n\n", 271, None),
        # -- tool call ------------------------------------------------------
        ("<tool_call>", 151657, None),
        ("\n", 198, None),
        ('{"', 4913, None),
        ("name", 606, None),
        ('":', 788, None),
        (' "', 330, None),
        ("search", 1836, None),
        ("_g", 1889, None),
        ("utenberg", 44433, None),
        ("_books", 73084, None),
        ('",', 497, None),
        (' "', 330, None),
        ("arguments", 16370, None),
        ('":', 788, None),
        (' {"', 5212, None),
        ("search", 1836, None),
        ("_terms", 37498, None),
        ('":', 788, None),
        (' ["', 4383, None),
        ("James", 28084, None),
        (" Joyce", 53626, None),
        ('",', 497, None),
        (' "', 330, None),
        ("Project", 7849, None),
        (" Gutenberg", 51586, None),
        ('"]', 1341, None),
        ("}}\n", 11248, None),
        ("</tool_call>", 151658, None),
        # -- end of turn ----------------------------------------------------
        ("", 151645, "stop"),
    ]
]
# ---------------------------------------------------------------------------
# Test data: stream_interval=20 (multiple tokens per output)
# The critical difference: </think>, \n\n, <tool_call>, and the start of the
# JSON tool-call body can all arrive in a single CompletionOutput chunk.
# ---------------------------------------------------------------------------
# Up to 20 tokens per CompletionOutput. Each row below is
# (delta text, token ids, finish_reason); all other CompletionOutput fields are
# identical for every chunk. Marker tokens (151667 <think>, 151668 </think>,
# 151657 <tool_call>, 151658 </tool_call>) appear literally in the text so the
# text stream agrees with the token ids.
OUTPUTS_INTERVAL_20 = [
    CompletionOutput(
        index=0,
        text=text,
        token_ids=token_ids,
        routed_experts=None,
        cumulative_logprob=None,
        logprobs=None,
        finish_reason=finish_reason,
        stop_reason=None,
    )
    for text, token_ids, finish_reason in [
        ("<think>", [151667], None),
        (
            "\nOkay, the user is asking for the titles of some James Joyce books and wants me to use",
            [
                198, 32313, 11, 279, 1196, 374, 10161, 369, 279, 15311,
                315, 1045, 7801, 53626, 6467, 323, 6801, 752, 311, 990,
            ],
            None,
        ),
        (
            " the provided tool. Let me check the available functions. There's a search_gutenberg_books function that",
            [
                279, 3897, 5392, 13, 6771, 752, 1779, 279, 2500, 5746,
                13, 2619, 594, 264, 2711, 1889, 44433, 73084, 729, 429,
            ],
            None,
        ),
        (
            ' takes an array of search terms. The user mentioned "James Joyce books," so I need to use',
            [
                4990, 458, 1334, 315, 2711, 3793, 13, 576, 1196, 9733,
                330, 28084, 53626, 6467, 1335, 773, 358, 1184, 311, 990,
            ],
            None,
        ),
        (
            " the search terms related to that. I should make sure to list the relevant terms. Let me think",
            [
                279, 2711, 3793, 5435, 311, 429, 13, 358, 1265, 1281,
                2704, 311, 1140, 279, 9760, 3793, 13, 6771, 752, 1744,
            ],
            None,
        ),
        (
            '... "James Joyce" and "Project Gutenberg" might be the keywords here. So I\'ll structure',
            [
                1112, 330, 28084, 53626, 1, 323, 330, 7849, 51586, 1,
                2578, 387, 279, 20844, 1588, 13, 2055, 358, 3278, 5944,
            ],
            None,
        ),
        (
            ' the search terms as ["James Joyce", "Project Gutenberg"] to find the books. That should cover',
            [
                279, 2711, 3793, 438, 4383, 28084, 53626, 497, 330, 7849,
                51586, 1341, 311, 1477, 279, 6467, 13, 2938, 1265, 3421,
            ],
            None,
        ),
        # The interesting chunk: end of reasoning, </think>, <tool_call> and
        # the start of the JSON body all arrive together.
        (
            ' the user\'s request.\n</think>\n\n<tool_call>\n{"name": "search_gutenberg_books", "arguments',
            [
                279, 1196, 594, 1681, 624, 151668, 271, 151657, 198, 4913,
                606, 788, 330, 1836, 1889, 44433, 73084, 497, 330, 16370,
            ],
            None,
        ),
        # Remainder of the JSON body, </tool_call>, and the <|im_end|> stop
        # token (151645), which contributes no text.
        (
            '": {"search_terms": ["James Joyce", "Project Gutenberg"]}}\n</tool_call>',
            [
                788, 5212, 1836, 37498, 788, 4383, 28084, 53626, 497, 330,
                7849, 51586, 1341, 11248, 151658, 151645,
            ],
            "stop",
        ),
    ]
]
# ---------------------------------------------------------------------------
# Test data: stream_interval=20, reasoning + plain content (no tool calls).
# The critical difference from OUTPUTS_INTERVAL_20: the last chunk contains
# </think>, the response content, AND finish_reason=stop all in one
# CompletionOutput. There is no <tool_call> markup at all.
# ---------------------------------------------------------------------------
# Reasoning followed by a plain-text answer. Each row below is
# (delta text, token ids, finish_reason); all other CompletionOutput fields are
# identical for every chunk. Marker tokens (151667 <think>, 151668 </think>)
# appear literally in the text so the text stream agrees with the token ids.
OUTPUTS_NO_TOOL_CALL = [
    CompletionOutput(
        index=0,
        text=text,
        token_ids=token_ids,
        routed_experts=None,
        cumulative_logprob=None,
        logprobs=None,
        finish_reason=finish_reason,
        stop_reason=None,
    )
    for text, token_ids, finish_reason in [
        ("<think>", [151667], None),
        (
            "\nOkay, I need to find out the capital of Tuvalu. Let me start by recalling what",
            [
                198, 32313, 11, 358, 1184, 311, 1477, 700, 279, 6722,
                315, 28649, 25510, 13, 6771, 752, 1191, 553, 88646, 1128,
            ],
            None,
        ),
        (
            " I know. Tuvalu is a small island nation in the Pacific Ocean. I remember studying geography in",
            [
                358, 1414, 13, 28649, 25510, 374, 264, 2613, 12922, 6995,
                304, 279, 16462, 21575, 13, 358, 6099, 20956, 53142, 304,
            ],
            None,
        ),
        (
            " school, so probably there's some information there.\n\nWait, Tuvalu's capital is probably called H",
            [
                2906, 11, 773, 4658, 1052, 594, 1045, 1995, 1052, 382,
                14190, 11, 28649, 25510, 594, 6722, 374, 4658, 2598, 472,
            ],
            None,
        ),
        (
            "aka at the bottom of the list. But let me think again. When I was learning about islands",
            [
                13334, 518, 279, 5622, 315, 279, 1140, 13, 1988, 1077,
                752, 1744, 1549, 13, 3197, 358, 572, 6832, 911, 29000,
            ],
            None,
        ),
        (
            ", I remember that some countries have capital cities named after animals or other things. Haka sounds familiar",
            [
                11, 358, 6099, 429, 1045, 5837, 614, 6722, 9720, 6941,
                1283, 9898, 476, 1008, 2513, 13, 472, 13334, 10362, 11285,
            ],
            None,
        ),
        (
            ' from some pictures or maybe the name "Haka" relates to the island. \n\nI should check',
            [
                504, 1045, 9185, 476, 7196, 279, 829, 330, 39, 13334,
                1, 35616, 311, 279, 12922, 13, 4710, 40, 1265, 1779,
            ],
            None,
        ),
        (
            " if there's another name for the capital. Maybe there's another city too. But looking at the",
            [
                421, 1052, 594, 2441, 829, 369, 279, 6722, 13, 10696,
                1052, 594, 2441, 3283, 2238, 13, 1988, 3330, 518, 279,
            ],
            None,
        ),
        (
            " options, the capital is definitely Haka. I don't think there's another one like that.",
            [
                2606, 11, 279, 6722, 374, 8491, 472, 13334, 13, 358,
                1513, 944, 1744, 1052, 594, 2441, 825, 1075, 429, 13,
            ],
            None,
        ),
        (
            " Let me make sure there's no other possible answer in the list that I'm missing. The user",
            [
                6771, 752, 1281, 2704, 1052, 594, 902, 1008, 3204, 4226,
                304, 279, 1140, 429, 358, 2776, 7402, 13, 576, 1196,
            ],
            None,
        ),
        (
            " provided the options, and the correct one is Haka. So I'm confident that's it.\n",
            [
                3897, 279, 2606, 11, 323, 279, 4396, 825, 374, 472,
                13334, 13, 2055, 358, 2776, 16506, 429, 594, 432, 624,
            ],
            None,
        ),
        # Final chunk: </think>, the whole answer, and the <|im_end|> stop
        # token (151645, no text) arrive together with finish_reason="stop".
        (
            "</think>\n\nThe capital of Tuvalu is **Haka**.",
            [
                151668, 271, 785, 6722, 315, 28649, 25510, 374, 3070, 39,
                13334, 334, 13, 151645,
            ],
            "stop",
        ),
    ]
]
# Token ids of the prompt handed to every StreamingPostProcessor in this
# module (190 ids, ten per row). 151644 / 151645 are <|im_start|> / <|im_end|>
# in MockQwen3Tokenizer's vocabulary.
# fmt: off
PROMPT_TOKEN_IDS = [
    151644, 8948, 198, 2, 13852, 271, 2610, 1231, 1618, 825,
    476, 803, 5746, 311, 7789, 448, 279, 1196, 3239, 382,
    2610, 525, 3897, 448, 729, 32628, 2878, 366, 15918, 1472,
    15918, 29, 11874, 9492, 510, 27, 15918, 397, 4913, 1313,
    788, 330, 1688, 497, 330, 1688, 788, 5212, 606, 788,
    330, 1836, 1889, 44433, 73084, 497, 330, 4684, 788, 330,
    5890, 369, 6467, 304, 279, 5787, 51586, 6733, 497, 330,
    13786, 788, 5212, 1313, 788, 330, 1700, 497, 330, 13193,
    788, 5212, 1836, 37498, 788, 5212, 1313, 788, 330, 1653,
    497, 330, 3615, 788, 5212, 1313, 788, 330, 917, 14345,
    330, 4684, 788, 330, 852, 315, 2711, 3793, 311, 1477,
    6467, 9207, 2137, 330, 6279, 788, 4383, 1836, 37498, 1341,
    3417, 532, 522, 15918, 1339, 2461, 1817, 729, 1618, 11,
    470, 264, 2951, 1633, 448, 729, 829, 323, 5977, 2878,
    220, 151657, 151658, 11874, 9492, 510, 151657, 198, 4913, 606,
    788, 366, 1688, 11494, 8066, 330, 16370, 788, 366, 2116,
    56080, 40432, 31296, 151658, 151645, 198, 151644, 872, 198, 3838,
    525, 279, 15311, 315, 1045, 7801, 53626, 6467, 30, 5443,
    279, 5392, 311, 2711, 13, 151645, 198, 151644, 77091, 198,
]
# fmt: on
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
@pytest.fixture
def tokenizer():
    """Provide a newly constructed ``MockQwen3Tokenizer``."""
    mock_tokenizer = MockQwen3Tokenizer()
    return mock_tokenizer
@pytest.fixture
def request_for_sampling():
    """Build the chat request shared by the tests.

    The request is a single user message plus one ``search_gutenberg_books``
    function tool with ``tool_choice="auto"`` and reasoning enabled. It is
    built with ``model_construct`` rather than the regular constructor.
    """
    user_message = {
        "content": (
            "What are the titles of some James Joyce books? "
            "Use the tool to search."
        ),
        "role": "user",
    }

    # JSON schema of the tool's single required argument.
    search_parameters = {
        "type": "object",
        "properties": {
            "search_terms": {
                "type": "array",
                "items": {"type": "string"},
                "description": "List of search terms to find books",
            }
        },
        "required": ["search_terms"],
    }
    search_tool = ChatCompletionToolsParam(
        type="function",
        function=FunctionDefinition(
            name="search_gutenberg_books",
            description="Search for books in the Project Gutenberg library",
            parameters=search_parameters,
        ),
    )

    return ChatCompletionRequest.model_construct(
        messages=[user_message],
        model="Qwen/Qwen3-0.6B",
        tools=[search_tool],
        tool_choice="auto",
        include_reasoning=True,
        stream=False,
        n=1,
        frequency_penalty=0.0,
        presence_penalty=0.0,
        temperature=None,
        top_p=None,
        skip_special_tokens=False,
        chat_template_kwargs=None,
        reasoning_effort=None,
        parallel_tool_calls=True,
    )
@pytest.fixture
def sampling_params():
    """Return the ``SamplingParams`` handed to every processor in this module."""
    settings = {
        "n": 1,
        "presence_penalty": 0.0,
        "frequency_penalty": 0.0,
        "repetition_penalty": 1.0,
        "temperature": 0.6,
        "top_p": 0.95,
        "top_k": 20,
        "min_p": 0.0,
        "seed": None,
        "stop": [],
        "stop_token_ids": [],
        "include_stop_str_in_output": False,
        "ignore_eos": False,
        "max_tokens": 100000,
        "min_tokens": 0,
        "logprobs": None,
        "prompt_logprobs": None,
        # Special tokens are kept in the output text.
        "skip_special_tokens": False,
        "spaces_between_special_tokens": True,
        "truncate_prompt_tokens": None,
    }
    return SamplingParams(**settings)
@pytest.fixture
def processor(tokenizer, request_for_sampling, sampling_params):
    """Assemble a ``StreamingPostProcessor`` from the other fixtures.

    The processor gets a new Hermes tool parser built on the mock tokenizer
    and uses ``Qwen3ReasoningParser`` as its reasoning parser class.
    """
    processor_kwargs = {
        "tokenizer": tokenizer,
        "request_for_sampling": request_for_sampling,
        "sampling_params": sampling_params,
        "prompt_token_ids": PROMPT_TOKEN_IDS,
        "tool_parser": Hermes2ProToolParser(tokenizer),
        "reasoning_parser_class": Qwen3ReasoningParser,
        "chat_template_kwargs": {"reasoning_effort": None},
    }
    return StreamingPostProcessor(**processor_kwargs)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _collect_results(processor, outputs):
"""Run all outputs through process_output and collect non-None results."""
results = []
for output in outputs:
result = processor.process_output(output)
if result is not None:
results.append(result)
return results
def _collect_reasoning(results):
"""Extract and join all reasoning_content from results."""
parts = []
for r in results:
rc = r.get("delta", {}).get("reasoning_content")
if rc is not None:
parts.append(rc)
return "".join(parts)
def _collect_tool_calls(results):
"""Merge all streamed tool_call deltas into complete tool calls.
Returns a list of dicts, each with 'id', 'type', 'function' (with 'name'
and 'arguments').
"""
merged: dict[int, dict] = {}
for r in results:
tc_list = r.get("delta", {}).get("tool_calls")
if not tc_list:
continue
for tc in tc_list:
idx = tc["index"]
if idx not in merged:
merged[idx] = {
"id": tc.get("id"),
"type": tc.get("type"),
"function": {
"name": tc.get("function", {}).get("name"),
"arguments": tc.get("function", {}).get("arguments", ""),
},
}
else:
existing = merged[idx]
if tc.get("id") and not existing["id"]:
existing["id"] = tc["id"]
if tc.get("type") and not existing["type"]:
existing["type"] = tc["type"]
fn = tc.get("function", {})
if fn.get("name") and not existing["function"]["name"]:
existing["function"]["name"] = fn["name"]
if fn.get("arguments"):
existing["function"]["arguments"] += fn["arguments"]
return [merged[k] for k in sorted(merged)]
# ---------------------------------------------------------------------------
# Test
# ---------------------------------------------------------------------------
@pytest.mark.vllm
def test_stream_interval_1(processor):
    """stream_interval=1: one token per chunk. Baseline that works."""
    chunks = _collect_results(processor, OUTPUTS_INTERVAL_1)
    reasoning_text = _collect_reasoning(chunks)
    calls = _collect_tool_calls(chunks)

    # The reasoning stream is exactly the text between the think markers.
    assert reasoning_text == (
        "\nOkay, the user is asking for the titles of some James Joyce"
        " books and wants me to use the provided tool.\n"
    )

    # Exactly one fully-assembled tool call.
    assert len(calls) == 1
    (call,) = calls
    function = call["function"]
    assert function["name"] == "search_gutenberg_books"
    assert json.loads(function["arguments"]) == {
        "search_terms": ["James Joyce", "Project Gutenberg"],
    }
    assert call["id"] is not None and call["id"].startswith("chatcmpl-tool-")
    assert call["type"] == "function"

    # Some chunk must report the "stop" finish reason.
    assert any(chunk.get("finish_reason") == "stop" for chunk in chunks)

    # Once regular content has started, no further reasoning may be emitted.
    content_started = False
    for chunk in chunks:
        delta = chunk.get("delta", {})
        content_started = content_started or delta.get("content") is not None
        if content_started:
            assert (
                delta.get("reasoning_content") is None
            ), "reasoning_content appeared after regular content started"

    # Every non-empty delta carries the assistant role.
    non_empty_deltas = [d for d in (c.get("delta", {}) for c in chunks) if d]
    for delta in non_empty_deltas:
        assert delta.get("role") == "assistant"
@pytest.mark.vllm
def test_stream_interval_20(processor):
    """stream_interval=20: multiple tokens per chunk.

    When ``</think>``, ``<tool_call>``, and the start of the JSON body arrive
    in a single CompletionOutput, the tool parser must still extract the tool
    call correctly instead of leaking raw tool-call markup into ``content``.
    """
    # The tool parser is stateful; the ``processor`` fixture has no explicit
    # scope, so each test requesting it receives its own processor and parser.
    results = _collect_results(processor, OUTPUTS_INTERVAL_20)
    reasoning = _collect_reasoning(results)
    tool_calls = _collect_tool_calls(results)

    # -- reasoning_content should contain the full think block ---------------
    assert "the user is asking for the titles of some James Joyce books" in reasoning
    assert "the user's request.\n" in reasoning

    # -- tool calls must be parsed, not leaked as content -------------------
    assert len(tool_calls) == 1, (
        f"Expected 1 tool call but got {len(tool_calls)}. "
        "Tool-call markup was likely emitted as plain content instead."
    )
    tc = tool_calls[0]
    assert tc["function"]["name"] == "search_gutenberg_books"
    assert json.loads(tc["function"]["arguments"]) == {
        "search_terms": ["James Joyce", "Project Gutenberg"],
    }
    assert tc["id"] is not None and tc["id"].startswith("chatcmpl-tool-")
    assert tc["type"] == "function"

    # -- no <tool_call> markup should appear in content ---------------------
    # ``or ""`` tolerates deltas whose content is present but None.
    all_content = "".join(
        (r.get("delta") or {}).get("content") or "" for r in results
    )
    assert (
        "<tool_call>" not in all_content
    ), f"Raw <tool_call> markup leaked into content: {all_content!r}"
    assert "</tool_call>" not in all_content

    # -- finish reason ------------------------------------------------------
    finish_reasons = [r["finish_reason"] for r in results if r.get("finish_reason")]
    assert "stop" in finish_reasons
@pytest.mark.vllm
def test_stream_interval_20_reasoning_and_tool_finish_same_chunk(processor):
    """Regression: final chunk contains reasoning end + tool call + finish.

    When ``</think>``, ``<tool_call>...</tool_call>``, and finish_reason=stop
    arrive in one CompletionOutput, the tool call must still be emitted.
    """
    # Fuse the last two chunks of OUTPUTS_INTERVAL_20 into one terminal chunk.
    penultimate = OUTPUTS_INTERVAL_20[-2]
    final = OUTPUTS_INTERVAL_20[-1]
    merged_final = CompletionOutput(
        index=0,
        text=(penultimate.text or "") + (final.text or ""),
        token_ids=[*penultimate.token_ids, *final.token_ids],
        routed_experts=None,
        cumulative_logprob=None,
        logprobs=None,
        finish_reason="stop",
        stop_reason=None,
    )
    outputs = [*OUTPUTS_INTERVAL_20[:-2], merged_final]

    results = _collect_results(processor, outputs)
    reasoning = _collect_reasoning(results)
    tool_calls = _collect_tool_calls(results)

    assert "the user's request.\n" in reasoning

    assert len(tool_calls) == 1
    tc = tool_calls[0]
    assert tc["function"]["name"] == "search_gutenberg_books"
    assert json.loads(tc["function"]["arguments"]) == {
        "search_terms": ["James Joyce", "Project Gutenberg"],
    }

    # No raw tool-call markup may leak into content. ``or ""`` tolerates
    # deltas whose content is present but None.
    all_content = "".join(
        (r.get("delta") or {}).get("content") or "" for r in results
    )
    assert "<tool_call>" not in all_content
    assert "</tool_call>" not in all_content

    finish_reasons = [r["finish_reason"] for r in results if r.get("finish_reason")]
    assert "stop" in finish_reasons
@pytest.mark.vllm
def test_stream_terminal_single_chunk(processor):
    """Regression: everything arrives in a single CompletionOutput.

    The closing ``</think>``, the full ``<tool_call>…</tool_call>``, and
    finish_reason="stop" are all packed into one chunk. This exercises
    the terminal single-chunk buffer-drain path in the post-processor.
    """
    # Build a single chunk that contains *all* text and token IDs from the
    # OUTPUTS_INTERVAL_20 sequence, with finish_reason="stop".
    all_text = "".join(o.text or "" for o in OUTPUTS_INTERVAL_20)
    all_token_ids = [tid for o in OUTPUTS_INTERVAL_20 for tid in o.token_ids]
    single_chunk = CompletionOutput(
        index=0,
        text=all_text,
        token_ids=all_token_ids,
        routed_experts=None,
        cumulative_logprob=None,
        logprobs=None,
        finish_reason="stop",
        stop_reason=None,
    )

    results = _collect_results(processor, [single_chunk])
    reasoning = _collect_reasoning(results)
    tool_calls = _collect_tool_calls(results)

    # -- reasoning_content should contain the full think block ---------------
    assert "the user is asking for the titles of some James Joyce books" in reasoning
    assert "the user's request.\n" in reasoning

    # -- tool calls must be parsed, not leaked as content -------------------
    assert len(tool_calls) == 1, (
        f"Expected 1 tool call but got {len(tool_calls)}. "
        "Tool-call markup was likely emitted as plain content instead."
    )
    tc = tool_calls[0]
    assert tc["function"]["name"] == "search_gutenberg_books"
    assert json.loads(tc["function"]["arguments"]) == {
        "search_terms": ["James Joyce", "Project Gutenberg"],
    }

    # -- no <tool_call> markup should appear in content ---------------------
    # ``or ""`` tolerates deltas whose content is present but None.
    all_content = "".join(
        (r.get("delta") or {}).get("content") or "" for r in results
    )
    assert (
        "<tool_call>" not in all_content
    ), f"Raw <tool_call> markup leaked into content: {all_content!r}"
    assert "</tool_call>" not in all_content

    # -- finish reason ------------------------------------------------------
    finish_reasons = [r["finish_reason"] for r in results if r.get("finish_reason")]
    assert "stop" in finish_reasons
@pytest.mark.vllm
def test_no_tool_call(processor):
    """Reasoning + plain content, no tool calls.

    When ``</think>`` and the actual response content arrive in the same chunk
    (with finish_reason=stop), the content must still be emitted. This
    reproduces a regression where the post-reasoning content was
    unconditionally buffered for tool-call extraction and never emitted
    when no tool call was present.
    """
    results = _collect_results(processor, OUTPUTS_NO_TOOL_CALL)
    reasoning = _collect_reasoning(results)

    # -- reasoning should contain the think block ----------------------------
    assert "I need to find out the capital of Tuvalu" in reasoning
    assert "confident that's it.\n" in reasoning

    # -- content must include the actual response ----------------------------
    # ``or ""`` tolerates deltas whose content is present but None.
    all_content = "".join(
        (r.get("delta") or {}).get("content") or "" for r in results
    )
    assert (
        "The capital of Tuvalu is **Haka**." in all_content
    ), f"Post-reasoning content was lost. Got content: {all_content!r}"

    # -- no tool calls should be present ------------------------------------
    tool_calls = _collect_tool_calls(results)
    assert len(tool_calls) == 0, f"Expected 0 tool calls but got {len(tool_calls)}"

    # -- finish reason ------------------------------------------------------
    finish_reasons = [r["finish_reason"] for r in results if r.get("finish_reason")]
    assert "stop" in finish_reasons