# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 """Unit test for StreamingPostProcessor with Mistral reasoning + tool calling.""" # mypy seems to be running both sides of the HAS_VLLM if statement # mypy: ignore-errors import json import pytest from .common import check_module_available HAS_VLLM = check_module_available("vllm") if HAS_VLLM: from mistral_common.tokens.tokenizers.base import SpecialTokens from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, ChatCompletionToolsParam, ) from vllm.entrypoints.openai.engine.protocol import FunctionDefinition from vllm.outputs import CompletionOutput from vllm.reasoning.mistral_reasoning_parser import MistralReasoningParser from vllm.sampling_params import SamplingParams from vllm.tokenizers.mistral import MistralTokenizer from vllm.tool_parsers.mistral_tool_parser import MistralToolParser from dynamo.frontend.prepost import StreamingPostProcessor else: # Fake some types so that `pre-commit` passes class MistralTokenizer: pass class CompletionOutput: def __init__(*args, **kwargs): pass pytestmark = [ pytest.mark.vllm, pytest.mark.gpu_0, # "Hardware" pytest.mark.pre_merge, # "Lifecyle" pytest.mark.unit, # "Test Type" pytest.mark.skipif(not HAS_VLLM, reason="requires vllm"), ] # --------------------------------------------------------------------------- # Mock MistralTokenizer # --------------------------------------------------------------------------- # Token IDs from unit_test_4.txt TOOL_CALLS_TOKEN_ID = 9 EOS_TOKEN_ID = 2 BOS_TOKEN_ID = 1 # Arbitrary IDs for think tokens (not present in this test's output, but # needed to initialise MistralReasoningParser). THINK_START_TOKEN_ID = 7 THINK_END_TOKEN_ID = 8 class _InnerTokenizer: """Mimics the inner ``tokenizer.tokenizer`` accessed by MistralReasoningParser.""" def get_control_token(self, token): return { SpecialTokens.begin_think: THINK_START_TOKEN_ID, SpecialTokens.end_think: THINK_END_TOKEN_ID, }.get(token) class MockMistralTokenizer(MistralTokenizer): """Lightweight MistralTokenizer subclass for testing. Passes ``isinstance(tok, MistralTokenizer)`` without needing model files. """ def __new__(cls): # Bypass MistralTokenizer.__init__ (needs model artefacts). return object.__new__(cls) def __init__(self): self.version = 11 self._vocab_dict = {"[TOOL_CALLS]": TOOL_CALLS_TOKEN_ID} self.tokenizer = _InnerTokenizer() self._special_tokens = ["[TOOL_CALLS]"] def __bool__(self): # Needed because MistralReasoningParser does ``if not self.model_tokenizer`` # which triggers __len__ → vocab_size on the real MistralTokenizer. return True def get_vocab(self): return dict(self._vocab_dict) @property def all_special_tokens(self): return self._special_tokens # --------------------------------------------------------------------------- # Test data from unit_test_4.txt (stream_interval=1, Mistral format) # # Output: [TOOL_CALLS]search_gutenberg_books{"search_terms": ["James Joyce"]} # No reasoning tokens at all — the model jumps straight to tool calls. # --------------------------------------------------------------------------- OUTPUTS_INTERVAL_1 = [ CompletionOutput( index=0, text="[TOOL_CALLS]", token_ids=[9], routed_experts=None, cumulative_logprob=None, logprobs=None, finish_reason=None, stop_reason=None, ), CompletionOutput( index=0, text="search", token_ids=[8928], routed_experts=None, cumulative_logprob=None, logprobs=None, finish_reason=None, stop_reason=None, ), CompletionOutput( index=0, text="_g", token_ids=[11898], routed_experts=None, cumulative_logprob=None, logprobs=None, finish_reason=None, stop_reason=None, ), CompletionOutput( index=0, text="uten", token_ids=[8318], routed_experts=None, cumulative_logprob=None, logprobs=None, finish_reason=None, stop_reason=None, ), CompletionOutput( index=0, text="berg", token_ids=[6415], routed_experts=None, cumulative_logprob=None, logprobs=None, finish_reason=None, stop_reason=None, ), CompletionOutput( index=0, text="_", token_ids=[1095], routed_experts=None, cumulative_logprob=None, logprobs=None, finish_reason=None, stop_reason=None, ), CompletionOutput( index=0, text="books", token_ids=[32493], routed_experts=None, cumulative_logprob=None, logprobs=None, finish_reason=None, stop_reason=None, ), CompletionOutput( index=0, text="", token_ids=[32], routed_experts=None, cumulative_logprob=None, logprobs=None, finish_reason=None, stop_reason=None, ), CompletionOutput( index=0, text='{"', token_ids=[19227], routed_experts=None, cumulative_logprob=None, logprobs=None, finish_reason=None, stop_reason=None, ), CompletionOutput( index=0, text="search", token_ids=[8928], routed_experts=None, cumulative_logprob=None, logprobs=None, finish_reason=None, stop_reason=None, ), CompletionOutput( index=0, text="_", token_ids=[1095], routed_experts=None, cumulative_logprob=None, logprobs=None, finish_reason=None, stop_reason=None, ), CompletionOutput( index=0, text="terms", token_ids=[62244], routed_experts=None, cumulative_logprob=None, logprobs=None, finish_reason=None, stop_reason=None, ), CompletionOutput( index=0, text='":', token_ids=[2811], routed_experts=None, cumulative_logprob=None, logprobs=None, finish_reason=None, stop_reason=None, ), CompletionOutput( index=0, text=' ["', token_ids=[12161], routed_experts=None, cumulative_logprob=None, logprobs=None, finish_reason=None, stop_reason=None, ), CompletionOutput( index=0, text="James", token_ids=[31872], routed_experts=None, cumulative_logprob=None, logprobs=None, finish_reason=None, stop_reason=None, ), CompletionOutput( index=0, text=" Joyce", token_ids=[58617], routed_experts=None, cumulative_logprob=None, logprobs=None, finish_reason=None, stop_reason=None, ), CompletionOutput( index=0, text='"]', token_ids=[4964], routed_experts=None, cumulative_logprob=None, logprobs=None, finish_reason=None, stop_reason=None, ), CompletionOutput( index=0, text="}", token_ids=[1125], routed_experts=None, cumulative_logprob=None, logprobs=None, finish_reason=None, stop_reason=None, ), CompletionOutput( index=0, text="", token_ids=[2], routed_experts=None, cumulative_logprob=None, logprobs=None, finish_reason="stop", stop_reason=None, ), ] # --------------------------------------------------------------------------- # Test data from unit_test_5.txt (stream_interval=20, Mistral format) # # Only 2 chunks: [TOOL_CALLS] alone, then the entire function name + JSON # arguments + EOS in a single CompletionOutput with finish_reason=stop. # --------------------------------------------------------------------------- OUTPUTS_INTERVAL_20 = [ CompletionOutput( index=0, text="[TOOL_CALLS]", token_ids=[9], routed_experts=None, cumulative_logprob=None, logprobs=None, finish_reason=None, stop_reason=None, ), CompletionOutput( index=0, text='search_gutenberg_books{"search_terms": ["James Joyce books"]}', token_ids=[ 8928, 11898, 8318, 6415, 1095, 32493, 32, 19227, 8928, 1095, 62244, 2811, 12161, 31872, 58617, 12796, 4964, 1125, 2, ], routed_experts=None, cumulative_logprob=None, logprobs=None, finish_reason="stop", stop_reason=None, ), ] PROMPT_TOKEN_IDS = [ 1, 5, 1091, 19227, 4994, 2811, 1429, 5165, 1897, 1429, 5165, 2811, 16753, 2391, 2811, 1429, 8928, 11898, 8318, 6415, 1095, 32493, 1897, 1429, 14653, 2811, 1429, 8483, 1394, 12796, 1294, 1278, 13217, 111317, 6415, 11329, 1897, 1429, 26204, 2811, 16753, 4994, 2811, 1429, 6371, 1897, 1429, 48649, 2811, 16753, 8928, 1095, 62244, 2811, 16753, 4994, 2811, 1429, 5477, 1897, 1429, 11089, 2811, 16753, 4994, 2811, 1429, 3607, 50666, 1429, 14653, 2811, 1429, 2525, 1307, 6123, 6856, 1317, 3081, 12796, 1034, 47579, 1429, 15760, 2811, 12161, 8928, 1095, 62244, 4964, 2821, 27028, 6, 3, 7493, 1584, 1278, 26864, 1307, 2269, 7456, 58617, 12796, 1063, 13516, 1278, 9519, 1317, 6123, 1046, 4, ] # --------------------------------------------------------------------------- # Fixtures # --------------------------------------------------------------------------- @pytest.fixture def tokenizer(): return MockMistralTokenizer() @pytest.fixture def request_for_sampling(): """Construct a ChatCompletionRequest matching the Mistral test spec.""" return ChatCompletionRequest.model_construct( messages=[ { "content": "What are the titles of some James Joyce books? " "Use the tool to search.", "role": "user", } ], model="mistralai/Ministral-3-3B-Reasoning-2512", tools=[ ChatCompletionToolsParam( type="function", function=FunctionDefinition( name="search_gutenberg_books", description="Search for books in the Project Gutenberg library", parameters={ "type": "object", "properties": { "search_terms": { "type": "array", "items": {"type": "string"}, "description": "List of search terms to find books", } }, "required": ["search_terms"], }, ), ) ], tool_choice="auto", include_reasoning=True, stream=False, n=1, frequency_penalty=0.0, presence_penalty=0.0, temperature=None, top_p=None, skip_special_tokens=True, chat_template_kwargs=None, reasoning_effort=None, parallel_tool_calls=True, ) @pytest.fixture def sampling_params(): return SamplingParams( n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=1.0, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[], include_stop_str_in_output=False, ignore_eos=False, max_tokens=100000, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None, ) @pytest.fixture def processor(tokenizer, request_for_sampling, sampling_params): tool_parser = MistralToolParser(tokenizer) return StreamingPostProcessor( tokenizer=tokenizer, request_for_sampling=request_for_sampling, sampling_params=sampling_params, prompt_token_ids=PROMPT_TOKEN_IDS, tool_parser=tool_parser, reasoning_parser_class=MistralReasoningParser, chat_template_kwargs={"reasoning_effort": None}, ) # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- def _collect_results(processor, outputs): """Run all outputs through process_output and collect non-None results.""" results = [] for output in outputs: result = processor.process_output(output) if result is not None: results.append(result) return results def _collect_reasoning(results): """Extract and join all reasoning_content from results.""" parts = [] for r in results: rc = r.get("delta", {}).get("reasoning_content") if rc is not None: parts.append(rc) return "".join(parts) def _collect_tool_calls(results): """Merge all streamed tool_call deltas into complete tool calls.""" merged: dict[int, dict] = {} for r in results: tc_list = r.get("delta", {}).get("tool_calls") if not tc_list: continue for tc in tc_list: idx = tc["index"] if idx not in merged: merged[idx] = { "id": tc.get("id"), "type": tc.get("type"), "function": { "name": tc.get("function", {}).get("name"), "arguments": tc.get("function", {}).get("arguments", ""), }, } else: existing = merged[idx] if tc.get("id") and not existing["id"]: existing["id"] = tc["id"] if tc.get("type") and not existing["type"]: existing["type"] = tc["type"] fn = tc.get("function", {}) if fn.get("name") and not existing["function"]["name"]: existing["function"]["name"] = fn["name"] if fn.get("arguments"): existing["function"]["arguments"] += fn["arguments"] return [merged[k] for k in sorted(merged)] # --------------------------------------------------------------------------- # Tests # --------------------------------------------------------------------------- @pytest.mark.vllm def test_mistral_tool_call(processor): """Mistral tool call with no reasoning. The model output is: [TOOL_CALLS]search_gutenberg_books{"search_terms": ["James Joyce"]} with no [THINK]...[/THINK] reasoning block. The tool parser should extract the tool call correctly, not leak the tool-call markup as plain content. """ results = _collect_results(processor, OUTPUTS_INTERVAL_1) tool_calls = _collect_tool_calls(results) # -- tool calls must be parsed correctly -------------------------------- assert len(tool_calls) == 1, ( f"Expected 1 tool call but got {len(tool_calls)}. " "Tool-call markup was likely emitted as plain content." ) tc = tool_calls[0] assert tc["function"]["name"] == "search_gutenberg_books" assert json.loads(tc["function"]["arguments"]) == { "search_terms": ["James Joyce"], } assert tc["id"] is not None assert tc["type"] == "function" # -- no reasoning content should be present ----------------------------- reasoning = _collect_reasoning(results) assert reasoning == "", f"Unexpected reasoning content: {reasoning!r}" # -- [TOOL_CALLS] markup should not appear in content ------------------- all_content = "".join(r.get("delta", {}).get("content", "") for r in results) assert ( "[TOOL_CALLS]" not in all_content ), f"Raw [TOOL_CALLS] markup leaked into content: {all_content!r}" # -- finish reason ------------------------------------------------------ finish_reasons = [r["finish_reason"] for r in results if r.get("finish_reason")] assert "stop" in finish_reasons @pytest.mark.vllm def test_mistral_tool_call_interval_20( tokenizer, request_for_sampling, sampling_params ): """stream_interval=20: function name + args + EOS in a single chunk. Only 2 CompletionOutput objects: 1. [TOOL_CALLS] alone 2. search_gutenberg_books{"search_terms": ["James Joyce books"]} with finish_reason=stop The tool call and finish_reason arrive together. The processor must still emit the parsed tool call and the finish_reason. """ tool_parser = MistralToolParser(tokenizer) proc = StreamingPostProcessor( tokenizer=tokenizer, request_for_sampling=request_for_sampling, sampling_params=sampling_params, prompt_token_ids=PROMPT_TOKEN_IDS, tool_parser=tool_parser, reasoning_parser_class=MistralReasoningParser, chat_template_kwargs={"reasoning_effort": None}, ) results = _collect_results(proc, OUTPUTS_INTERVAL_20) tool_calls = _collect_tool_calls(results) # -- tool calls must be parsed correctly -------------------------------- assert len(tool_calls) == 1, ( f"Expected 1 tool call but got {len(tool_calls)}. " "Tool-call markup was likely emitted as plain content." ) tc = tool_calls[0] assert tc["function"]["name"] == "search_gutenberg_books" assert json.loads(tc["function"]["arguments"]) == { "search_terms": ["James Joyce books"], } assert tc["id"] is not None assert tc["type"] == "function" # -- no reasoning content should be present ----------------------------- reasoning = _collect_reasoning(results) assert reasoning == "", f"Unexpected reasoning content: {reasoning!r}" # -- [TOOL_CALLS] markup should not appear in content ------------------- all_content = "".join(r.get("delta", {}).get("content", "") for r in results) assert ( "[TOOL_CALLS]" not in all_content ), f"Raw [TOOL_CALLS] markup leaked into content: {all_content!r}" # -- finish reason ------------------------------------------------------ finish_reasons = [r["finish_reason"] for r in results if r.get("finish_reason")] assert "stop" in finish_reasons