[bugfix] fix MiniMaxM2ReasoningParser streaming output not separating reasoning_content. (#29882)

Signed-off-by: Rei <1477174254@qq.com>

[bugfix] fix MiniMaxM2ReasoningParser streaming output not separating reasoning_content. (#29882)
Signed-off-by: Rei <1477174254@qq.com>
6299628d · Rei. · GitHub · fba89069 · 6299628d · 6299628d
Unverified Commit 6299628d authored Dec 11, 2025 by Rei. Committed by GitHub Dec 11, 2025
3 changed files
--- a/tests/reasoning/test_minimax_m2_append_reasoning_parser.py
+++ b/tests/reasoning/test_minimax_m2_append_reasoning_parser.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+from transformers import AutoTokenizer
+
+from tests.reasoning.utils import run_reasoning_extraction
+from vllm.reasoning import ReasoningParser, ReasoningParserManager
+
+parser_name = "minimax_m2_append_think"
+end_token = "</think>"
+
+# MiniMax M2 model path
+REASONING_MODEL_NAME = "MiniMaxAI/MiniMax-M2"
+
+
+@pytest.fixture(scope="module")
+def minimax_m2_tokenizer():
+    return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
+
+
+# =============================================================================
+# MiniMaxM2AppendThinkReasoningParser behavior:
+# - Prepends <think> to the beginning of the output
+# - Does NOT separate reasoning and content
+# - Returns everything as content (with <think> prepended)
+# - reasoning is always None
+#
+# This parser is used when you want to keep the raw output with <think> added
+# =============================================================================
+
+# Case: simple output with end token
+SIMPLE_OUTPUT = {
+    "output": "This is reasoning</think>This is response",
+    "reasoning": None,
+    "content": "<think>This is reasoning</think>This is response",
+    "is_reasoning_end": True,
+}
+
+# Case: output without end token (reasoning in progress)
+NO_END_TOKEN = {
+    "output": "This is reasoning in progress",
+    "reasoning": None,
+    "content": "<think>This is reasoning in progress",
+    "is_reasoning_end": False,
+}
+
+# Case: only end token
+ONLY_END_TOKEN = {
+    "output": "</think>This is response",
+    "reasoning": None,
+    "content": "<think></think>This is response",
+    "is_reasoning_end": True,
+}
+
+# Case: multiple lines
+MULTIPLE_LINES = {
+    "output": "Line 1\nLine 2</think>Response 1\nResponse 2",
+    "reasoning": None,
+    "content": "<think>Line 1\nLine 2</think>Response 1\nResponse 2",
+    "is_reasoning_end": True,
+}
+
+# Case: empty output (non-streaming prepends <think>)
+EMPTY = {
+    "output": "",
+    "reasoning": None,
+    "content": "<think>",
+    "is_reasoning_end": False,
+}
+
+# Case: empty output streaming (no tokens = no output)
+EMPTY_STREAMING = {
+    "output": "",
+    "reasoning": None,
+    "content": None,
+    "is_reasoning_end": False,
+}
+
+# Case: special characters
+SPECIAL_CHARS = {
+    "output": "Let me think... 1+1=2</think>Yes!",
+    "reasoning": None,
+    "content": "<think>Let me think... 1+1=2</think>Yes!",
+    "is_reasoning_end": True,
+}
+
+# Case: code in output
+CODE_OUTPUT = {
+    "output": "```python\nprint('hi')\n```</think>Here's the code.",
+    "reasoning": None,
+    "content": "<think>```python\nprint('hi')\n```</think>Here's the code.",
+    "is_reasoning_end": True,
+}
+
+TEST_CASES = [
+    pytest.param(
+        False,
+        SIMPLE_OUTPUT,
+        id="simple_output",
+    ),
+    pytest.param(
+        True,
+        SIMPLE_OUTPUT,
+        id="simple_output_streaming",
+    ),
+    pytest.param(
+        False,
+        NO_END_TOKEN,
+        id="no_end_token",
+    ),
+    pytest.param(
+        True,
+        NO_END_TOKEN,
+        id="no_end_token_streaming",
+    ),
+    pytest.param(
+        False,
+        ONLY_END_TOKEN,
+        id="only_end_token",
+    ),
+    pytest.param(
+        True,
+        ONLY_END_TOKEN,
+        id="only_end_token_streaming",
+    ),
+    pytest.param(
+        False,
+        MULTIPLE_LINES,
+        id="multiple_lines",
+    ),
+    pytest.param(
+        True,
+        MULTIPLE_LINES,
+        id="multiple_lines_streaming",
+    ),
+    pytest.param(
+        False,
+        EMPTY,
+        id="empty",
+    ),
+    pytest.param(
+        True,
+        EMPTY_STREAMING,
+        id="empty_streaming",
+    ),
+    pytest.param(
+        False,
+        SPECIAL_CHARS,
+        id="special_chars",
+    ),
+    pytest.param(
+        True,
+        SPECIAL_CHARS,
+        id="special_chars_streaming",
+    ),
+    pytest.param(
+        False,
+        CODE_OUTPUT,
+        id="code_output",
+    ),
+    pytest.param(
+        True,
+        CODE_OUTPUT,
+        id="code_output_streaming",
+    ),
+]
+
+
+@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
+def test_reasoning(
+    streaming: bool,
+    param_dict: dict,
+    minimax_m2_tokenizer,
+):
+    output = minimax_m2_tokenizer.tokenize(param_dict["output"])
+    # decode everything to tokens
+    output_tokens: list[str] = [
+        minimax_m2_tokenizer.convert_tokens_to_string([token]) for token in output
+    ]
+    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(parser_name)(
+        minimax_m2_tokenizer
+    )
+
+    reasoning, content = run_reasoning_extraction(
+        parser, output_tokens, streaming=streaming
+    )
+
+    assert reasoning == param_dict["reasoning"]
+    assert content == param_dict["content"]
+
+    # Test is_reasoning_end
+    output_ids = minimax_m2_tokenizer.convert_tokens_to_ids(output)
+    is_reasoning_end = parser.is_reasoning_end(output_ids)
+    assert is_reasoning_end == param_dict["is_reasoning_end"]
--- a/tests/reasoning/test_minimax_m2_reasoning_parser.py
+++ b/tests/reasoning/test_minimax_m2_reasoning_parser.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+from transformers import AutoTokenizer
+
+from tests.reasoning.utils import run_reasoning_extraction
+from vllm.reasoning import ReasoningParser, ReasoningParserManager
+
+parser_name = "minimax_m2"
+end_token = "</think>"
+
+# MiniMax M2 model path
+REASONING_MODEL_NAME = "MiniMaxAI/MiniMax-M2"
+
+
+@pytest.fixture(scope="module")
+def minimax_m2_tokenizer():
+    return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
+
+
+# =============================================================================
+# MiniMax M2 specific behavior:
+# - Model does NOT generate <think> start token
+# - Model only generates </think> end token
+# - All content before </think> is reasoning
+# - All content after </think> is the actual response (content)
+# =============================================================================
+
+# Case: reasoning + end token + content (typical case)
+SIMPLE_REASONING = {
+    "output": "This is a reasoning section</think>This is the rest",
+    "reasoning": "This is a reasoning section",
+    "content": "This is the rest",
+    "is_reasoning_end": True,
+}
+
+# Case: reasoning + end token only (no content after)
+COMPLETE_REASONING = {
+    "output": "This is a reasoning section</think>",
+    "reasoning": "This is a reasoning section",
+    "content": None,
+    "is_reasoning_end": True,
+}
+
+# Case: no end token yet (streaming in progress, all is reasoning)
+NO_END_TOKEN = {
+    "output": "This is reasoning in progress",
+    "reasoning": "This is reasoning in progress",
+    "content": None,
+    "is_reasoning_end": False,
+}
+
+# Case: multiple lines of reasoning
+MULTIPLE_LINES = {
+    "output": "First line\nSecond line</think>Response first line\nResponse second",
+    "reasoning": "First line\nSecond line",
+    "content": "Response first line\nResponse second",
+    "is_reasoning_end": True,
+}
+
+# Case: only end token (empty reasoning, immediate response)
+SHORTEST_REASONING_NO_STREAMING = {
+    "output": "</think>This is the response",
+    "reasoning": "",
+    "content": "This is the response",
+    "is_reasoning_end": True,
+}
+
+# Case: only end token streaming (reasoning is None because it's just the token)
+SHORTEST_REASONING_STREAMING = {
+    "output": "</think>This is the response",
+    "reasoning": None,
+    "content": "This is the response",
+    "is_reasoning_end": True,
+}
+
+# Case: empty output
+EMPTY = {
+    "output": "",
+    "reasoning": "",
+    "content": None,
+    "is_reasoning_end": False,
+}
+
+# Case: empty streaming
+EMPTY_STREAMING = {
+    "output": "",
+    "reasoning": None,
+    "content": None,
+    "is_reasoning_end": False,
+}
+
+# Case: long reasoning with special characters
+SPECIAL_CHARS = {
+    "output": "Let me think... 1+1=2, right?</think>Yes, 1+1=2.",
+    "reasoning": "Let me think... 1+1=2, right?",
+    "content": "Yes, 1+1=2.",
+    "is_reasoning_end": True,
+}
+
+# Case: reasoning with code blocks
+CODE_IN_REASONING = {
+    "output": "```python\nprint('hello')\n```</think>Here is the code.",
+    "reasoning": "```python\nprint('hello')\n```",
+    "content": "Here is the code.",
+    "is_reasoning_end": True,
+}
+
+TEST_CASES = [
+    # Core cases: no start token (MiniMax M2 actual behavior)
+    pytest.param(
+        False,
+        SIMPLE_REASONING,
+        id="simple_reasoning",
+    ),
+    pytest.param(
+        True,
+        SIMPLE_REASONING,
+        id="simple_reasoning_streaming",
+    ),
+    pytest.param(
+        False,
+        COMPLETE_REASONING,
+        id="complete_reasoning",
+    ),
+    pytest.param(
+        True,
+        COMPLETE_REASONING,
+        id="complete_reasoning_streaming",
+    ),
+    pytest.param(
+        False,
+        NO_END_TOKEN,
+        id="no_end_token",
+    ),
+    pytest.param(
+        True,
+        NO_END_TOKEN,
+        id="no_end_token_streaming",
+    ),
+    pytest.param(
+        False,
+        MULTIPLE_LINES,
+        id="multiple_lines",
+    ),
+    pytest.param(
+        True,
+        MULTIPLE_LINES,
+        id="multiple_lines_streaming",
+    ),
+    pytest.param(
+        False,
+        SHORTEST_REASONING_NO_STREAMING,
+        id="shortest_reasoning",
+    ),
+    pytest.param(
+        True,
+        SHORTEST_REASONING_STREAMING,
+        id="shortest_reasoning_streaming",
+    ),
+    pytest.param(
+        False,
+        EMPTY,
+        id="empty",
+    ),
+    pytest.param(
+        True,
+        EMPTY_STREAMING,
+        id="empty_streaming",
+    ),
+    pytest.param(
+        False,
+        SPECIAL_CHARS,
+        id="special_chars",
+    ),
+    pytest.param(
+        True,
+        SPECIAL_CHARS,
+        id="special_chars_streaming",
+    ),
+    pytest.param(
+        False,
+        CODE_IN_REASONING,
+        id="code_in_reasoning",
+    ),
+    pytest.param(
+        True,
+        CODE_IN_REASONING,
+        id="code_in_reasoning_streaming",
+    ),
+]
+
+
+@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
+def test_reasoning(
+    streaming: bool,
+    param_dict: dict,
+    minimax_m2_tokenizer,
+):
+    output = minimax_m2_tokenizer.tokenize(param_dict["output"])
+    # decode everything to tokens
+    output_tokens: list[str] = [
+        minimax_m2_tokenizer.convert_tokens_to_string([token]) for token in output
+    ]
+    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(parser_name)(
+        minimax_m2_tokenizer
+    )
+
+    reasoning, content = run_reasoning_extraction(
+        parser, output_tokens, streaming=streaming
+    )
+
+    assert reasoning == param_dict["reasoning"]
+    assert content == param_dict["content"]
+
+    # Test is_reasoning_end
+    output_ids = minimax_m2_tokenizer.convert_tokens_to_ids(output)
+    is_reasoning_end = parser.is_reasoning_end(output_ids)
+    assert is_reasoning_end == param_dict["is_reasoning_end"]
+
+    # Test extract_content
+    if param_dict["content"] is not None:
+        content = parser.extract_content_ids(output_ids)
+        assert content == minimax_m2_tokenizer.convert_tokens_to_ids(
+            minimax_m2_tokenizer.tokenize(param_dict["content"])
+        )
+    else:
+        content = parser.extract_content_ids(output)
+        assert content == []
--- a/vllm/reasoning/minimax_m2_reasoning_parser.py
+++ b/vllm/reasoning/minimax_m2_reasoning_parser.py
@@ -19,6 +19,10 @@ logger = init_logger(__name__)
 class MiniMaxM2ReasoningParser(BaseThinkingReasoningParser):
    """
    Reasoning parser for MiniMax M2 model.
+
+    MiniMax M2 models don't generate the <think> start token, only the
+    </think> end token. All content before </think> is reasoning; content
+    after it is the actual response.
    """

    @property
@@ -31,6 +35,45 @@ class MiniMaxM2ReasoningParser(BaseThinkingReasoningParser):
        """The token that ends reasoning content."""
        return "</think>"

+    def extract_reasoning_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+    ) -> DeltaMessage | None:
+        """
+        Extract reasoning content from a delta message for streaming.
+
+        MiniMax M2 models don't generate <think> start token, so we assume
+        all content is reasoning until we encounter the </think> end token.
+        """
+        # Skip single end token
+        if len(delta_token_ids) == 1 and delta_token_ids[0] == self.end_token_id:
+            return None
+
+        # Check if end token has already appeared in previous tokens
+        # meaning we're past the reasoning phase
+        if self.end_token_id in previous_token_ids:
+            # We're past the reasoning phase, this is content
+            return DeltaMessage(content=delta_text)
+
+        # Check if end token is in delta tokens
+        if self.end_token_id in delta_token_ids:
+            # End token in delta, split reasoning and content
+            end_index = delta_text.find(self.end_token)
+            reasoning = delta_text[:end_index]
+            content = delta_text[end_index + len(self.end_token) :]
+            return DeltaMessage(
+                reasoning=reasoning if reasoning else None,
+                content=content if content else None,
+            )
+
+        # No end token yet, all content is reasoning
+        return DeltaMessage(reasoning=delta_text)
+

 class MiniMaxM2AppendThinkReasoningParser(ReasoningParser):
    """