fix(reasoning): Qwen3ReasoningParser returns truncated output as reasoning (#35230)

Signed-off-by: stakeswky <stakeswky@users.noreply.github.com>
Co-authored-by: stakeswky <stakeswky@users.noreply.github.com>

fix(reasoning): Qwen3ReasoningParser returns truncated output as reasoning (#35230)
Signed-off-by: stakeswky <stakeswky@users.noreply.github.com>
Co-authored-by: stakeswky <stakeswky@users.noreply.github.com>
967572dd · 不做了睡大觉 · GitHub · 3d66502e · 967572dd · 967572dd
Unverified Commit 967572dd authored Feb 27, 2026 by 不做了睡大觉 Committed by GitHub Feb 26, 2026
Showing 2 changed files with 97 additions and 10 deletions

tests/reasoning/test_qwen3_reasoning_parser.py +77 -5

vllm/reasoning/qwen3_reasoning_parser.py +20 -5

No files found.
--- a/tests/reasoning/test_qwen3_reasoning_parser.py
+++ b/tests/reasoning/test_qwen3_reasoning_parser.py
@@ -9,6 +9,7 @@ from tests.reasoning.utils import (
    run_reasoning_extraction,
    run_reasoning_extraction_streaming,
 )
+from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
 from vllm.reasoning import ReasoningParser, ReasoningParserManager

 parser_name = "qwen3"
@@ -58,12 +59,14 @@ WITH_THINK_STREAM = {
    "content": "This is the rest",
 }

-# --- No think tokens at all (thinking disabled) ---
+# --- No think tokens at all (thinking enabled, truncated) ---

+# With thinking enabled (default), no think tokens means the output was
+# truncated before </think> could be generated. All output is reasoning.
 WITHOUT_THINK = {
    "output": "This is the rest",
-    "reasoning": None,
-    "content": "This is the rest",
+    "reasoning": "This is the rest",
+    "content": None,
 }
 # In streaming, the parser cannot distinguish "thinking disabled" from
 # "reasoning in progress" when no think tokens have appeared yet.
@@ -87,10 +90,12 @@ MULTILINE_REASONING = {
    "reasoning": "This is a reasoning\nsection",
    "content": "This is the rest\nThat",
 }
+# Truncated output: <think> present but no </think> (thinking enabled).
+# Everything is reasoning because the output was cut off mid-thought.
 ONLY_OPEN_TAG = {
    "output": "<think>This is a reasoning section",
-    "reasoning": None,
-    "content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
+    "content": None,
 }

 ONLY_OPEN_TAG_STREAM = {
@@ -99,6 +104,20 @@ ONLY_OPEN_TAG_STREAM = {
    "content": None,
 }

+# Truncated output without <think> prefix (Qwen3.5 style where <think>
+# is in the prompt). No </think> means truncation — all is reasoning.
+TRUNCATED_NO_START_TOKEN = {
+    "output": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
+    "content": None,
+}
+
+TRUNCATED_NO_START_TOKEN_STREAM = {
+    "output": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
+    "content": None,
+}
+
 TEST_CASES = [
    pytest.param(
        False,
@@ -170,6 +189,16 @@ TEST_CASES = [
        ONLY_OPEN_TAG_STREAM,
        id="only_open_tag_stream",
    ),
+    pytest.param(
+        False,
+        TRUNCATED_NO_START_TOKEN,
+        id="truncated_no_start_token",
+    ),
+    pytest.param(
+        True,
+        TRUNCATED_NO_START_TOKEN_STREAM,
+        id="truncated_no_start_token_stream",
+    ),
 ]


@@ -249,3 +278,46 @@ def test_reasoning_streaming_multi_token_deltas(

    assert reconstructor.reasoning == expected_reasoning
    assert (reconstructor.other_content or None) == expected_content
+
+
+# --- Tests for enable_thinking=False (thinking explicitly disabled) ---
+
+
+THINKING_DISABLED_CASES = [
+    pytest.param(
+        "This is plain content",
+        None,
+        "This is plain content",
+        id="thinking_disabled_plain_content",
+    ),
+    pytest.param(
+        "Some output without think tokens",
+        None,
+        "Some output without think tokens",
+        id="thinking_disabled_no_think_tokens",
+    ),
+]
+
+
+@pytest.mark.parametrize(
+    "output, expected_reasoning, expected_content", THINKING_DISABLED_CASES
+)
+def test_reasoning_thinking_disabled(
+    output: str,
+    expected_reasoning: str | None,
+    expected_content: str | None,
+    qwen3_tokenizer,
+):
+    """When enable_thinking=False, output without </think> is all content."""
+    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(parser_name)(
+        qwen3_tokenizer,
+        chat_template_kwargs={"enable_thinking": False},
+    )
+
+    reasoning, content = parser.extract_reasoning(
+        model_output=output,
+        request=ChatCompletionRequest(messages=[], model="test-model"),
+    )
+
+    assert reasoning == expected_reasoning
+    assert content == expected_content
--- a/vllm/reasoning/qwen3_reasoning_parser.py
+++ b/vllm/reasoning/qwen3_reasoning_parser.py
@@ -11,6 +11,7 @@ from vllm.entrypoints.openai.responses.protocol import (
    ResponsesRequest,
 )
 from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
+from vllm.tokenizers import TokenizerLike


 class Qwen3ReasoningParser(BaseThinkingReasoningParser):
@@ -33,6 +34,14 @@ class Qwen3ReasoningParser(BaseThinkingReasoningParser):
    it is stripped before extraction (non-streaming) or skipped (streaming).
    """

+    def __init__(self, tokenizer: TokenizerLike, *args, **kwargs):
+        super().__init__(tokenizer, *args, **kwargs)
+
+        chat_kwargs = kwargs.get("chat_template_kwargs", {}) or {}
+        # Qwen3 defaults to thinking enabled; only treat output as
+        # pure content when the user explicitly disables it.
+        self.thinking_enabled = chat_kwargs.get("enable_thinking", True)
+
    @property
    def start_token(self) -> str:
        """The token that starts reasoning content."""
@@ -54,8 +63,11 @@ class Qwen3ReasoningParser(BaseThinkingReasoningParser):
        If <think> is present (e.g. from a different template), it is
        stripped before extraction.

-        When thinking is disabled (no </think> in output), returns
-        (None, model_output) to indicate all output is content.
+        When thinking is explicitly disabled and no </think> appears,
+        returns (None, model_output) — all output is content.
+        Otherwise (thinking enabled, default), a missing </think> means
+        the output was truncated and everything is reasoning:
+        returns (model_output, None).

        Returns:
            tuple[Optional[str], Optional[str]]: reasoning content and content
@@ -68,9 +80,12 @@ class Qwen3ReasoningParser(BaseThinkingReasoningParser):
        )

        if self.end_token not in model_output:
-            # No end token means thinking is disabled or the model
-            # did not produce reasoning. Treat everything as content.
-            return None, model_output
+            if not self.thinking_enabled:
+                # Thinking explicitly disabled — treat everything as content.
+                return None, model_output
+            # Thinking enabled but no </think>: output was truncated.
+            # Everything generated so far is reasoning.
+            return model_output, None

        # Extract reasoning content from the model output.
        reasoning, _, content = model_output.partition(self.end_token)