Fix Qwen3/Qwen3.5 Reasoning Parser (#34779)

9c122e36 · jujl1 · 57979f97 · 9c122e36 · 9c122e36
Commit 9c122e36 authored Mar 16, 2026 by jujl1
Showing with 110 additions and 35 deletions

vllm/entrypoints/openai/chat_completion/serving.py vllm/entrypoints/openai/chat_completion/serving.py +29 -16

vllm/reasoning/qwen3_reasoning_parser.py vllm/reasoning/qwen3_reasoning_parser.py +81 -19

No files found.
--- a/vllm/entrypoints/openai/chat_completion/serving.py
+++ b/vllm/entrypoints/openai/chat_completion/serving.py
@@ -905,6 +905,17 @@ class OpenAIServingChat(OpenAIServing):
                        harmony_tools_streamed[i] |= tools_streamed_flag
                    # handle streaming deltas for tools with named tool_choice
                    elif tool_choice_function_name:
+                        # When the think end id is already present in
+                        # prompt_token_ids, i.e., {"enable_thinking": False},
+                        # mark reasoning as ended BEFORE calling the parser to
+                        # avoid a spurious reasoning delta on the first chunk.
+                        if (
+                            reasoning_parser
+                            and not reasoning_end_arr[i]
+                            and prompt_is_reasoning_end_arr[i]
+                        ):
+                            reasoning_end_arr[i] = True
+
                        if (
                            self.reasoning_parser
                            and not reasoning_end_arr[i]
@@ -923,18 +934,11 @@ class OpenAIServingChat(OpenAIServing):
                                    output.token_ids,
                                )
                            )
-                            # When encountering think end id in delta_token_ids
-                            # or think end id in prompt_token_ids
-                            # i.e {"enable_thinking": False},
+                            # When encountering think end id in delta_token_ids,
                            # set reasoning status to end.
                            # Only keep 'content', remove 'reasoning'.
                            if reasoning_parser.is_reasoning_end(
                                as_list(output.token_ids)
-                            ) or (
-                                res.prompt_token_ids
-                                and reasoning_parser.is_reasoning_end(
-                                    res.prompt_token_ids
-                                )
                            ):
                                reasoning_end_arr[i] = True
                                if delta_message and delta_message.content:
@@ -1120,7 +1124,15 @@ class OpenAIServingChat(OpenAIServing):

                    # when only reasoning
                    elif self.reasoning_parser:
-                        delta_message = reasoning_parser.extract_reasoning_streaming(
+                        # When the think end id is already present in
+                        # prompt_token_ids, i.e., {"enable_thinking": False},
+                        # reasoning has already ended, so skip the parser and
+                        # route all generated tokens as content directly.
+                        if prompt_is_reasoning_end_arr[i]:
+                            delta_message = DeltaMessage(content=delta_text)
+                        else:
+                            delta_message = (
+                                reasoning_parser.extract_reasoning_streaming(
                                    previous_text,
                                    current_text,
                                    delta_text,
@@ -1128,6 +1140,7 @@ class OpenAIServingChat(OpenAIServing):
                                    current_token_ids,
                                    output.token_ids,
                                )
+                            )
                    # handle streaming just a content delta
                    else:
                        delta_message = DeltaMessage(content=delta_text)

--- a/vllm/reasoning/qwen3_reasoning_parser.py
+++ b/vllm/reasoning/qwen3_reasoning_parser.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

+from collections.abc import Sequence
+
 from vllm.entrypoints.openai.chat_completion.protocol import (
    ChatCompletionRequest,
 )
+from vllm.entrypoints.openai.engine.protocol import DeltaMessage
 from vllm.entrypoints.openai.responses.protocol import (
    ResponsesRequest,
 )
@@ -12,13 +15,22 @@ from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser

 class Qwen3ReasoningParser(BaseThinkingReasoningParser):
    """
-    Reasoning parser for the Qwen3 model.
+    Reasoning parser for the Qwen3/Qwen3.5 model family.
+
+    The Qwen3 model family uses <think>...</think> tokens to denote reasoning
+    text. Starting with Qwen3.5, the chat template places <think> in the
+    prompt so only </think> appears in the generated output. The model
+    provides a strict switch to disable reasoning output via the
+    'enable_thinking=False' parameter.

-    The Qwen3 model uses <think>...</think> tokens to denote reasoning text
-    within its output. The model provides a strict switch to disable reasoning
-    output via the 'enable_thinking=False' parameter. This parser extracts the
-    reasoning content enclosed by <think> and </think> tokens from the model's
-    output.
+    When thinking is disabled, the template places <think>\\n\\n</think>\\n\\n
+    in the prompt. The serving layer detects this via prompt_is_reasoning_end
+    and routes deltas as content without calling the streaming parser.
+
+    NOTE: Models up to the 2507 release (e.g., Qwen/Qwen3-235B-A22B-Instruct-2507)
+    use an older chat template where the model generates <think> itself.
+    This parser handles both styles: if <think> appears in the generated output
+    it is stripped before extraction (non-streaming) or skipped (streaming).
    """

    @property
@@ -37,31 +49,27 @@ class Qwen3ReasoningParser(BaseThinkingReasoningParser):
        """
        Extract reasoning content from the model output.

-        Qwen3 has stricter requirements - it needs both start and end tokens
-        to be present, unlike other models that work with just the end token.
+        The <think> token is placed in the prompt by the chat template,
+        so typically only </think> appears in the generated output.
+        If <think> is present (e.g. from a different template), it is
+        stripped before extraction.

-        For text <think>abc</think>xyz:
-        - 'abc' goes to reasoning
-        - 'xyz' goes to content
+        When thinking is disabled (no </think> in output), returns
+        (None, model_output) to indicate all output is content.

        Returns:
            tuple[Optional[str], Optional[str]]: reasoning content and content
        """

-        # Check if the model output contains both <think> and </think> tokens.
-        if self.start_token not in model_output or self.end_token not in model_output:
-            return None, model_output
-
-        # Check if the <think> is present in the model output, remove it
-        # if it is present.
+        # Strip <think> if present in the generated output.
        model_output_parts = model_output.partition(self.start_token)
        model_output = (
            model_output_parts[2] if model_output_parts[1] else model_output_parts[0]
        )

-        # Check if the model output contains the </think> tokens.
-        # If the end token is not found, return the model output as is.
        if self.end_token not in model_output:
+            # No end token means thinking is disabled or the model
+            # did not produce reasoning. Treat everything as content.
            return None, model_output

        # Extract reasoning content from the model output.
@@ -69,3 +77,57 @@ class Qwen3ReasoningParser(BaseThinkingReasoningParser):

        final_content = content or None
        return reasoning, final_content
+
+    def extract_reasoning_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+    ) -> DeltaMessage | None:
+        """
+        Extract reasoning content from a streaming delta.
+
+        Since <think> is placed in the prompt by the chat template, text
+        before </think> is reasoning and text after it is content; when
+        the delta has no </think>, previous_token_ids decides which applies.
+        Returns None if nothing is left to emit or </think> has an id but no text.
+
+        NOTE: When thinking is disabled no think tokens are generated; the
+        serving layer detects this via prompt_is_reasoning_end and routes
+        deltas as content without calling this method.
+        """
+        # Drop <think> and anything before it in this delta (old template /
+        # edge case where the model generates <think> itself).
+        if self.start_token_id in delta_token_ids:
+            start_idx = delta_text.find(self.start_token)
+            if start_idx >= 0:
+                delta_text = delta_text[start_idx + len(self.start_token) :]
+
+        if self.end_token_id in delta_token_ids:
+            # End token in this delta: text before it is reasoning, rest is content.
+            end_index = delta_text.find(self.end_token)
+            if end_index >= 0:
+                reasoning = delta_text[:end_index]
+                content = delta_text[end_index + len(self.end_token) :]
+                if not reasoning and not content:
+                    return None
+                return DeltaMessage(
+                    reasoning=reasoning if reasoning else None,
+                    content=content if content else None,
+                )
+            # end_token_id in IDs but its text is gone (e.g. cut by the strip above).
+            return None
+
+        # No end token in this delta.
+        if not delta_text:
+            # Empty delta, or nothing left after stripping <think>.
+            return None
+        elif self.end_token_id in previous_token_ids:
+            # End token already passed: everything is content now.
+            return DeltaMessage(content=delta_text)
+        else:
+            # No end token yet: still in reasoning phase.
+            return DeltaMessage(reasoning=delta_text)
\ No newline at end of file