Fix prompt_is_reasoning_end_arr not defined

1e7e69d9 · renzhc · 7676d0c9 · 1e7e69d9 · 1e7e69d9
Commit 1e7e69d9 authored Mar 18, 2026 by renzhc
Showing with 32 additions and 53 deletions

vllm/entrypoints/openai/chat_completion/serving.py vllm/entrypoints/openai/chat_completion/serving.py +17 -18

vllm/reasoning/qwen3_reasoning_parser.py vllm/reasoning/qwen3_reasoning_parser.py +15 -35

No files found.
--- a/vllm/entrypoints/openai/chat_completion/serving.py
+++ b/vllm/entrypoints/openai/chat_completion/serving.py
@@ -822,6 +822,15 @@ class OpenAIServingChat(OpenAIServing):
                                yield f"data: {data}\n\n"
                    first_iteration = False

+                prompt_is_reasoning_end_arr = [False] * num_choices
+                if self.reasoning_parser and res.prompt_token_ids:
+                    prompt_is_reasoning_end = reasoning_parser.is_reasoning_end(
+                        res.prompt_token_ids
+                    )
+                    prompt_is_reasoning_end_arr = [
+                        prompt_is_reasoning_end
+                    ] * num_choices
+
                for output in res.outputs:
                    i = output.index
                    tool_parser = tool_parsers[i]
@@ -905,10 +914,6 @@ class OpenAIServingChat(OpenAIServing):
                        harmony_tools_streamed[i] |= tools_streamed_flag
                    # handle streaming deltas for tools with named tool_choice
                    elif tool_choice_function_name:
-                        # When encountering think end id in prompt_token_ids
-                        # i.e {"enable_thinking": False},
-                        # check BEFORE calling the parser to avoid a spurious
-                        # reasoning delta on the first chunk.
                        if (
                            reasoning_parser
                            and not reasoning_end_arr[i]
@@ -1124,22 +1129,16 @@ class OpenAIServingChat(OpenAIServing):

                    # when only reasoning
                    elif self.reasoning_parser:
-                        # When encountering think end id in prompt_token_ids
-                        # i.e {"enable_thinking": False},
-                        # set reasoning status to end.
-                        # Route all generated tokens as content directly.
                        if prompt_is_reasoning_end_arr[i]:
                            delta_message = DeltaMessage(content=delta_text)
                        else:
-                            delta_message = (
-                                reasoning_parser.extract_reasoning_streaming(
-                                    previous_text,
-                                    current_text,
-                                    delta_text,
-                                    previous_token_ids,
-                                    current_token_ids,
-                                    output.token_ids,
-                                )
+                            delta_message = reasoning_parser.extract_reasoning_streaming(
+                                previous_text,
+                                current_text,
+                                delta_text,
+                                previous_token_ids,
+                                current_token_ids,
+                                output.token_ids,
                            )
                    # handle streaming just a content delta
                    else:
@@ -1941,4 +1940,4 @@ class OpenAIServingChat(OpenAIServing):
        if request.cache_salt is not None:
            engine_prompt["cache_salt"] = request.cache_salt

-        return messages, [engine_prompt]
\ No newline at end of file
+        return messages, [engine_prompt]
--- a/vllm/reasoning/qwen3_reasoning_parser.py
+++ b/vllm/reasoning/qwen3_reasoning_parser.py
@@ -23,14 +23,13 @@ class Qwen3ReasoningParser(BaseThinkingReasoningParser):
    provides a strict switch to disable reasoning output via the
    'enable_thinking=False' parameter.

-    When thinking is disabled, the template places <think>\\n\\n</think>\\n\\n
-    in the prompt. The serving layer detects this via prompt_is_reasoning_end
-    and routes deltas as content without calling the streaming parser.
-
-    NOTE: Models up to the 2507 release (e.g., Qwen/Qwen3-235B-A22B-Instruct-2507)
-    use an older chat template where the model generates <think> itself.
-    This parser handles both styles: if <think> appears in the generated output
-    it is stripped before extraction (non-streaming) or skipped (streaming).
+    When thinking is disabled, the template places <think>\\n\\n</think>\\n\\n
+    in the prompt. The serving layer detects this via prompt-side reasoning
+    end checks and routes deltas as content without calling the streaming
+    parser.
+
+    NOTE: Older templates may still emit <think> in the generated output.
+    This parser handles both styles.
    """

    @property
@@ -49,13 +48,9 @@ class Qwen3ReasoningParser(BaseThinkingReasoningParser):
        """
        Extract reasoning content from the model output.

-        The <think> token is placed in the prompt by the chat template,
-        so typically only </think> appears in the generated output.
-        If <think> is present (e.g. from a different template), it is
-        stripped before extraction.
-
-        When thinking is disabled (no </think> in output), returns
-        (None, model_output) to indicate all output is content.
+        The <think> token is typically placed in the prompt, so only
+        </think> usually appears in the generated output. If <think> is
+        present in the output, strip it before extraction.

        Returns:
            tuple[Optional[str], Optional[str]]: reasoning content and content
@@ -72,7 +67,6 @@ class Qwen3ReasoningParser(BaseThinkingReasoningParser):
            # did not produce reasoning. Treat everything as content.
            return None, model_output

-        # Extract reasoning content from the model output.
        reasoning, _, content = model_output.partition(self.end_token)

        final_content = content or None
@@ -90,24 +84,16 @@ class Qwen3ReasoningParser(BaseThinkingReasoningParser):
        """
        Extract reasoning content from a streaming delta.

-        Since <think> is placed in the prompt by the chat template, all
-        generated tokens before </think> are reasoning and tokens after
-        are content.
-
-        NOTE: When thinking is disabled, no think tokens appear in the
-        generated output. The serving layer detects this via
-        prompt_is_reasoning_end and routes deltas as content without
-        calling this method.
+        Generated tokens before </think> are reasoning and tokens after it
+        are content. If an older template emits <think> in the generated
+        output, strip it from the current delta first.
        """
-        # Strip <think> from delta if present (old template / edge case
-        # where the model generates <think> itself).
        if self.start_token_id in delta_token_ids:
            start_idx = delta_text.find(self.start_token)
            if start_idx >= 0:
                delta_text = delta_text[start_idx + len(self.start_token) :]

        if self.end_token_id in delta_token_ids:
-            # End token in this delta: split reasoning from content.
            end_index = delta_text.find(self.end_token)
            if end_index >= 0:
                reasoning = delta_text[:end_index]
@@ -118,16 +104,10 @@ class Qwen3ReasoningParser(BaseThinkingReasoningParser):
                    reasoning=reasoning if reasoning else None,
                    content=content if content else None,
                )
-            # end_token_id in IDs but not in text (already stripped)
            return None

-        # No end token in this delta.
        if not delta_text:
-            # Nothing left after stripping start token.
            return None
-        elif self.end_token_id in previous_token_ids:
-            # End token already passed: everything is content now.
+        if self.end_token_id in previous_token_ids:
            return DeltaMessage(content=delta_text)
-        else:
-            # No end token yet: still in reasoning phase.
-            return DeltaMessage(reasoning=delta_text)
\ No newline at end of file
+        return DeltaMessage(reasoning=delta_text)