# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from collections.abc import Sequence

from vllm.entrypoints.openai.chat_completion.protocol import (
    ChatCompletionRequest,
)
from vllm.entrypoints.openai.engine.protocol import DeltaMessage
from vllm.entrypoints.openai.responses.protocol import (
    ResponsesRequest,
)
from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser


class Qwen3ReasoningParser(BaseThinkingReasoningParser):
    """
    Reasoning parser for the Qwen3/Qwen3.5 model family.

    The Qwen3 model family uses <think>...</think> tokens to denote reasoning
    text. Starting with Qwen3.5, the chat template places <think> in the
    prompt so only </think> appears in the generated output. The model
    provides a strict switch to disable reasoning output via the
    'enable_thinking=False' parameter.

    When thinking is disabled, the template places an empty
    <think>...</think> block (containing only blank lines) in the prompt.
    The serving layer detects this via prompt-side reasoning end checks and
    routes deltas as content without calling the streaming parser.

    NOTE: Older templates may still emit <think> in the generated output.
    This parser handles both styles.
    """

    @property
    def start_token(self) -> str:
        """The token that starts reasoning content."""
        # Must be non-empty: str.partition() raises ValueError on an empty
        # separator, and an empty token would match everywhere.
        return "<think>"

    @property
    def end_token(self) -> str:
        """The token that ends reasoning content."""
        return "</think>"

    def extract_reasoning(
        self, model_output: str, request: ChatCompletionRequest | ResponsesRequest
    ) -> tuple[str | None, str | None]:
        """
        Extract reasoning content from the complete model output.

        The <think> token is typically placed in the prompt, so only
        </think> usually appears in the generated output. If <think> is
        present in the output, it (and anything before it) is stripped
        before extraction.

        Args:
            model_output: The full generated text.
            request: The originating request. Not read by this
                implementation; kept for the parser interface.

        Returns:
            tuple[Optional[str], Optional[str]]: reasoning content and
            content. Reasoning is None when no </think> is found; content
            is None when nothing follows </think>.
        """
        # Strip <think> if present in the generated output. When the start
        # token is absent, partition() leaves the whole text in `head`.
        head, start_sep, tail = model_output.partition(self.start_token)
        model_output = tail if start_sep else head

        if self.end_token not in model_output:
            # No end token means thinking is disabled or the model
            # did not produce reasoning. Treat everything as content.
            return None, model_output

        reasoning, _, content = model_output.partition(self.end_token)
        # Normalize an empty trailing segment to None.
        final_content = content or None
        return reasoning, final_content

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """
        Extract reasoning content from a streaming delta.

        Generated tokens before </think> are reasoning and tokens after it
        are content. If an older template emits <think> in the generated
        output, it is stripped from the current delta first.

        Only `delta_text`, `delta_token_ids` and `previous_token_ids` are
        consulted here; the remaining parameters are part of the parser
        interface.

        Returns:
            A DeltaMessage carrying reasoning and/or content for this
            delta, or None when the delta yields no text to emit.
        """
        # NOTE(review): start_token_id / end_token_id are not defined in
        # this class; presumably provided by BaseThinkingReasoningParser.
        if self.start_token_id in delta_token_ids:
            start_idx = delta_text.find(self.start_token)
            if start_idx >= 0:
                # Drop the start token and anything preceding it.
                delta_text = delta_text[start_idx + len(self.start_token) :]

        if self.end_token_id in delta_token_ids:
            # End token in this delta: split reasoning from content.
            end_index = delta_text.find(self.end_token)
            if end_index >= 0:
                reasoning = delta_text[:end_index]
                content = delta_text[end_index + len(self.end_token) :]
                if not reasoning and not content:
                    # The delta held nothing but the end token itself.
                    return None
                return DeltaMessage(
                    reasoning=reasoning if reasoning else None,
                    content=content if content else None,
                )
            # The end token id is present but its text is not in the delta:
            # there is nothing to emit for this step.
            return None

        if not delta_text:
            return None
        if self.end_token_id in previous_token_ids:
            # Reasoning already ended in an earlier delta.
            return DeltaMessage(content=delta_text)
        return DeltaMessage(reasoning=delta_text)