# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Sequence
from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionRequest,
)
from vllm.entrypoints.openai.engine.protocol import DeltaMessage
from vllm.entrypoints.openai.responses.protocol import (
ResponsesRequest,
)
from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
class Qwen3ReasoningParser(BaseThinkingReasoningParser):
    """
    Reasoning parser for the Qwen3/Qwen3.5 model family.

    The Qwen3 model family uses `<think>`...`</think>` tokens to denote
    reasoning text. Starting with Qwen3.5, the chat template places
    `<think>` in the prompt, so only `</think>` appears in the generated
    output. The model provides a strict switch to disable reasoning output
    via the 'enable_thinking=False' parameter.

    When thinking is disabled, the template places an empty think block
    (`<think>` and `</think>` separated only by newlines) in the prompt.
    The serving layer detects this via prompt-side reasoning end checks and
    routes deltas as content without calling the streaming parser.

    NOTE: Older templates may still emit `<think>` in the generated output.
    This parser handles both styles.
    """

    @property
    def start_token(self) -> str:
        """The token that starts reasoning content."""
        return "<think>"

    @property
    def end_token(self) -> str:
        """The token that ends reasoning content."""
        return "</think>"

    def extract_reasoning(
        self, model_output: str, request: ChatCompletionRequest | ResponsesRequest
    ) -> tuple[str | None, str | None]:
        """
        Extract reasoning content from the complete model output.

        The `<think>` token is typically placed in the prompt, so only
        `</think>` usually appears in the generated output. If `<think>`
        is present in the output, it (and anything before it) is dropped
        before extraction.

        Args:
            model_output: The full generated text.
            request: The originating request; not consulted by this method.

        Returns:
            tuple[Optional[str], Optional[str]]: reasoning content and content.
            Reasoning is None when no `</think>` is found (the whole output
            is then returned as content); content is None when nothing
            follows `</think>`.
        """
        # Keep only what follows the first <think>, if the model emitted one.
        _, start_found, after_start = model_output.partition(self.start_token)
        if start_found:
            model_output = after_start

        if self.end_token not in model_output:
            # No end token means thinking is disabled or the model
            # did not produce reasoning. Treat everything as content.
            return None, model_output

        # Split on the first </think>: left is reasoning, right is content.
        reasoning, _, content = model_output.partition(self.end_token)
        return reasoning, content or None

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """
        Extract reasoning content from a streaming delta.

        Generated tokens before `</think>` are reasoning and tokens after it
        are content. If an older template emits `<think>` in the generated
        output, it is stripped from the current delta first.

        Only `delta_text`, `delta_token_ids` and `previous_token_ids` are
        consulted; the remaining parameters are part of the parser interface.
        NOTE(review): `self.start_token_id` / `self.end_token_id` are not
        defined in this class -- presumably provided by the base class.

        Returns:
            A DeltaMessage carrying reasoning and/or content for this delta,
            or None when the delta contributes no visible text.
        """
        # Drop <think> (and anything preceding it) when the model generated
        # the start token itself.
        if self.start_token_id in delta_token_ids:
            start_idx = delta_text.find(self.start_token)
            if start_idx >= 0:
                delta_text = delta_text[start_idx + len(self.start_token) :]

        if self.end_token_id in delta_token_ids:
            # The end token arrives in this delta: split reasoning from content.
            end_index = delta_text.find(self.end_token)
            if end_index >= 0:
                reasoning = delta_text[:end_index]
                content = delta_text[end_index + len(self.end_token) :]
                if not reasoning and not content:
                    # The delta was exactly the end token; nothing to emit.
                    return None
                return DeltaMessage(
                    reasoning=reasoning or None,
                    content=content or None,
                )
            # End token id is present but its text is not in the delta.
            return None

        if not delta_text:
            # Nothing left, e.g. after stripping the start token.
            return None
        if self.end_token_id in previous_token_ids:
            # The end token was already seen: everything is content now.
            return DeltaMessage(content=delta_text)
        # No end token yet: still in the reasoning phase.
        return DeltaMessage(reasoning=delta_text)