Unverified commit 967572dd, authored by 不做了睡大觉 and committed by GitHub
Browse files

fix(reasoning): Qwen3ReasoningParser returns truncated output as reasoning (#35230)


Signed-off-by: stakeswky <stakeswky@users.noreply.github.com>
Co-authored-by: stakeswky <stakeswky@users.noreply.github.com>
parent 3d66502e
...@@ -9,6 +9,7 @@ from tests.reasoning.utils import ( ...@@ -9,6 +9,7 @@ from tests.reasoning.utils import (
run_reasoning_extraction, run_reasoning_extraction,
run_reasoning_extraction_streaming, run_reasoning_extraction_streaming,
) )
from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
from vllm.reasoning import ReasoningParser, ReasoningParserManager from vllm.reasoning import ReasoningParser, ReasoningParserManager
parser_name = "qwen3" parser_name = "qwen3"
...@@ -58,12 +59,14 @@ WITH_THINK_STREAM = { ...@@ -58,12 +59,14 @@ WITH_THINK_STREAM = {
"content": "This is the rest", "content": "This is the rest",
} }
# --- No think tokens at all (thinking disabled) --- # --- No think tokens at all (thinking enabled, truncated) ---
# With thinking enabled (default), no think tokens means the output was
# truncated before </think> could be generated. All output is reasoning.
WITHOUT_THINK = { WITHOUT_THINK = {
"output": "This is the rest", "output": "This is the rest",
"reasoning": None, "reasoning": "This is the rest",
"content": "This is the rest", "content": None,
} }
# In streaming, the parser cannot distinguish "thinking disabled" from # In streaming, the parser cannot distinguish "thinking disabled" from
# "reasoning in progress" when no think tokens have appeared yet. # "reasoning in progress" when no think tokens have appeared yet.
...@@ -87,10 +90,12 @@ MULTILINE_REASONING = { ...@@ -87,10 +90,12 @@ MULTILINE_REASONING = {
"reasoning": "This is a reasoning\nsection", "reasoning": "This is a reasoning\nsection",
"content": "This is the rest\nThat", "content": "This is the rest\nThat",
} }
# Truncated output: <think> present but no </think> (thinking enabled).
# Everything is reasoning because the output was cut off mid-thought.
ONLY_OPEN_TAG = { ONLY_OPEN_TAG = {
"output": "<think>This is a reasoning section", "output": "<think>This is a reasoning section",
"reasoning": None, "reasoning": "This is a reasoning section",
"content": "This is a reasoning section", "content": None,
} }
ONLY_OPEN_TAG_STREAM = { ONLY_OPEN_TAG_STREAM = {
...@@ -99,6 +104,20 @@ ONLY_OPEN_TAG_STREAM = { ...@@ -99,6 +104,20 @@ ONLY_OPEN_TAG_STREAM = {
"content": None, "content": None,
} }
# Qwen3.5-style truncation: the <think> opener lives in the prompt, so the
# generated text carries neither <think> nor </think>. A missing </think>
# means the output was cut off mid-thought, hence all of it is reasoning.
TRUNCATED_NO_START_TOKEN = {
    "output": "This is a reasoning section",
    "reasoning": "This is a reasoning section",
    "content": None,
}

# The streaming expectation is identical; keep it as an independent dict so
# the two fixtures never alias each other.
TRUNCATED_NO_START_TOKEN_STREAM = TRUNCATED_NO_START_TOKEN.copy()
TEST_CASES = [ TEST_CASES = [
pytest.param( pytest.param(
False, False,
...@@ -170,6 +189,16 @@ TEST_CASES = [ ...@@ -170,6 +189,16 @@ TEST_CASES = [
ONLY_OPEN_TAG_STREAM, ONLY_OPEN_TAG_STREAM,
id="only_open_tag_stream", id="only_open_tag_stream",
), ),
pytest.param(
False,
TRUNCATED_NO_START_TOKEN,
id="truncated_no_start_token",
),
pytest.param(
True,
TRUNCATED_NO_START_TOKEN_STREAM,
id="truncated_no_start_token_stream",
),
] ]
...@@ -249,3 +278,46 @@ def test_reasoning_streaming_multi_token_deltas( ...@@ -249,3 +278,46 @@ def test_reasoning_streaming_multi_token_deltas(
assert reconstructor.reasoning == expected_reasoning assert reconstructor.reasoning == expected_reasoning
assert (reconstructor.other_content or None) == expected_content assert (reconstructor.other_content or None) == expected_content
# --- Tests for enable_thinking=False (thinking explicitly disabled) ---
# Each case is (output, expected_reasoning, expected_content). With thinking
# disabled, the expected reasoning is None and the content equals the raw
# output, so the cases are generated from (id, text) pairs.
THINKING_DISABLED_CASES = [
    pytest.param(text, None, text, id=case_id)
    for case_id, text in (
        ("thinking_disabled_plain_content", "This is plain content"),
        (
            "thinking_disabled_no_think_tokens",
            "Some output without think tokens",
        ),
    )
]
@pytest.mark.parametrize(
    "output, expected_reasoning, expected_content", THINKING_DISABLED_CASES
)
def test_reasoning_thinking_disabled(
    output: str,
    expected_reasoning: str | None,
    expected_content: str | None,
    qwen3_tokenizer,
):
    """Check non-streaming extraction when thinking is explicitly disabled.

    The parser is built with ``chat_template_kwargs={"enable_thinking": False}``.
    For output that contains no ``</think>``, the expected result is no
    reasoning and the whole output returned as content.
    """
    parser_cls = ReasoningParserManager.get_reasoning_parser(parser_name)
    parser: ReasoningParser = parser_cls(
        qwen3_tokenizer,
        chat_template_kwargs={"enable_thinking": False},
    )

    # A minimal request object; only the parser's extraction is under test.
    request = ChatCompletionRequest(messages=[], model="test-model")
    reasoning, content = parser.extract_reasoning(
        model_output=output,
        request=request,
    )

    assert reasoning == expected_reasoning
    assert content == expected_content
...@@ -11,6 +11,7 @@ from vllm.entrypoints.openai.responses.protocol import ( ...@@ -11,6 +11,7 @@ from vllm.entrypoints.openai.responses.protocol import (
ResponsesRequest, ResponsesRequest,
) )
from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
from vllm.tokenizers import TokenizerLike
class Qwen3ReasoningParser(BaseThinkingReasoningParser): class Qwen3ReasoningParser(BaseThinkingReasoningParser):
...@@ -33,6 +34,14 @@ class Qwen3ReasoningParser(BaseThinkingReasoningParser): ...@@ -33,6 +34,14 @@ class Qwen3ReasoningParser(BaseThinkingReasoningParser):
it is stripped before extraction (non-streaming) or skipped (streaming). it is stripped before extraction (non-streaming) or skipped (streaming).
""" """
def __init__(self, tokenizer: TokenizerLike, *args, **kwargs):
    """Initialise the parser and record whether thinking is enabled.

    Args:
        tokenizer: Tokenizer forwarded to the base-class constructor.
        *args: Extra positional arguments forwarded unchanged.
        **kwargs: Extra keyword arguments forwarded unchanged. If
            ``chat_template_kwargs`` is present, its ``enable_thinking``
            entry is read here.
    """
    super().__init__(tokenizer, *args, **kwargs)

    # A missing or falsy ``chat_template_kwargs`` (e.g. None) is treated
    # as an empty mapping.
    template_options = kwargs.get("chat_template_kwargs") or {}

    # Thinking is on by default for Qwen3; output is treated as pure
    # content only when the caller explicitly turns it off.
    self.thinking_enabled = template_options.get("enable_thinking", True)
@property @property
def start_token(self) -> str: def start_token(self) -> str:
"""The token that starts reasoning content.""" """The token that starts reasoning content."""
...@@ -54,8 +63,11 @@ class Qwen3ReasoningParser(BaseThinkingReasoningParser): ...@@ -54,8 +63,11 @@ class Qwen3ReasoningParser(BaseThinkingReasoningParser):
If <think> is present (e.g. from a different template), it is If <think> is present (e.g. from a different template), it is
stripped before extraction. stripped before extraction.
When thinking is disabled (no </think> in output), returns When thinking is explicitly disabled and no </think> appears,
(None, model_output) to indicate all output is content. returns (None, model_output) — all output is content.
Otherwise (thinking enabled, default), a missing </think> means
the output was truncated and everything is reasoning:
returns (model_output, None).
Returns: Returns:
tuple[Optional[str], Optional[str]]: reasoning content and content tuple[Optional[str], Optional[str]]: reasoning content and content
...@@ -68,9 +80,12 @@ class Qwen3ReasoningParser(BaseThinkingReasoningParser): ...@@ -68,9 +80,12 @@ class Qwen3ReasoningParser(BaseThinkingReasoningParser):
) )
if self.end_token not in model_output: if self.end_token not in model_output:
# No end token means thinking is disabled or the model if not self.thinking_enabled:
# did not produce reasoning. Treat everything as content. # Thinking explicitly disabled — treat everything as content.
return None, model_output return None, model_output
# Thinking enabled but no </think>: output was truncated.
# Everything generated so far is reasoning.
return model_output, None
# Extract reasoning content from the model output. # Extract reasoning content from the model output.
reasoning, _, content = model_output.partition(self.end_token) reasoning, _, content = model_output.partition(self.end_token)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment