qwen3_reasoning_parser.py 5.21 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
5
from collections.abc import Sequence

6
7
8
from vllm.entrypoints.openai.chat_completion.protocol import (
    ChatCompletionRequest,
)
9
from vllm.entrypoints.openai.engine.protocol import DeltaMessage
10
11
12
from vllm.entrypoints.openai.responses.protocol import (
    ResponsesRequest,
)
13
from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
14
15


16
class Qwen3ReasoningParser(BaseThinkingReasoningParser):
    """
    Reasoning parser for the Qwen3/Qwen3.5 model family.

    The Qwen3 model family uses <think>...</think> tokens to denote reasoning
    text. Starting with Qwen3.5, the chat template places <think> in the
    prompt so only </think> appears in the generated output. The model
    provides a strict switch to disable reasoning output via the
    'enable_thinking=False' parameter.

    When thinking is disabled, the template places <think>\\n\\n</think>\\n\\n
    in the prompt. The serving layer detects this via prompt_is_reasoning_end
    and routes deltas as content without calling the streaming parser.

    NOTE: Models up to the 2507 release (e.g., Qwen/Qwen3-235B-A22B-Instruct-2507)
    use an older chat template where the model generates <think> itself.
    This parser handles both styles: if <think> appears in the generated
    output before the first </think>, it is stripped before extraction
    (non-streaming) or skipped (streaming). Anything after the first
    </think> is content and is passed through untouched, even if it
    happens to contain the think tags again.
    """

    @property
    def start_token(self) -> str:
        """The token that starts reasoning content."""
        return "<think>"

    @property
    def end_token(self) -> str:
        """The token that ends reasoning content."""
        return "</think>"

    def extract_reasoning(
        self, model_output: str, request: ChatCompletionRequest | ResponsesRequest
    ) -> tuple[str | None, str | None]:
        """
        Extract reasoning content from the complete model output.

        The <think> token is usually placed in the prompt by the chat
        template, so typically only </think> appears in the generated
        output. If <think> is present in the reasoning part (e.g. from a
        different template), everything up to and including it is dropped.

        The output is split at the FIRST </think>. A <think> that only
        occurs after that point belongs to the answer (for example when the
        answer quotes the tag) and must not be used as a strip point;
        otherwise the reasoning and the beginning of the answer are lost.

        Args:
            model_output: Full generated text.
            request: The originating request. Not used by this method.

        Returns:
            A ``(reasoning, content)`` tuple. When no </think> is present
            (thinking disabled or no reasoning produced) the result is
            ``(None, output)`` with any generated <think> prefix stripped.
            Otherwise ``reasoning`` is the text before the first </think>
            (possibly an empty string) and ``content`` is the text after
            it, or ``None`` when that text is empty.
        """
        end_index = model_output.find(self.end_token)

        if end_index < 0:
            # No end token means thinking is disabled or the model did not
            # produce (complete) reasoning. Treat everything as content,
            # dropping text up to and including a generated <think>.
            _, start_found, after_start = model_output.partition(self.start_token)
            return None, (after_start if start_found else model_output)

        reasoning = model_output[:end_index]
        content = model_output[end_index + len(self.end_token) :]

        # Only look for <think> inside the reasoning segment so that a
        # literal <think> in the answer cannot discard the reasoning.
        _, start_found, after_start = reasoning.partition(self.start_token)
        if start_found:
            reasoning = after_start

        return reasoning, content or None

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """
        Extract reasoning content from a streaming delta.

        Since <think> is placed in the prompt by the chat template, all
        generated tokens before the first </think> are reasoning and all
        tokens after it are content. This mirrors ``extract_reasoning``:
        once </think> has been seen, later deltas are forwarded as content
        unchanged, even if they contain the think tokens again.

        ``self.start_token_id`` and ``self.end_token_id`` are not defined in
        this class; presumably they are provided by
        ``BaseThinkingReasoningParser`` (verify against the base class).

        NOTE: When thinking is disabled, no think tokens appear in the
        generated output. The serving layer detects this via
        prompt_is_reasoning_end and routes deltas as content without
        calling this method.

        Args:
            previous_text: Text generated before this delta (unused).
            current_text: Text generated including this delta (unused).
            delta_text: Text of the current delta.
            previous_token_ids: Token ids generated before this delta.
            current_token_ids: Token ids including this delta (unused).
            delta_token_ids: Token ids of the current delta.

        Returns:
            A ``DeltaMessage`` carrying ``reasoning`` and/or ``content``,
            or ``None`` when the delta has nothing to emit.
        """
        if self.end_token_id in previous_token_ids:
            # End token already passed: everything is content now. Do not
            # strip or split on think tokens that show up in the answer.
            return DeltaMessage(content=delta_text) if delta_text else None

        start_in_delta = self.start_token_id in delta_token_ids

        if self.end_token_id in delta_token_ids:
            # End token in this delta: split reasoning from content.
            end_index = delta_text.find(self.end_token)
            if end_index < 0:
                # end_token_id is in the ids but its text is not in the
                # delta; nothing can be split, so emit nothing.
                return None

            reasoning = delta_text[:end_index]
            content = delta_text[end_index + len(self.end_token) :]

            # Old template / edge case where the model generates <think>
            # itself: skip it, but only within the reasoning part.
            if start_in_delta:
                _, start_found, after_start = reasoning.partition(self.start_token)
                if start_found:
                    reasoning = after_start

            if not reasoning and not content:
                return None
            return DeltaMessage(
                reasoning=reasoning or None,
                content=content or None,
            )

        # No end token yet: still in the reasoning phase.
        if start_in_delta:
            _, start_found, after_start = delta_text.partition(self.start_token)
            if start_found:
                delta_text = after_start

        if not delta_text:
            # Nothing left after stripping the start token.
            return None
        return DeltaMessage(reasoning=delta_text)