qwen3_reasoning_parser.py 4.12 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
5
from collections.abc import Sequence

6
7
8
from vllm.entrypoints.openai.chat_completion.protocol import (
    ChatCompletionRequest,
)
9
from vllm.entrypoints.openai.engine.protocol import DeltaMessage
10
11
12
from vllm.entrypoints.openai.responses.protocol import (
    ResponsesRequest,
)
13
from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
14
15


16
class Qwen3ReasoningParser(BaseThinkingReasoningParser):
    """
    Reasoning parser for the Qwen3/Qwen3.5 model family.

    The Qwen3 model family uses <think>...</think> tokens to denote reasoning
    text. Starting with Qwen3.5, the chat template places <think> in the
    prompt so only </think> appears in the generated output. The model
    provides a strict switch to disable reasoning output via the
    'enable_thinking=False' parameter.

    When thinking is disabled, the template places <think>\n\n</think>\n\n
    in the prompt. The serving layer detects this via prompt-side reasoning
    end checks and routes deltas as content without calling the streaming
    parser.

    NOTE: Older templates may still emit <think> in the generated output.
    This parser handles both styles.
    """

    @property
    def start_token(self) -> str:
        """The token that starts reasoning content."""
        return "<think>"

    @property
    def end_token(self) -> str:
        """The token that ends reasoning content."""
        return "</think>"

    def extract_reasoning(
        self, model_output: str, request: ChatCompletionRequest | ResponsesRequest
    ) -> tuple[str | None, str | None]:
        """
        Extract reasoning content from the complete model output.

        The <think> token is typically placed in the prompt, so only
        </think> usually appears in the generated output. The output is
        split at the first </think>; a <think> emitted before that point
        (older templates) is stripped from the reasoning, together with
        anything preceding it.

        A <think> that only appears after the first </think> is part of the
        final content and is left untouched, so it can no longer cause the
        reasoning and the leading content to be discarded.

        Args:
            model_output: The full generated text.
            request: The originating request. Not used by this method.

        Returns:
            tuple[Optional[str], Optional[str]]: reasoning content and content.
            Reasoning is None when the output has no </think>; content is
            None when nothing follows </think>.
        """
        before_end, end_sep, after_end = model_output.partition(self.end_token)

        if not end_sep:
            # No end token means thinking is disabled or the model
            # did not produce reasoning. Treat everything as content,
            # still dropping a generated <think> and any text before it.
            _, start_sep, after_start = model_output.partition(self.start_token)
            return None, (after_start if start_sep else model_output)

        # Only look for <think> inside the reasoning segment, i.e. before
        # the first </think>.
        _, start_sep, after_start = before_end.partition(self.start_token)
        reasoning = after_start if start_sep else before_end

        final_content = after_end or None
        return reasoning, final_content

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """
        Extract reasoning content from a streaming delta.

        Generated tokens before </think> are reasoning and tokens after it
        are content. If an older template emits <think> in the generated
        output, strip it from the current delta first.

        Only delta_text, delta_token_ids and previous_token_ids are read;
        the remaining parameters are accepted but not used here.

        Returns:
            A DeltaMessage carrying reasoning and/or content for this delta,
            or None when the delta yields nothing to emit.
        """
        # NOTE(review): start_token_id / end_token_id are not defined in this
        # class; presumably provided by the base class - confirm.
        if self.start_token_id in delta_token_ids:
            # Drop the <think> marker and any text before it in this delta.
            start_idx = delta_text.find(self.start_token)
            if start_idx >= 0:
                delta_text = delta_text[start_idx + len(self.start_token) :]

        if self.end_token_id in delta_token_ids:
            # End token in this delta: split reasoning from content.
            end_index = delta_text.find(self.end_token)
            if end_index >= 0:
                reasoning = delta_text[:end_index]
                content = delta_text[end_index + len(self.end_token) :]
                if not reasoning and not content:
                    # The delta held nothing but the marker itself.
                    return None
                return DeltaMessage(
                    reasoning=reasoning if reasoning else None,
                    content=content if content else None,
                )
            # The end token id is present but its text is not found in
            # this delta; nothing is emitted.
            return None

        if not delta_text:
            return None
        if self.end_token_id in previous_token_ids:
            # End token already seen: everything is content now.
            return DeltaMessage(content=delta_text)
        # No end token yet: still in the reasoning phase.
        return DeltaMessage(reasoning=delta_text)