basic_parsers.py 7.63 KB
Newer Older
1
2
3
4
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from abc import abstractmethod
5
6
from collections.abc import Iterable, Sequence
from itertools import islice
7
from typing import TYPE_CHECKING
8

9
from vllm.entrypoints.openai.engine.protocol import DeltaMessage
10
from vllm.reasoning.abs_reasoning_parsers import ReasoningParser
11
from vllm.tokenizers import TokenizerLike
12

13
if TYPE_CHECKING:
14
15
    from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
    from vllm.entrypoints.openai.responses.protocol import ResponsesRequest
16

17
18
19
20

class BaseThinkingReasoningParser(ReasoningParser):
    """
    Base class for reasoning parsers that use thinking tokens.
21

22
23
24
    This class provides common functionality for parsers that use start and end
    tokens to delimit reasoning content (
        e.g., <think>...</think>, <seed:think>...</seed:think>).
25

26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
    Subclasses must implement the start and end tokens via abstract
    properties.
    """

    @property
    @abstractmethod
    def start_token(self) -> str:
        """The token that starts reasoning content.

        Abstract: concrete subclasses must override this property. The
        constructor rejects a falsy value and looks the string up in
        ``self.vocab`` to obtain ``start_token_id``.
        """
        raise NotImplementedError

    @property
    @abstractmethod
    def end_token(self) -> str:
        """The token that ends reasoning content.

        Abstract: concrete subclasses must override this property. The
        constructor rejects a falsy value and looks the string up in
        ``self.vocab`` to obtain ``end_token_id``.
        """
        raise NotImplementedError

42
43
44
45
46
47
48
49
    @property
    def reasoning_start_str(self) -> str:
        """Return the reasoning start marker; an alias for ``start_token``."""
        return self.start_token

    @property
    def reasoning_end_str(self) -> str:
        """Return the reasoning end marker; an alias for ``end_token``."""
        return self.end_token

50
    def __init__(self, tokenizer: TokenizerLike, *args, **kwargs):
        """
        Initialise the parser and resolve both marker strings to token IDs.

        Args:
            tokenizer: Tokenizer forwarded to the parent constructor.
            *args: Extra positional arguments forwarded unchanged.
            **kwargs: Extra keyword arguments forwarded unchanged.

        Raises:
            ValueError: If ``self.model_tokenizer`` is falsy once the parent
                constructor has run, or if ``start_token`` / ``end_token``
                is falsy.
            RuntimeError: If either marker string has no entry in
                ``self.vocab``.
        """
        super().__init__(tokenizer, *args, **kwargs)

        if not self.model_tokenizer:
            raise ValueError(
                "The model tokenizer must be passed to the ReasoningParser "
                "constructor during construction."
            )

        # Both markers have to be supplied by the concrete subclass.
        if not (self.start_token and self.end_token):
            raise ValueError("start_token and end_token must be defined in subclasses")

        # Map each marker string to its vocabulary ID; a missing entry is fatal
        # because every other method compares against these IDs.
        opening_id = self.vocab.get(self.start_token)
        closing_id = self.vocab.get(self.end_token)
        if opening_id is None or closing_id is None:
            raise RuntimeError(
                f"{self.__class__.__name__} reasoning parser could not locate "
                "think start/end tokens in the tokenizer!"
            )

        self.start_token_id: int = opening_id
        self.end_token_id: int = closing_id
71

72
    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
        """
        Tell whether the most recent marker in ``input_ids`` closes reasoning.

        The IDs are walked from newest to oldest and the first marker found
        decides the answer: an end token means reasoning has finished, a start
        token means a reasoning span is still open. With no marker at all the
        result is ``False``.
        """
        for token_id in reversed(input_ids):
            if token_id == self.start_token_id:
                return False
            if token_id == self.end_token_id:
                return True
        return False
82

83
    def is_reasoning_end_streaming(
        self, input_ids: Sequence[int], delta_ids: Iterable[int]
    ) -> bool:
        """
        Report whether the newest delta contains the end-of-reasoning token.

        Only ``delta_ids`` is inspected; ``input_ids`` is accepted but not
        consulted.
        """
        return self.end_token_id in delta_ids

89
90
91
92
    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        """
        Return the token IDs that follow the first end-of-reasoning token.

        An end token that only appears in the very last position has nothing
        after it, so that case is handled like "no end token" and yields ``[]``.
        """
        searchable = max(0, len(input_ids) - 1)
        try:
            # Bounded search: the final slot is deliberately excluded.
            marker_pos = input_ids.index(self.end_token_id, 0, searchable)
        except ValueError:
            return []
        return input_ids[marker_pos + 1 :]
97

98
    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """
        Classify one streamed delta as reasoning text, answer content, or both.

        Handles streaming output where previous + delta = current. Branching
        is decided on token IDs; ``delta_text`` is only searched when a delta
        has to be split around a marker.

        Args:
            previous_text: Text emitted before this delta (not used here).
            current_text: Text including this delta (not used here).
            delta_text: Text of the newly generated tokens.
            previous_token_ids: Token IDs emitted before this delta.
            current_token_ids: Token IDs including this delta (not used here).
            delta_token_ids: Token IDs of the newly generated tokens.

        Returns:
            ``None`` when the delta is exactly one start or end token,
            otherwise a ``DeltaMessage`` with ``reasoning`` and/or ``content``
            populated.
        """
        start_id = self.start_token_id
        end_id = self.end_token_id

        # Skip single special tokens: a lone marker carries no user text.
        if len(delta_token_ids) == 1 and delta_token_ids[0] in (start_id, end_id):
            return None

        # Check if start token is present in previous or delta.
        # Keep compatibility with models that don't generate start tokens.
        if start_id in previous_token_ids:
            if end_id in delta_token_ids:
                # Reasoning was already open and is closed inside this delta.
                return self._split_delta_at_end_token(delta_text, 0)
            if end_id in previous_token_ids:
                # Reasoning was opened and closed earlier, so this delta is
                # plain answer content.
                return DeltaMessage(content=delta_text)
            # Reasoning is open and not closed yet: reasoning continues.
            return DeltaMessage(reasoning=delta_text)

        if start_id in delta_token_ids:
            if end_id in delta_token_ids:
                # The whole reasoning span opens and closes in this delta.
                start_index = delta_text.find(self.start_token)
                # str.find returns -1 when the marker string is missing from
                # the text; start at 0 then instead of slicing from a bogus
                # offset.
                reasoning_from = (
                    start_index + len(self.start_token) if start_index != -1 else 0
                )
                return self._split_delta_at_end_token(delta_text, reasoning_from)
            # Reasoning opens in this delta and stays open.
            # NOTE(review): unlike the branch above, the start marker string
            # is not stripped from delta_text here -- confirm this is intended
            # for multi-token deltas.
            return DeltaMessage(reasoning=delta_text)

        # No start token seen so far: treat the delta as content.
        return DeltaMessage(content=delta_text)

    def _split_delta_at_end_token(
        self, delta_text: str, reasoning_from: int
    ) -> DeltaMessage:
        """Split ``delta_text`` at the end marker into reasoning and content."""
        end_index = delta_text.find(self.end_token)
        if end_index == -1:
            # The end-token ID is in the delta but its string is not in the
            # text (presumably the detokenizer dropped it -- TODO confirm).
            # Using -1 as a slice bound would cut the last character and
            # invent content, so keep the text intact as reasoning.
            return DeltaMessage(reasoning=delta_text[reasoning_from:])
        content = delta_text[end_index + len(self.end_token) :]
        return DeltaMessage(
            reasoning=delta_text[reasoning_from:end_index],
            content=content or None,
        )

157
    def extract_reasoning(
        self, model_output: str, request: "ChatCompletionRequest | ResponsesRequest"
    ) -> tuple[str | None, str | None]:
        """
        Split a complete model output into ``(reasoning, content)``.

        This is the base implementation that works for most models;
        subclasses can override it for specific behavior. ``request`` is
        accepted but not used here.

        Returns:
            A ``(reasoning, content)`` tuple. ``content`` is ``None`` when the
            end token is absent or when nothing follows it.
        """
        # Drop everything up to and including the start token when it exists;
        # otherwise keep the output untouched.
        _, start_marker, after_start = model_output.partition(self.start_token)
        text = after_start if start_marker else model_output

        # For models that may not generate the start token, the reasoning is
        # assumed to sit at the beginning of the output.
        reasoning, end_marker, content = text.partition(self.end_token)
        if not end_marker:
            return text, None

        # If generation stops right after end-of-think, return null content.
        return reasoning, content or None
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201

    def count_reasoning_tokens(self, token_ids: Sequence[int]) -> int:
        """Count the tokens that sit inside start/end thinking markers.

        Marker tokens themselves are never counted. A nesting level is tracked
        so nested spans work, and an end token seen while no span is open is
        ignored rather than pushing the level below zero.
        """
        opener = self.start_token_id
        closer = self.end_token_id
        open_spans = 0
        total = 0
        for tok in token_ids:
            if tok == opener:
                open_spans += 1
            elif tok == closer:
                # Clamp at zero so a stray end token has no effect.
                open_spans = max(open_spans - 1, 0)
            elif open_spans:
                total += 1
        return total