# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from abc import abstractmethod
from collections.abc import Sequence
6
from typing import TYPE_CHECKING, Any
7

8
from vllm.entrypoints.openai.protocol import DeltaMessage
9
10
11
from vllm.reasoning.abs_reasoning_parsers import ReasoningParser
from vllm.transformers_utils.tokenizer import AnyTokenizer

12
13
14
15
16
17
18
19
20
# The request types are only needed for annotations. Static type checkers see
# the real classes; at runtime the names are bound to ``Any`` so the
# annotations below still resolve without importing them.
# NOTE(review): presumably this avoids an import cycle or import cost — confirm.
if TYPE_CHECKING:
    from vllm.entrypoints.openai.protocol import (
        ChatCompletionRequest,
        ResponsesRequest,
    )
else:
    ChatCompletionRequest = Any
    ResponsesRequest = Any

21
22
23
24

class BaseThinkingReasoningParser(ReasoningParser):
    """
    Base class for reasoning parsers that use thinking tokens.

    This class provides common functionality for parsers that use start and end
    tokens to delimit reasoning content
    (e.g., <think>...</think>, <seed:think>...</seed:think>).

    Subclasses must implement the start and end tokens via abstract
    properties.
    """

    @property
    @abstractmethod
    def start_token(self) -> str:
        """The token that starts reasoning content."""
        raise NotImplementedError

    @property
    @abstractmethod
    def end_token(self) -> str:
        """The token that ends reasoning content."""
        raise NotImplementedError

    def __init__(self, tokenizer: AnyTokenizer, *args, **kwargs):
        """
        Resolve the start/end token strings to their vocabulary ids.

        Raises:
            ValueError: if no model tokenizer is available, or if a subclass
                returns an empty start or end token.
            RuntimeError: if either token is missing from the vocabulary.
        """
        super().__init__(tokenizer, *args, **kwargs)

        # NOTE(review): ``model_tokenizer`` and ``vocab`` are not defined in
        # this class; presumably the ReasoningParser base provides them.
        if not self.model_tokenizer:
            raise ValueError(
                "The model tokenizer must be passed to the ReasoningParser "
                "constructor during construction."
            )

        if not self.start_token or not self.end_token:
            raise ValueError("start_token and end_token must be defined in subclasses")

        self.start_token_id = self.vocab.get(self.start_token)
        self.end_token_id = self.vocab.get(self.end_token)
        if self.start_token_id is None or self.end_token_id is None:
            raise RuntimeError(
                f"{self.__class__.__name__} reasoning parser could not locate "
                "think start/end tokens in the tokenizer!"
            )

    def is_reasoning_end(self, input_ids: list[int]) -> bool:
        """Return True if the end token id occurs anywhere in ``input_ids``."""
        end_token_id = self.end_token_id
        # Scan from the back: the end token, if present, is usually recent.
        return any(input_id == end_token_id for input_id in reversed(input_ids))

    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        """
        Extract the token ids that follow the end token.

        Returns an empty list when the end token is absent or appears only as
        the final id (nothing follows it yet); otherwise returns everything
        after the first occurrence of the end token.
        """
        if self.end_token_id not in input_ids[:-1]:
            return []
        return input_ids[input_ids.index(self.end_token_id) + 1 :]

    def extract_reasoning_content_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """
        Extract reasoning content from a delta message.
        Handles streaming output where previous + delta = current.
        Uses token IDs for faster processing.

        Returns None when the delta is exactly one start or end token;
        otherwise a DeltaMessage whose ``reasoning_content`` and/or
        ``content`` carry the corresponding parts of ``delta_text``.
        """
        # Skip single special tokens
        if len(delta_token_ids) == 1 and (
            delta_token_ids[0] in [self.start_token_id, self.end_token_id]
        ):
            return None

        # Check if start token is present in previous or delta.
        # Keep compatibility with models that don't generate start tokens.
        if self.start_token_id in previous_token_ids:
            if self.end_token_id in delta_token_ids:
                # start token in previous, end token in delta,
                # extract reasoning content.
                # partition() keeps the whole delta as reasoning if the end
                # token text is not present in delta_text, instead of slicing
                # with a -1 index from a failed find().
                reasoning_content, _, content = delta_text.partition(self.end_token)
                return DeltaMessage(
                    reasoning_content=reasoning_content,
                    content=content if content else None,
                )
            elif self.end_token_id in previous_token_ids:
                # start token in previous, end token in previous,
                # reasoning has already ended: the delta is regular content
                return DeltaMessage(content=delta_text)
            else:
                # start token in previous, no end token in previous or delta,
                # reasoning content continues
                return DeltaMessage(reasoning_content=delta_text)
        elif self.start_token_id in delta_token_ids:
            if self.end_token_id in delta_token_ids:
                # start token in delta, end token in delta,
                # extract reasoning content
                start_index = delta_text.find(self.start_token)
                end_index = delta_text.find(self.end_token)
                # Guard against find() returning -1 when a token id is in the
                # delta but its text is not.
                reasoning_start = (
                    start_index + len(self.start_token) if start_index != -1 else 0
                )
                if end_index == -1:
                    reasoning_content = delta_text[reasoning_start:]
                    content = ""
                else:
                    reasoning_content = delta_text[reasoning_start:end_index]
                    content = delta_text[end_index + len(self.end_token) :]
                return DeltaMessage(
                    reasoning_content=reasoning_content,
                    content=content if content else None,
                )
            else:
                # start token in delta, no end token in delta,
                # reasoning content continues
                return DeltaMessage(reasoning_content=delta_text)
        else:
            # thinking start token not found
            return DeltaMessage(content=delta_text)

    def extract_reasoning_content(
        self, model_output: str, request: ChatCompletionRequest | ResponsesRequest
    ) -> tuple[str | None, str | None]:
        """
        Extract reasoning content from the model output.

        This is the base implementation that works for most models.
        Subclasses can override this method for specific behavior.

        Returns a ``(reasoning_content, content)`` tuple. If the end token is
        absent, the whole output is returned as reasoning and content is None.
        ``request`` is not used by this base implementation.
        """
        # Check if the start token is present in the model output, remove it
        # if it is present (together with any text that precedes it).
        model_output_parts = model_output.partition(self.start_token)
        model_output = (
            model_output_parts[2] if model_output_parts[1] else model_output_parts[0]
        )

        # For models that may not generate start token,
        # assume the reasoning content is always at the start.
        if self.end_token not in model_output:
            return model_output, None

        reasoning_content, _, content = model_output.partition(self.end_token)
        # If generation stops right after end-of-think, return null content
        final_content = content or None
        return reasoning_content, final_content