# hunyuan_a13b_reasoning_parser.py
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from collections.abc import Sequence
5
from typing import TYPE_CHECKING
6

7
import regex as re
8
9
from transformers import PreTrainedTokenizerBase

10
from vllm.entrypoints.openai.engine.protocol import DeltaMessage
11
from vllm.logger import init_logger
12
from vllm.reasoning import ReasoningParser
13

14
15
16
17
if TYPE_CHECKING:
    from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
    from vllm.entrypoints.openai.responses.protocol import ResponsesRequest

18
19
20
21
22
23
24
25
26
# Module-level logger, created with this module's ``__name__``.
logger = init_logger(__name__)


class HunyuanA13BReasoningParser(ReasoningParser):
    r"""Reasoning parser for the Hunyuan A13B model.

    The parser separates model output of the form
    ``<think>\n ... \n</think>\n<answer>\n ... \n</answer>`` into a
    reasoning part and a response part.

    Key features:
        - Non-streaming output: the reasoning ("think") and response
          ("answer") sections are extracted from the full text with
          regular expressions (see ``extract_reasoning``).
        - Streaming output: a small state machine driven by token ids
          (``idle -> think -> response -> idle``) decides whether each
          delta is reasoning or content. Tokens that may belong to a
          marker are buffered until the marker either completes (and is
          swallowed) or breaks (and the buffered text is emitted).

    Marker token ids used by the state machine:
        think start    "<think>\n":              [14023, 771, 397]
        think start    (alternative ids):        [14023, 771, 1363]
        response start "\n</think>\n<answer>\n": [198, 524, 27963, 397, 27, 9399, 397]
        response start (alternative ids):        [524, 27963, 397, 27, 9399, 397]
        response end   "\n</answer>":            [198, 524, 9399, 29]
    """

    def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
        """Set up the regexes, marker token ids and streaming state.

        Args:
            tokenizer: Tokenizer forwarded to the base class.
            *args: Extra positional arguments forwarded to the base class.
            **kwargs: Extra keyword arguments forwarded to the base class.
        """
        super().__init__(tokenizer, *args, **kwargs)

        # Regex *source* fragments. They are raw strings, so ``\n`` here is
        # the two-character regex escape, not a newline character.
        self.think_start_expr = r"<think>\n"
        self.think_end_expr = r"\n</think>\n"

        self.response_start_expr = r"\n</think>\n<answer>\n"
        self.response_end_expr = r"\n</answer>"

        # Complete output: optional think section, then an answer section
        # terminated by the response end marker.
        self.full_match_reasoning_regex = re.compile(
            rf"(?:{self.think_start_expr}(.*?){self.response_start_expr})?(.*?){self.response_end_expr}",
            re.DOTALL,
        )

        # Truncated output: think section and answer start are present but
        # the response end marker is missing.
        self.half_match_reasoning_regex = re.compile(
            rf"{self.think_start_expr}(.*?){self.response_start_expr}(.*)", re.DOTALL
        )

        # Marker token ids; the ``*_fast`` lists are alternative id
        # sequences accepted for the same transition.
        self.think_start_ids = [14023, 771, 397]
        self.think_start_ids_fast = [14023, 771, 1363]
        self.response_start_ids = [198, 524, 27963, 397, 27, 9399, 397]
        self.response_start_ids_fast = [524, 27963, 397, 27, 9399, 397]
        self.response_end_ids = [198, 524, 9399, 29]
        self.fast_think_ids = [14023, 771, 1363, 524, 27963, 397, 27, 9399, 397]

        # when state change, send out all the buffered text in last state
        self.buffered_text: list[str] = []
        self.buffered_ids: list[int] = []

        # NOTE(review): this list does not match the state names actually
        # used by the streaming state machine ("idle", "think", "response");
        # it is kept unchanged for backward compatibility.
        self.all_states = ["reasoning", "response"]

        # Streaming state machine: start in "idle", waiting for think start.
        self.current_state = "idle"
        self.expected_sequence = self.think_start_ids
        # this sequence only for the think start, it has two way to start.
        self.expected_sequence_side = self.think_start_ids_fast
        # Number of marker tokens matched so far, and the tokens/text held
        # back while a marker match is in progress.
        self.sequence_index = 0
        self.token_buffer: list[int] = []
        self.text_buffer = ""

    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
        """Return True once the streaming state machine is in "response".

        ``input_ids`` is not inspected; the answer depends only on the
        state built up by ``extract_reasoning_streaming``.
        """
        return self.current_state == "response"

    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        """Return the content token ids for ``input_ids`` (always empty)."""
        # for hunyuan streaming reason parsing, the stream parse
        # will call first, and the same token will be called in
        # is_reasoning_end and extract_content_ids
        # this id is not part of content, so just return [] here.
        return []

    def extract_reasoning(
        self, model_output: str, request: "ChatCompletionRequest | ResponsesRequest"
    ) -> tuple[str | None, str | None]:
        """Extract the reasoning content & content sections, respectively.

        If the sequence doesn't match what we expect, i.e., the model
        generates something else, all content is considered non-reasoning
        content.

        Args:
            model_output: Output of the model to be parsed.
            request: Request being processed (not used by this parser).

        Returns:
            A ``(reasoning, content)`` pair. An empty section is reported
            as ``None``.
        """
        # First match only; an unmatched optional group yields None, which
        # ``or None`` treats the same way as an empty string.
        full_match = self.full_match_reasoning_regex.search(model_output)
        if full_match is not None:
            reasoning, response_content = full_match.groups()
            return reasoning or None, response_content or None

        half_match = self.half_match_reasoning_regex.search(model_output)
        if half_match is not None:
            reasoning, response_content = half_match.groups()

            # Strip a trailing end marker. Compare against the real text
            # (newline + "</answer>"): ``self.response_end_expr`` is regex
            # source containing a literal backslash and must not be used
            # for plain string comparison.
            response_end_text = "\n</answer>"
            if response_content.endswith(response_end_text):
                response_content = response_content[: -len(response_end_text)]

            return reasoning or None, response_content or None

        return None, model_output

    def _is_strict_increasing_subsequence(
        self, subsequence: Sequence[int], sequence: Sequence[int]
    ) -> bool:
        """Return True if ``subsequence`` occurs in order within ``sequence``.

        The elements need not be contiguous. An empty ``subsequence`` is
        reported as False.
        """
        if not subsequence:
            return False

        # ``in`` on an iterator consumes it up to the match, so each element
        # must be found after the previous one.
        remaining = iter(sequence)
        return all(wanted in remaining for wanted in subsequence)

    @staticmethod
    def _token_at(sequence: Sequence[int], index: int, token: int) -> bool:
        """Return True if ``sequence[index]`` exists and equals ``token``."""
        return index < len(sequence) and sequence[index] == token

    def _token_continues_sequence(self, token: int) -> bool:
        """Return True if ``token`` extends the marker currently expected.

        In the "idle" and "think" states either the main or the alternative
        id sequence may be continued; in "response" only the main one.
        Lookups are bounds-checked because the alternative sequence can be
        shorter than the main one.
        """
        position = self.sequence_index
        if self._token_at(self.expected_sequence, position, token):
            return True
        if self.current_state in ("idle", "think"):
            return self._token_at(self.expected_sequence_side, position, token)
        return False

    def _sequence_completed(self, token: int) -> bool:
        """Return True if the marker is fully matched.

        Must be called after ``sequence_index`` has been advanced past
        ``token``.
        """
        matched = self.sequence_index
        if self.current_state in ("idle", "think"):
            side = self.expected_sequence_side
            # only judge by the side sequence if the last token belongs to it
            if matched - 1 < len(side) and token == side[matched - 1]:
                return matched == len(side)
        return matched == len(self.expected_sequence)

    def _advance_state(self) -> None:
        """Move to the next state: idle -> think -> response -> idle."""
        if self.current_state == "idle":
            self.current_state = "think"
            self.expected_sequence = self.response_start_ids
            self.expected_sequence_side = self.response_start_ids_fast
        elif self.current_state == "think":
            self.current_state = "response"
            self.expected_sequence = self.response_end_ids
        elif self.current_state == "response":
            self.current_state = "idle"
            self.expected_sequence = self.think_start_ids
            self.expected_sequence_side = self.think_start_ids_fast

    def _reset_match(self) -> None:
        """Forget any partially matched marker and its buffered text."""
        self.sequence_index = 0
        self.token_buffer = []
        self.text_buffer = ""

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """Extract content using token ID sequence state machine.

        Only ``delta_text`` and ``delta_token_ids`` are read; the delta must
        carry exactly one token.

        Returns:
            ``None`` while the token is (part of) a marker, otherwise a
            ``DeltaMessage`` carrying any buffered text plus ``delta_text``
            as reasoning (state "think") or as content (any other state).
        """
        assert len(delta_token_ids) == 1, (
            "HunyuanA13BReasoningParser expects exactly one token per delta"
        )
        token = delta_token_ids[0]

        if self._token_continues_sequence(token):
            # Hold the token back until the marker completes or breaks.
            self.token_buffer.append(token)
            self.text_buffer += delta_text
            self.sequence_index += 1

            if self._sequence_completed(token):
                # Do not send content for state transition texts.
                self._advance_state()
                self._reset_match()
            return None

        # Sequence broken (or never started): emit whatever was held back
        # together with the current delta. The buffer is empty when no match
        # was in progress, so this covers both cases.
        # NOTE: the breaking token itself is not re-examined as a possible
        # first token of a new marker.
        pending_text = self.text_buffer + delta_text
        self._reset_match()

        if self.current_state == "think":
            return DeltaMessage(reasoning=pending_text, content=None)
        return DeltaMessage(reasoning=None, content=pending_text)