hunyuan_a13b_reasoning_parser.py 9.42 KB
Newer Older
1
2
3
4
5
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from collections.abc import Sequence

6
import regex as re
7
8
from transformers import PreTrainedTokenizerBase

9
10
11
12
from vllm.entrypoints.openai.chat_completion.protocol import (
    ChatCompletionRequest,
)
from vllm.entrypoints.openai.engine.protocol import DeltaMessage
13
from vllm.logger import init_logger
14
from vllm.reasoning import ReasoningParser
15
16
17
18
19
20
21
22
23
24

# Module-level logger, created via vLLM's init_logger and named after this module.
logger = init_logger(__name__)


class HunyuanA13BReasoningParser(ReasoningParser):
    r"""Reasoning parser for the Hunyuan A13B model.

    The parser splits model output into a reasoning ("think") part and an
    answer part.

    Key features:
        - Non-streaming output: the reasoning and answer sections are
          extracted from the complete text with regular expressions.
        - Streaming output: a small state machine driven by token ids
          tracks which section the stream is currently in, so internal
          state is kept across calls.

    Marker texts and the token id sequences used to detect them:

        think start:    "<think>\n"               [14023, 771, 397]
                        (alternative start)       [14023, 771, 1363]
        think end /
        response start: "\n</think>\n<answer>\n"  [198, 524, 27963, 397,
                                                   27, 9399, 397]
                        (alternative, no leading 198)
                                                  [524, 27963, 397,
                                                   27, 9399, 397]
        response end:   "\n</answer>"             [198, 524, 9399, 29]

    Streaming states cycle ``idle -> think -> response -> idle``. Text seen
    while in ``think`` is reported as reasoning; text seen in ``idle`` or
    ``response`` is reported as content. Tokens that form a marker are
    swallowed and never reported.
    """

    # Literal (non-regex) text that closes the answer section. The
    # ``*_expr`` attributes are regex sources and must not be used for
    # plain string comparison.
    _RESPONSE_END_TEXT = "\n</answer>"

    def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
        """Set up the regexes and the streaming state machine.

        Args:
            tokenizer: Tokenizer forwarded to the base ``ReasoningParser``.
            *args: Extra positional arguments forwarded to the base class.
            **kwargs: Extra keyword arguments forwarded to the base class.
        """
        super().__init__(tokenizer, *args, **kwargs)

        # Regex sources. These are raw strings, so "\n" below is the
        # two-character regex escape for a newline, not a newline itself.
        self.think_start_expr = r"<think>\n"
        self.think_end_expr = r"\n</think>\n"

        self.response_start_expr = r"\n</think>\n<answer>\n"
        self.response_end_expr = r"\n</answer>"

        # Complete output: optional think section, then an answer that is
        # terminated by the response end marker.
        self.full_match_reasoning_regex = re.compile(
            rf"(?:{self.think_start_expr}(.*?){self.response_start_expr})?(.*?){self.response_end_expr}",
            re.DOTALL,
        )

        # Truncated output: think section plus whatever answer text exists,
        # without requiring the response end marker.
        self.half_match_reasoning_regex = re.compile(
            rf"{self.think_start_expr}(.*?){self.response_start_expr}(.*)", re.DOTALL
        )

        # Token id sequences of the markers (see the class docstring).
        self.think_start_ids = [14023, 771, 397]
        self.think_start_ids_fast = [14023, 771, 1363]
        self.response_start_ids = [198, 524, 27963, 397, 27, 9399, 397]
        self.response_start_ids_fast = [524, 27963, 397, 27, 9399, 397]
        self.response_end_ids = [198, 524, 9399, 29]
        self.fast_think_ids = [14023, 771, 1363, 524, 27963, 397, 27, 9399, 397]

        # NOTE: the three attributes below are not read by any method of
        # this class; they are kept so existing external readers keep working.
        self.buffered_text = []
        self.buffered_ids = []
        self.all_states = ["reasoning", "response"]

        # Streaming state machine. The states actually used are
        # "idle", "think" and "response".
        self.current_state = "idle"
        # Marker that ends the current state ...
        self.expected_sequence = self.think_start_ids
        # ... and its alternative form (only consulted in "idle"/"think").
        self.expected_sequence_side = self.think_start_ids_fast
        # Tokens/text of a marker that is matched only partially so far.
        # sequence_index always equals len(token_buffer).
        self.sequence_index = 0
        self.token_buffer: list[int] = []
        self.text_buffer = ""

    def is_reasoning_end(self, input_ids: list[int]) -> bool:
        """Return True once the streaming state machine is in "response".

        ``input_ids`` is ignored; the answer depends only on the state
        reached by previous ``extract_reasoning_streaming`` calls.
        """
        return self.current_state == "response"

    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        """Return the content token ids for ``input_ids`` (always empty).

        For Hunyuan streaming, the streaming parser runs first and the same
        token is then passed to ``is_reasoning_end`` and to this method.
        That token is not part of the content, so nothing is returned.
        """
        return []

    def extract_reasoning(
        self, model_output: str, request: ChatCompletionRequest
    ) -> tuple[str | None, str | None]:
        """Extract the reasoning and the content from a complete output.

        If the output does not follow the expected layout, the whole text
        is treated as non-reasoning content.

        Args:
            model_output: Output of the model to be parsed.
            request: Request being processed (not used by this parser).

        Returns:
            ``(reasoning, content)``. Either element is ``None`` when the
            corresponding section is missing or empty.
        """
        complete = self.full_match_reasoning_regex.search(model_output)
        if complete is not None:
            # An unmatched optional group is None, an empty one is "";
            # both are reported as None.
            reasoning, response_content = complete.groups()
            return reasoning or None, response_content or None

        truncated = self.half_match_reasoning_regex.search(model_output)
        if truncated is not None:
            reasoning, response_content = truncated.groups()
            # Defensive: drop a trailing end marker. Compare against the
            # literal text, not the regex source (which contains a
            # backslash followed by "n" instead of a newline).
            response_content = response_content.removesuffix(self._RESPONSE_END_TEXT)
            return reasoning or None, response_content or None

        return None, model_output

    def _is_strict_increasing_subsequence(
        self, subsequence: Sequence[int], sequence: Sequence[int]
    ) -> bool:
        """Return True if ``subsequence`` occurs in ``sequence`` in order.

        The elements need not be contiguous. An empty ``subsequence`` is
        reported as False.
        """
        if not subsequence:
            return False

        # Membership tests on a shared iterator consume it, so each element
        # is searched for only after the position of the previous hit.
        remaining = iter(sequence)
        return all(wanted in remaining for wanted in subsequence)

    def _candidate_markers(self) -> tuple[Sequence[int], ...]:
        """Return the marker sequences that can end the current state."""
        if self.current_state in ("idle", "think"):
            return (self.expected_sequence, self.expected_sequence_side)
        return (self.expected_sequence,)

    def _reset_marker_match(self) -> None:
        """Forget any partially matched marker."""
        self.sequence_index = 0
        self.token_buffer = []
        self.text_buffer = ""

    def _enter_next_state(self) -> None:
        """Advance idle -> think -> response -> idle and reset matching."""
        if self.current_state == "idle":
            self.current_state = "think"
            self.expected_sequence = self.response_start_ids
            self.expected_sequence_side = self.response_start_ids_fast
        elif self.current_state == "think":
            self.current_state = "response"
            self.expected_sequence = self.response_end_ids
        elif self.current_state == "response":
            self.current_state = "idle"
            self.expected_sequence = self.think_start_ids
            self.expected_sequence_side = self.think_start_ids_fast
        self._reset_marker_match()

    def _consume_marker_token(self, token: int, delta_text: str) -> bool:
        """Try to extend the partially matched marker with ``token``.

        The token is accepted only if the buffered tokens plus this token
        form a prefix of one single candidate marker. This avoids mixing
        positions of the two variants and never indexes past the end of
        the shorter one.

        Returns:
            True if the token was buffered (and the state advanced when a
            marker was completed), False if it does not fit any marker.
        """
        attempt = [*self.token_buffer, token]
        width = len(attempt)
        live = [
            marker
            for marker in self._candidate_markers()
            if list(marker[:width]) == attempt
        ]
        if not live:
            return False

        self.token_buffer = attempt
        self.text_buffer += delta_text
        self.sequence_index = width
        if any(len(marker) == width for marker in live):
            # Marker fully matched: its text is dropped, not emitted.
            self._enter_next_state()
        return True

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """Classify one streamed token using the token id state machine.

        Only ``delta_text`` and ``delta_token_ids`` are used. Exactly one
        token per call is supported.

        Returns:
            ``None`` while the token is (possibly) part of a marker, else a
            ``DeltaMessage`` whose ``reasoning`` is set in state "think" and
            whose ``content`` is set otherwise. When a partial marker match
            breaks, the text buffered for it is emitted as well.
        """
        assert len(delta_token_ids) == 1, (
            "HunyuanA13BReasoningParser expects exactly one token per delta"
        )
        token = delta_token_ids[0]

        if self._consume_marker_token(token, delta_text):
            # Part of a marker (or its last token): nothing to send yet.
            return None

        # The token does not continue a marker. Everything buffered so far
        # turned out to be ordinary text of the current state.
        state_at_break = self.current_state
        had_partial_match = bool(self.token_buffer)
        emitted = self.text_buffer
        self._reset_marker_match()

        # The token that broke a partial match may itself start a marker
        # (e.g. "\n" followed by "\n</answer>"); if so keep it buffered
        # instead of leaking the marker into the output.
        restarted = had_partial_match and self._consume_marker_token(
            token, delta_text
        )
        if not restarted:
            emitted += delta_text

        if state_at_break == "think":
            return DeltaMessage(reasoning=emitted, content=None)
        return DeltaMessage(reasoning=None, content=emitted)