hermes_tool_parser.py 11.2 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
import json
5
from collections.abc import Sequence
6

7
import regex as re
8

9
from vllm.entrypoints.chat_utils import make_tool_call_id
10
from vllm.entrypoints.openai.chat_completion.protocol import (
11
    ChatCompletionRequest,
12
13
)
from vllm.entrypoints.openai.engine.protocol import (
14
15
16
17
18
19
20
    DeltaFunctionCall,
    DeltaMessage,
    DeltaToolCall,
    ExtractedToolCallInformation,
    FunctionCall,
    ToolCall,
)
21
from vllm.logger import init_logger
22
from vllm.tokenizers import TokenizerLike
23
from vllm.tool_parsers.abstract_tool_parser import (
24
    Tool,
25
26
    ToolParser,
)
27
from vllm.utils.mistral import is_mistral_tokenizer
28
29
30
31

logger = init_logger(__name__)


32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
def _partial_tag_overlap(text: str, tag: str) -> int:
    """Length of the longest prefix of `tag` that matches a suffix of `text`.

    E.g. text ending in "<tool_" returns 6 when tag is "<tool_call>".
    Returns 0 if there is no overlap.
    """
    max_check = min(len(tag) - 1, len(text))
    for k in range(max_check, 0, -1):
        if text.endswith(tag[:k]):
            return k
    return 0


def _is_valid_json(text: str) -> bool:
    try:
        json.loads(text)
        return True
    except (json.JSONDecodeError, ValueError):
        return False


53
class Hermes2ProToolParser(ToolParser):
54
55
    def __init__(self, tokenizer: TokenizerLike, tools: list[Tool] | None = None):
        """Set up tag constants, compiled patterns and streaming state.

        Args:
            tokenizer: tokenizer forwarded to the base ``ToolParser``.
            tools: optional tool definitions forwarded to the base class.

        Raises:
            ValueError: if ``self.model_tokenizer`` is falsy after setup.
        """
        super().__init__(tokenizer, tools)

        if is_mistral_tokenizer(tokenizer):
            # A Mistral tokenizer is unexpected for Hermes models; report it
            # and fall back to the tokenizer it wraps.
            logger.error("Detected Mistral tokenizer when using a Hermes model")
            self.model_tokenizer = tokenizer.tokenizer

        # Literal tags that delimit a tool call in the model output.
        self.tool_call_start_token: str = "<tool_call>"
        self.tool_call_end_token: str = "</tool_call>"

        # Two alternatives: a fully closed tool call, or an opening tag that
        # runs to the end of the text (closing tag missing).
        self.tool_call_regex = re.compile(
            r"<tool_call>(.*?)</tool_call>|<tool_call>(.*)", re.DOTALL
        )
        self.scratch_pad_regex = re.compile(
            r"<scratch_pad>(.*?)</scratch_pad>", re.DOTALL
        )

        # NOTE(review): model_tokenizer is presumably assigned by the base
        # class constructor -- confirm against ToolParser.
        if not self.model_tokenizer:
            raise ValueError(
                "The model tokenizer must be passed to the ToolParser "
                "constructor during construction."
            )

        # Streaming state: how many characters of plain content have already
        # been sent to the client.
        self._sent_content_idx: int = 0
79

80
    def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest:
        """Apply the base adjustments, then keep special tokens for tool use.

        Returns the request produced by the base class, with
        ``skip_special_tokens`` forced to ``False`` when tools are supplied
        and ``tool_choice`` is not ``"none"``.
        """
        adjusted = super().adjust_request(request)

        tool_calling_off = not adjusted.tools or adjusted.tool_choice == "none"
        if tool_calling_off:
            return adjusted

        # The tool_call tokens are marked "special" in some models. If they
        # were skipped before the text reaches the tool parser, tool calling
        # would break, so special tokens must be kept.
        adjusted.skip_special_tokens = False
        return adjusted

89
90
91
92
93
    def extract_tool_calls(
        self,
        model_output: str,
        request: ChatCompletionRequest,
    ) -> ExtractedToolCallInformation:
        """Parse every tool call out of a complete (non-streamed) output.

        Args:
            model_output: full text generated by the model.
            request: the originating request (not read by this method).

        Returns:
            ``tools_called=True`` with the parsed calls and any text that
            precedes the first ``<tool_call>`` tag (``None`` if empty). When
            no tag is present, or when parsing fails for any reason,
            ``tools_called=False`` with the untouched output as content.
        """
        start_tag = self.tool_call_start_token

        # Fast path: without an opening tag there is nothing to parse.
        if start_tag not in model_output:
            return ExtractedToolCallInformation(
                tools_called=False, tool_calls=[], content=model_output
            )

        try:
            # The pattern has two capture groups: the body between a pair of
            # tags, or the body between an opening tag and end-of-string.
            # findall yields one 2-tuple per match, and the group belonging
            # to the alternative that did not match is an empty string.
            matches = self.tool_call_regex.findall(model_output)

            # Decode every body first, then build the ToolCall objects.
            payloads = [json.loads(closed or unclosed) for closed, unclosed in matches]

            tool_calls = []
            for payload in payloads:
                # Function call arguments are JSON, but carried as a string.
                serialized_args = json.dumps(payload["arguments"], ensure_ascii=False)
                function = FunctionCall(name=payload["name"], arguments=serialized_args)
                tool_calls.append(ToolCall(type="function", function=function))

            # Everything before the first opening tag is ordinary content.
            leading_text, _, _ = model_output.partition(start_tag)
            return ExtractedToolCallInformation(
                tools_called=True,
                tool_calls=tool_calls,
                content=leading_text or None,
            )

        except Exception:
            # Best effort: on any parsing problem hand the raw text back.
            logger.exception("Error in extracting tool call from response.")
            return ExtractedToolCallInformation(
                tools_called=False, tool_calls=[], content=model_output
            )
140

141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
    def _extract_content(self, current_text: str) -> str | None:
        """Return unsent non-tool-call text, or None.

        Holds back any suffix that could be a partial <tool_call> tag.
        """
        if self.tool_call_start_token not in current_text:
            overlap_length = _partial_tag_overlap(
                current_text, self.tool_call_start_token
            )
            sendable_idx = len(current_text) - overlap_length
        else:
            sendable_idx = current_text.index(self.tool_call_start_token)

        if sendable_idx > self._sent_content_idx:
            content = current_text[self._sent_content_idx : sendable_idx]
            self._sent_content_idx = sendable_idx
            return content
        return None

    def _extract_tool_call_jsons(self, text: str) -> list[tuple[str, bool]]:
        """Extract (json_text, is_complete) for each <tool_call> region."""
        results: list[tuple[str, bool]] = []
        pos = 0
        while True:
            start = text.find(self.tool_call_start_token, pos)
            if start == -1:
                break
            json_start = start + len(self.tool_call_start_token)
            json_end = text.find(self.tool_call_end_token, json_start)
            if json_end != -1:
                results.append((text[json_start:json_end].strip(), True))
                pos = json_end + len(self.tool_call_end_token)
            else:
                raw = text[json_start:]
                # Strip partial </tool_call> suffix if present.
                overlap = _partial_tag_overlap(raw, self.tool_call_end_token)
                if overlap:
                    raw = raw[:-overlap]
                tc_json = raw.strip()
                # Valid JSON without closing tag = complete body,
                # tag tokens just haven't arrived yet.
                is_complete = _is_valid_json(tc_json) if tc_json else False
                results.append((tc_json, is_complete))
                break
        return results

    @staticmethod
    def _extract_tool_name(tc_json: str) -> str | None:
        """Extract tool name, or None if the name isn't complete yet."""
        match = re.search(r'"name"\s*:\s*"([^"]+)"', tc_json)
        return match.group(1) if match else None

    @staticmethod
    def _extract_tool_args(tc_json: str, is_complete: bool) -> str | None:
        """Extract tool arguments from the tool call JSON.

        Given {"name": "f", "arguments": {"x": 1}}, returns '{"x": 1}'.
        When is_complete, strips the trailing '}' that closes the outer
        object (not the arguments). For partial JSON, returns as-is.
        """
        match = re.search(r'"arguments"\s*:\s*', tc_json)
        if not match:
            return None
        raw = tc_json[match.end() :]
        if is_complete:
            raw = raw.rstrip()
            if raw.endswith("}"):
                raw = raw[:-1].rstrip()
        return raw

    def _compute_args_diff(
        self, index: int, tc_json: str, is_complete: bool
    ) -> str | None:
        """Return new argument text not yet sent for tool `index`, or None."""
        args = self._extract_tool_args(tc_json, is_complete)
        if args is None or len(args) <= len(self.streamed_args_for_tool[index]):
            return None
        diff = args[len(self.streamed_args_for_tool[index]) :]
        self.streamed_args_for_tool[index] = args
        self.prev_tool_call_arr[index]["arguments"] = args
        return diff

223
224
225
226
227
228
229
230
    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> DeltaMessage | None:
        """Produce the next streaming delta from the accumulated output.

        Every call re-scans the whole ``current_text`` for ``<tool_call>``
        regions and compares the result with what was already sent, so only
        new plain content, newly completed tool names, or new argument
        fragments are emitted. Only ``current_text`` is read; the other
        parameters are unused here.

        Returns:
            A ``DeltaMessage`` carrying plain content (text preceding any
            tool call) and/or ``DeltaToolCall`` entries, or ``None`` when
            there is nothing new to send or an error occurred.
        """
        try:
            # Plain text that precedes the first tool call.
            content = self._extract_content(current_text)
            deltas: list[DeltaToolCall] = []

            regions = self._extract_tool_call_jsons(current_text)
            for idx, (tc_json, is_complete) in enumerate(regions):
                # First sighting of this tool call: create its bookkeeping.
                if idx >= len(self.prev_tool_call_arr):
                    self.prev_tool_call_arr.append({})
                    self.streamed_args_for_tool.append("")
                state = self.prev_tool_call_arr[idx]

                # The name is sent once, together with a fresh call id.
                if "name" not in state:
                    name = self._extract_tool_name(tc_json)
                    if not name:
                        # Later tool calls cannot be emitted before this one.
                        break
                    state["name"] = name
                    header = DeltaFunctionCall(name=name).model_dump(
                        exclude_none=True
                    )
                    deltas.append(
                        DeltaToolCall(
                            index=idx,
                            type="function",
                            id=make_tool_call_id(),
                            function=header,
                        )
                    )

                # Arguments are sent as the suffix not yet delivered.
                new_args = self._compute_args_diff(idx, tc_json, is_complete)
                if new_args:
                    fragment = DeltaFunctionCall(arguments=new_args).model_dump(
                        exclude_none=True
                    )
                    deltas.append(DeltaToolCall(index=idx, function=fragment))

            if not (content or deltas):
                return None
            return DeltaMessage(
                content=content,
                tool_calls=deltas,
            )

        except Exception:
            logger.exception("Error trying to handle streaming tool call.")
            return None