llama4_pythonic_tool_parser.py 8.03 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
5
6
import ast
from collections.abc import Sequence

7
import regex as re
8
9
from transformers import PreTrainedTokenizerBase

10
import vllm.envs as envs
11
from vllm.entrypoints.openai.chat_completion.protocol import (
12
    ChatCompletionRequest,
13
14
)
from vllm.entrypoints.openai.engine.protocol import (
15
16
17
    DeltaMessage,
    ExtractedToolCallInformation,
)
18
19
from vllm.logger import init_logger
from vllm.tool_parsers.abstract_tool_parser import (
20
    Tool,
21
22
    ToolParser,
)
23
24
25
26
27
28
from vllm.tool_parsers.utils import (
    UnexpectedAstError,
    compute_tool_delta,
    handle_single_tool,
    make_valid_python,
)
29
30
31
32
33
34
35
36
37

logger = init_logger(__name__)


class Llama4PythonicToolParser(ToolParser):
    """
    Tool call parser for Llama 4 models that produce tool calls in a pythonic style.

    Use --enable-auto-tool-choice --tool-call-parser llama4_pythonic
    """
38

39
40
41
42
43
44
45
46
47
    # TODO(mdepinet): Possible future improvements:
    #   1. Support text + tools separated by either <|python_tag|> or \n\n
    #   2. Support tools outside of a list (or separated by a semicolon).
    #      This depends on item 1 for consistent streaming.
    # Neither of these is necessary for e.g. ToolACE, but both would help make
    # Llama3.2 models more reliable.

    # Shape check for a bracketed list of calls such as
    # ``[func_a(x=1, y="s"), func_b(z=2)]``. The pattern is split across two
    # literals purely for readability: the first covers zero or more calls that
    # are each followed by a comma, the second covers the final call(s) and the
    # closing bracket. DOTALL lets ``.`` span newlines inside argument values.
    TOOL_CALL_REGEX = re.compile(
        r"\[([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s)?\),\s*)*"
        r"([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s*)?\)\s*)+\]",
        re.DOTALL,
    )
50

51
52
53
54
55
56
    def __init__(
        self,
        tokenizer: PreTrainedTokenizerBase,
        tools: list[Tool] | None = None,
    ) -> None:
        """Initialize the parser by delegating to the ``ToolParser`` base class.

        Args:
            tokenizer: Tokenizer forwarded unchanged to ``ToolParser.__init__``.
            tools: Optional tool definitions, forwarded unchanged as well.
        """
        super().__init__(tokenizer, tools)
57
58
59
60
61
62
63
64
65
66
67

    # Readability alias for ``current_tool_id``. The value is compared against
    # positions in the parsed list of tool calls; it is NOT a tool id.
    @property
    def current_tool_index(self) -> int:
        """Return the value stored in ``self.current_tool_id``."""
        return self.current_tool_id

    @current_tool_index.setter
    def current_tool_index(self, value: int) -> None:
        """Store ``value`` in ``self.current_tool_id``."""
        self.current_tool_id = value

    def extract_tool_calls(
        self, model_output: str, request: ChatCompletionRequest
    ) -> ExtractedToolCallInformation:
        """
        Extract the tool calls from a complete model response.

        The response counts as a tool call only when it matches
        ``TOOL_CALL_REGEX`` and parses as a Python list whose elements are all
        function calls. In every other case (no match, regex timeout, parse or
        conversion error) the text is handed back as regular content.

        Args:
            model_output: Full text generated by the model.
            request: The originating chat completion request. Not read here.

        Returns:
            ``ExtractedToolCallInformation`` with ``tools_called=True`` and one
            entry per call on success; otherwise ``tools_called=False`` with
            ``content`` set to the model output after wrapper-tag removal.
        """
        start_tag = "<|python_start|>"
        end_tag = "<|python_end|>"

        # Llama 4 models sometimes wrap the call list in these tags.
        if model_output.startswith(start_tag):
            model_output = model_output[len(start_tag) :].replace(end_tag, "")
        elif model_output.startswith("[") and model_output.endswith(end_tag):
            # The closing tag can appear without the opening one. The streaming
            # path strips it in that situation, so do the same here; otherwise
            # ``ast.parse`` below fails and a valid call list is returned as text.
            model_output = model_output[: -len(end_tag)]

        is_tool_call_pattern = False
        try:
            is_tool_call_pattern = (
                self.TOOL_CALL_REGEX.match(
                    model_output, timeout=envs.VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS
                )
                is not None
            )
        except TimeoutError:
            # A timeout leaves the flag False, so the output is treated as text.
            logger.warning("Regex timeout occurred when matching tool call pattern.")
            logger.debug(
                "Regex timeout occurred when matching user input: %s", model_output
            )

        if not is_tool_call_pattern:
            return ExtractedToolCallInformation(
                tools_called=False, tool_calls=[], content=model_output
            )

        try:
            module = ast.parse(model_output)
            # Only expression statements carry a ``value``; anything else yields
            # None and is rejected by the shape check below.
            parsed = getattr(module.body[0], "value", None)
            if not isinstance(parsed, ast.List) or not all(
                isinstance(e, ast.Call) for e in parsed.elts
            ):
                raise UnexpectedAstError("Tool output must be a list of function calls")
            return ExtractedToolCallInformation(
                tools_called=True,
                tool_calls=[
                    handle_single_tool(e)  # type: ignore
                    for e in parsed.elts
                ],
                content=None,
            )
        except Exception:
            logger.exception("Error in extracting tool call from response.")
            # Treat as regular text
            return ExtractedToolCallInformation(
                tools_called=False, tool_calls=[], content=model_output
            )
121
122
123
124
125
126
127
128
129
130

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> DeltaMessage | None:
        """
        Produce the next streaming delta for a partially generated response.

        Only ``current_text`` and ``delta_text`` are read; the remaining
        parameters are not used by this implementation.

        The accumulated text is completed into parseable Python with
        ``make_valid_python``, parsed as a list of calls, and compared with what
        has already been streamed. The method updates ``current_tool_id``,
        ``streamed_args_for_tool`` and ``prev_tool_call_arr`` as it goes.

        Returns:
            * ``DeltaMessage(content=delta_text)`` when the text starts with
              neither ``[`` nor ``<|python_start|>``.
            * ``DeltaMessage(tool_calls=...)`` when new tool-call deltas exist.
            * ``DeltaMessage(content="")`` when nothing had to be appended to
              make the text valid and ``current_tool_id`` is positive.
            * ``None`` when the text cannot be completed yet, when there is
              nothing new to emit, or when any error occurs (it is logged).
        """
        start_tag = "<|python_start|>"
        end_tag = "<|python_end|>"

        # Text that opens with neither a list nor the start tag is plain content.
        if not current_text.startswith(("[", start_tag)):
            return DeltaMessage(content=delta_text)

        try:
            # remove <|python_start|> and <|python_end|>
            current_text = current_text.removeprefix(start_tag).removesuffix(end_tag)

            completion = make_valid_python(current_text)
            if completion is None:
                return None
            valid_text, added_text = completion

            parsed = getattr(ast.parse(valid_text).body[0], "value", None)
            if not (
                isinstance(parsed, ast.List)
                and all(isinstance(elt, ast.Call) for elt in parsed.elts)
            ):
                raise UnexpectedAstError("Tool output must be a list of function calls")
            tool_calls = [
                handle_single_tool(elt)  # type: ignore
                for elt in parsed.elts
            ]

            final_position = len(tool_calls) - 1
            # True when the text appended to make the input parseable contains
            # the ")]" that closes the last call and the list.
            closer_was_added = ")]" in added_text

            tool_deltas = []
            for position, call in enumerate(tool_calls):
                # Calls before the current index were handled by earlier chunks.
                if position < self.current_tool_index:
                    continue

                self.current_tool_index = position
                if len(self.streamed_args_for_tool) == position:
                    self.streamed_args_for_tool.append("")

                if position < final_position or not closer_was_added:
                    # This call is complete: advance and withhold nothing.
                    self.current_tool_index += 1
                    withheld = ""
                else:
                    # Function call is incomplete. Everything appended except
                    # the trailing ")]" is withheld from the streamed arguments.
                    withheld = added_text[:-2]
                    if added_text[-2] == ")":
                        # Withhold the closing bracket.
                        withheld += "}"

                # Strings get single quotes in the model-produced string.
                # JSON requires double quotes.
                withheld = withheld.replace("'", '"')

                already_streamed = self.streamed_args_for_tool[position]
                delta = compute_tool_delta(already_streamed, call, position, withheld)
                if delta is None:
                    continue

                tool_deltas.append(delta)
                function = delta.function
                if function is not None and function.arguments is not None:
                    self.streamed_args_for_tool[position] += function.arguments

            if tool_deltas:
                # HACK: serving_chat.py inspects the internal state of tool
                # parsers when determining its final streaming delta,
                # automatically adding autocompleted JSON.
                # Seeding prev_tool_call_arr avoids that nonsense while ensuring
                # finish_reason is set to tool_calls when at least one tool is
                # called.
                if not self.prev_tool_call_arr:
                    self.prev_tool_call_arr = [{"arguments": {}}]
                return DeltaMessage(tool_calls=tool_deltas)

            if not added_text and self.current_tool_id > 0:
                # Return an empty DeltaMessage once the tool calls are all done
                # so that finish_reason gets set.
                return DeltaMessage(content="")

            return None
        except Exception:
            logger.exception("Error trying to handle streaming tool call.")
            logger.debug(
                "Skipping chunk as a result of tool streaming extraction error"
            )
            return None