mistral_tool_parser.py 38.1 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
5
from __future__ import annotations

6
import json
7
from collections.abc import Sequence
8
from dataclasses import dataclass
9
from enum import Enum, auto
10
11
from random import choices
from string import ascii_letters, digits
12
from typing import TYPE_CHECKING, Any
13

14
import ijson
15
import regex as re
16
17
18
19
20
21
22
23
24
25
26
27
from mistral_common.protocol.instruct.tool_calls import (
    NamedToolChoice as MistralNamedToolChoice,
)
from mistral_common.protocol.instruct.tool_calls import (
    Tool as MistralTool,
)
from mistral_common.protocol.instruct.tool_calls import (
    ToolChoice as MistralToolChoice,
)
from mistral_common.protocol.instruct.tool_calls import (
    ToolChoiceEnum as MistralToolChoiceEnum,
)
28
from pydantic import Field
29

30
from vllm.entrypoints.openai.chat_completion.protocol import (
31
    ChatCompletionRequest,
32
33
)
from vllm.entrypoints.openai.engine.protocol import (
34
35
36
37
38
39
40
    DeltaFunctionCall,
    DeltaMessage,
    DeltaToolCall,
    ExtractedToolCallInformation,
    FunctionCall,
    ToolCall,
)
41
from vllm.entrypoints.openai.responses.protocol import ResponsesRequest
42
from vllm.logger import init_logger
43
from vllm.reasoning.mistral_reasoning_parser import MistralReasoningParser
44
from vllm.sampling_params import StructuredOutputsParams
45
from vllm.tokenizers import TokenizerLike
46
from vllm.tokenizers.mistral import MistralTokenizer, adapt_inplace_to_mistral_tool
47
from vllm.tool_parsers.abstract_tool_parser import (
48
    Tool,
49
50
    ToolParser,
)
51
from vllm.utils.mistral import is_mistral_tokenizer
52

53
54
55
if TYPE_CHECKING:
    from vllm.reasoning import ReasoningParser

56
57
logger = init_logger(__name__)

58
59
# Character pool used when generating random tool call ids.
ALPHANUMERIC = "".join((ascii_letters, digits))

# Schema applied for "json_object" style requests: any JSON object or array.
_DEFAULT_JSON_SCHEMA = {
    "anyOf": [
        {"type": "object"},
        {"type": "array"},
    ]
}

62

63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
class StreamingState(Enum):
    """Positions of the streaming tool-call parser within a tool call."""

    WAITING_FOR_TOOL_START = auto()
    # Waiting for the "name" or "arguments" key to be complete.
    WAITING_FOR_TOOL_KEY = auto()
    PARSING_NAME = auto()
    PARSING_NAME_COMPLETED = auto()
    WAITING_FOR_ARGUMENTS_START = auto()
    PARSING_ARGUMENTS = auto()
    PARSING_ARGUMENTS_COMPLETED = auto()
    TOOL_COMPLETE = auto()
    ALL_TOOLS_COMPLETE = auto()


79
class MistralToolCall(ToolCall):
    """`ToolCall` whose `id` defaults to a freshly generated Mistral-style id."""

    id: str = Field(default_factory=lambda: MistralToolCall.generate_random_id())

    @staticmethod
    def generate_random_id() -> str:
        """Return a random id made of 9 characters drawn from `ALPHANUMERIC`."""
        # Mistral Tool Call Ids must be alphanumeric with a length of 9.
        # https://github.com/mistralai/mistral-common/blob/21ee9f6cee3441e9bb1e6ed2d10173f90bd9b94b/src/mistral_common/protocol/instruct/validator.py#L299
        return "".join(choices(ALPHANUMERIC, k=9))

    @staticmethod
    def is_valid_id(id: str) -> bool:
        """Return True when `id` is exactly 9 ASCII alphanumeric characters.

        Args:
            id: Candidate tool call id.

        Returns:
            True if `id` has length 9 and only contains ASCII letters or
            digits, False otherwise.
        """
        # `str.isalnum()` alone also accepts non-ASCII letters and digits
        # (e.g. "é" or Arabic-Indic digits), which `generate_random_id` can
        # never produce; the `isascii()` check keeps both helpers on the
        # same alphabet.
        return len(id) == 9 and id.isascii() and id.isalnum()

92

93
def _is_pre_v11_tokeniser(model_tokenizer: TokenizerLike) -> bool:
    """Tell whether `model_tokenizer` predates the v11 tool call format.

    Args:
        model_tokenizer: Tokenizer to inspect.

    Returns:
        For Mistral tokenizers, whether `version` is below 11. For any other
        tokenizer, whether its vocabulary lacks the `[ARGS]` token.
    """
    if not is_mistral_tokenizer(model_tokenizer):
        # For HF tokenizers, check if [ARGS] token exists in vocab
        # which indicates a v11+ equivalent tokenizer. A tokenizer without
        # `get_vocab` is treated as having an empty vocabulary.
        vocab_getter = getattr(model_tokenizer, "get_vocab", dict)
        vocab: dict[str, int] = vocab_getter()
        return "[ARGS]" not in vocab

    return model_tokenizer.version < 11
100
101


102
103
104
105
@dataclass
class MistralStreamingResult:
    r"""Bundle of values handed back by
    `MistralToolParser.extract_maybe_reasoning_and_tool_streaming`.
    """

    # Delta to emit for the current chunk, or None when nothing is emitted.
    delta_message: DeltaMessage | None
    # Whether reasoning is considered finished after this chunk.
    reasoning_ended: bool
    # Whether the emitted delta carries tool calls.
    tools_called: bool
    # Text the caller should treat as the current accumulated text.
    current_text: str
    # Token ids the caller should treat as the current accumulated ids.
    current_token_ids: list[int]


class MistralToolParser(ToolParser):
    r"""Tool call parser for Mistral models, intended for use with either:

    - `mistral_common <https://github.com/mistralai/mistral-common/>`_
      (recommended)
    - the `examples/tool_chat_template_mistral.jinja` template.

    Used when `--enable-auto-tool-choice --tool-call-parser mistral` are all
    set.
124
125
    """

126
127
    IS_MISTRAL_TOOL_PARSER = True  # used by vllm.utils.mistral

128
129
130
    # Used to generate correct grammar in `adjust_request`
    model_can_reason: bool = False

131
132
    def __init__(self, tokenizer: TokenizerLike, tools: list[Tool] | None = None):
        """Set up streaming state and locate the tool call token.

        Args:
            tokenizer: Tokenizer forwarded to the base `ToolParser`.
            tools: Optional tools forwarded to the base `ToolParser`.

        Raises:
            RuntimeError: If `[TOOL_CALLS]` is missing from the vocabulary.
        """
        super().__init__(tokenizer, tools)

        tokenizer_is_mistral = is_mistral_tokenizer(self.model_tokenizer)
        if not tokenizer_is_mistral:
            logger.info("Non-Mistral tokenizer detected when using a Mistral model...")

        # State carried across chunks when parsing tool calls in
        # streaming mode.
        self.prev_tool_call_arr: list[dict[str, Any]] = []
        self.current_tool_id: int = -1
        self.streaming_state: StreamingState = StreamingState.WAITING_FOR_TOOL_START

        # Scratch values describing the tool call currently being streamed.
        self.current_tool_name: str | None = None
        self.current_tool_mistral_id: str | None = None
        self.starting_new_tool = False

        self._is_pre_v11 = _is_pre_v11_tokeniser(self.model_tokenizer)
        if self._is_pre_v11:
            # Pre-v11 tool calls are a JSON array; ijson events drive the
            # streaming state through this coroutine.
            state_updater = self.update_stream_state_pre_v11_tokenizer()
            self.parse_coro = ijson.parse_coro(state_updater)

        self.bot_token = "[TOOL_CALLS]"
        self.bot_token_id = self.vocab.get(self.bot_token)
        self.tool_call_regex = re.compile(r"\[{.*}\]", re.DOTALL)

        if self.bot_token_id is None:
            raise RuntimeError(
                "Mistral Tool Parser could not locate the tool call token in "
                "the tokenizer!"
            )

163
164
165
    def adjust_request(
        self, request: ChatCompletionRequest | ResponsesRequest
    ) -> ChatCompletionRequest | ResponsesRequest:
        """Attach a Mistral grammar to `request`, or defer to the base parser.

        The base-class adjustment is used when the tokenizer is not a Mistral
        tokenizer, the request is a `ResponsesRequest`, the tokenizer does not
        support grammars, or the request uses structured-output options or a
        `structural_tag` response format. Otherwise a Lark grammar is built
        from the tools, tool choice and optional JSON schema, and stored in
        `request.structured_outputs`.

        Args:
            request: Request to adjust; it is modified in place.

        Returns:
            The adjusted request.

        Raises:
            ValueError: If `structured_outputs` sets neither `json` nor
                `json_object`, or if `response_format.type` is not `text`,
                `json_object` or `json_schema`.
        """
        unsupported_so_fields = (
            "regex",
            "choice",
            "grammar",
            # whitespace_pattern is not a constraint type but an option;
            # Mistral grammar factory does not support it.
            "whitespace_pattern",
            "structural_tag",
        )
        has_unsupported_so = request.structured_outputs is not None and any(
            getattr(request.structured_outputs, field_name) is not None
            for field_name in unsupported_so_fields
        )
        has_unsupported_response_format = isinstance(request, ResponsesRequest) or (
            request.response_format is not None
            and request.response_format.type == "structural_tag"
        )

        use_base_adjustment = (
            not is_mistral_tokenizer(self.model_tokenizer)
            or isinstance(request, ResponsesRequest)
            or not self.model_tokenizer.supports_grammar
            or has_unsupported_so
            or has_unsupported_response_format
        )
        if use_base_adjustment:
            request = super().adjust_request(request)
            if request.tools and request.tool_choice != "none":
                # Do not skip special tokens when using chat template
                # with Mistral parser as TOOL_CALL token is needed
                # for tool detection.
                # Note: we don't want skip_special_tokens=False
                # with MistralTokenizer as it is incompatible
                request.skip_special_tokens = False
            return request

        # Resolve the JSON schema, if any, from structured outputs first and
        # from the response format otherwise.
        json_schema: dict[str, Any] | None = None
        structured_outputs = request.structured_outputs
        if structured_outputs is not None:
            if structured_outputs.json_object is not None:
                json_schema = _DEFAULT_JSON_SCHEMA
            elif structured_outputs.json is None:
                raise ValueError(
                    "Unsupported request.structured_outputs for MistralToolParser. "
                    "Only `json` and `json_object` are supported."
                )
            elif isinstance(structured_outputs.json, str):
                json_schema = json.loads(structured_outputs.json)
            else:
                json_schema = structured_outputs.json
        elif (
            request.response_format is not None
            and request.response_format.type != "text"
        ):
            response_format = request.response_format
            if response_format.type == "json_object":
                json_schema = _DEFAULT_JSON_SCHEMA
            elif response_format.type == "json_schema":
                json_schema = (
                    response_format.json_schema.json_schema
                    if response_format.json_schema is not None
                    else _DEFAULT_JSON_SCHEMA
                )
            else:
                raise ValueError(
                    "MistralToolParser only accepts `text`, `json_object` or "
                    f"`json_schema`, got {request.response_format=}"
                )
            # Structured Outputs will be defined.
            request.response_format = None

        grammar_factory = self.model_tokenizer.grammar_factory

        # TODO: Once unified parser, improve this.
        # The issue is figuring out when a model is a reasoning one or not.
        template = grammar_factory.select_jinja_template(
            reasoning=self.model_can_reason
        )

        if request.tools is None:
            mistral_tools = None
        else:
            mistral_tools = [
                MistralTool.model_validate(
                    adapt_inplace_to_mistral_tool(tool.model_dump())
                )
                for tool in request.tools
            ]

        requested_choice = request.tool_choice
        tool_choice: MistralToolChoice
        if requested_choice in ("none", "auto", "required"):
            tool_choice = MistralToolChoiceEnum(requested_choice)
        elif requested_choice is None:
            tool_choice = MistralToolChoiceEnum.auto
        else:
            # Anything else is a named tool choice.
            tool_choice = MistralNamedToolChoice.model_validate(
                {
                    "type": "function",
                    "function": {"name": requested_choice.function.name},
                }
            )

        # Rendering grammar is cached in mistral-common given tools, template and mode.
        if tool_choice == MistralToolChoiceEnum.none and json_schema is not None:
            lark_grammar = grammar_factory.get_lark_for_json_schema(
                template=template, json_schema=json_schema
            )
        else:
            lark_grammar = grammar_factory.get_lark_from_jinja(
                template=template,
                mode=tool_choice,
                tools=mistral_tools,
                json_schema=json_schema,
                parallel_tool_calls=request.parallel_tool_calls,
                json_only=False,
            )

        request.structured_outputs = StructuredOutputsParams(grammar=lark_grammar)
        request._grammar_from_tool_parser = True
        return request

289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
    def extract_maybe_reasoning_and_tool_streaming(
        self,
        *,
        reasoning_parser: ReasoningParser | None,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: list[int],
        current_token_ids: list[int],
        output_token_ids: Sequence[int],
        reasoning_ended: bool,
        prompt_is_reasoning_end: bool | None,
        request: ChatCompletionRequest,
    ) -> MistralStreamingResult:
        r"""Streaming extraction with reasoning followed by tool-call parsing.

        This method encapsulates the combined reasoning extraction and
        tool-call streaming logic so that the serving layer only needs a
        thin routing branch.

        The flow is:

        1. If a *reasoning_parser* is present and reasoning has **not** ended,
           extract reasoning tokens.  Pre-v15 models may have pre-filled
           `[THINK]...[/THINK]` in system prompts, so we skip the
           prompt-level reasoning-end check for those.
        2. Once reasoning ends (or if there is no reasoning parser), delegate
           to `extract_tool_calls_streaming` and track whether tools were
           called.

        Args:
            reasoning_parser: Optional reasoning parser instance.
            previous_text: Accumulated text from prior chunks.
            current_text: Full accumulated text including current chunk.
            delta_text: New text in this chunk.
            previous_token_ids: Token ids from prior chunks.
            current_token_ids: Full token ids including current chunk.
            output_token_ids: Raw output token ids from the engine.
            reasoning_ended: Whether reasoning has already ended.
            prompt_is_reasoning_end: Whether the prompt itself ends reasoning.
            request: The originating chat completion request.

        Returns:
            A `MistralStreamingResult` carrying the delta to emit (possibly
            None), the updated `reasoning_ended` flag, whether the delta
            contains tool calls, and the possibly rewritten `current_text`
            and `current_token_ids`.
        """
        delta_message: DeltaMessage | None = None
        tools_called = False
        # Remembered so the transition "reasoning just ended in this call"
        # can be detected further down.
        reasoning_ended_at_entry = reasoning_ended

        # For MistralReasoningParser, only enter the reasoning block when
        # the model has actually emitted a [THINK] token.  Other reasoning
        # parsers always expect thinking to be present.
        expect_thinking = (
            not isinstance(reasoning_parser, MistralReasoningParser)
            or reasoning_parser.start_token_id in current_token_ids
        )
        if reasoning_parser is not None and not reasoning_ended and expect_thinking:
            # Pre-v15 models may have pre-filled [THINK]...[/THINK] in
            # system prompts, so skip the prompt-level reasoning-end
            # check and wait for the output's own end-of-think.
            is_pre_v15 = (
                isinstance(self.model_tokenizer, MistralTokenizer)
                and self.model_tokenizer.version < 15
            )

            if not is_pre_v15 and prompt_is_reasoning_end:
                # The prompt already closed reasoning: everything generated
                # so far is handed to the tool parser.
                reasoning_ended = True
                current_token_ids = list(output_token_ids)
            else:
                delta_message = reasoning_parser.extract_reasoning_streaming(
                    previous_text,
                    current_text,
                    delta_text,
                    previous_token_ids,
                    current_token_ids,
                    output_token_ids,
                )
                if reasoning_parser.is_reasoning_end_streaming(
                    current_token_ids, output_token_ids
                ):
                    reasoning_ended = True
                    current_token_ids = reasoning_parser.extract_content_ids(
                        list(output_token_ids)
                    )
                    # Content produced alongside the end of reasoning is
                    # moved into `current_text` so that it goes through the
                    # tool parser below instead of being emitted as-is.
                    if delta_message and delta_message.content:
                        current_text = delta_message.content
                        delta_message.content = None
                    else:
                        current_text = ""

            # Still reasoning: return the reasoning delta without running
            # the tool parser.
            if not reasoning_ended:
                return MistralStreamingResult(
                    delta_message=delta_message,
                    reasoning_ended=False,
                    tools_called=False,
                    current_text=current_text,
                    current_token_ids=current_token_ids,
                )

        delta_token_ids = list(output_token_ids)

        # On the iteration where reasoning just ended, reset the text/token
        # state so the tool parser sees a clean history instead of the
        # accumulated reasoning text.
        if not reasoning_ended_at_entry and reasoning_ended:
            previous_text = ""
            previous_token_ids = []
            delta_text = current_text
            delta_token_ids = current_token_ids

        delta_message = self.extract_tool_calls_streaming(
            previous_text=previous_text,
            current_text=current_text,
            delta_text=delta_text,
            previous_token_ids=previous_token_ids,
            current_token_ids=current_token_ids,
            delta_token_ids=delta_token_ids,
            request=request,
        )
        if delta_message and delta_message.tool_calls:
            tools_called = True

        return MistralStreamingResult(
            delta_message=delta_message,
            reasoning_ended=reasoning_ended,
            tools_called=tools_called,
            current_text=current_text,
            current_token_ids=current_token_ids,
        )

    @staticmethod
    def build_non_streaming_tool_calls(
        tool_calls: list[FunctionCall] | None,
    ) -> list[ToolCall]:
        r"""Wrap each function call in a `MistralToolCall`.

        A function call that carries an `id` keeps it; otherwise the
        `MistralToolCall` default id is used. `None` or an empty list yields
        an empty list.
        """
        wrapped: list[ToolCall] = []
        for function_call in tool_calls or ():
            if function_call.id:
                wrapped.append(
                    MistralToolCall(id=function_call.id, function=function_call)
                )
            else:
                wrapped.append(MistralToolCall(function=function_call))
        return wrapped

431
432
433
434
435
    def extract_tool_calls(
        self,
        model_output: str,
        request: ChatCompletionRequest,
    ) -> ExtractedToolCallInformation:
        """
        Extract the tool calls from a complete model response.

        Content and tool calls formatting depends on the Mistral's tokenizer version
        used to train the model:

        - < v11: `content[BOT] [{tool_call1},{tool_call2}]`
        - >= v11: `content[BOT]tool_name1{args_call1}[BOT]tool_name2{args_call2}`

        with [BOT] the tool call token.

        Note:
            For tokenizer versions >= v11, tool calls with arguments wrongly formatted
            are still returned as tool calls. This is to allow the model to know it
            tried to make a tool call. It reduces chance of another failure and
            prevents that the context is filled with tool calls wrongly placed in
            assistant message contents.

        Args:
            model_output: Complete text generated by the model.
            request: The originating request (not read by this method).

        Returns:
            The extracted tool calls together with any content that preceded
            the first tool call token.

        Raises:
            ValueError: For tokenizers < v11, if more than one tool call token
                is present in `model_output`.
        """

        def _with_json_arguments(parsed_calls):
            # Keep the name and re-serialize the arguments as a JSON string.
            return [
                {
                    "name": parsed_call["name"],
                    "arguments": json.dumps(
                        parsed_call.get("arguments", {}),
                        ensure_ascii=False,
                    ),
                }
                for parsed_call in parsed_calls
            ]

        # If the tool call token is not present, return a text response
        if self.bot_token not in model_output:
            return ExtractedToolCallInformation(
                tools_called=False, tool_calls=[], content=model_output
            )

        content, *raw_tool_calls = model_output.split(self.bot_token)

        if self._is_pre_v11:
            # < v11: content[BOT] [{tool_call1},{tool_call2}]
            if len(raw_tool_calls) != 1:
                raise ValueError(
                    "Only one BOT token should have been outputted, "
                    f"but got {model_output}."
                )
            stringified_tool_calls = raw_tool_calls[0].strip()
            try:
                # Use raw_decode to parse the first valid JSON value,
                # ignoring trailing tokens the model may emit after
                # the tool call array.
                decoded, _ = json.JSONDecoder().raw_decode(stringified_tool_calls)
            except json.JSONDecodeError:
                # Fall back to the first bracketed span found by the regex.
                try:
                    bracketed = self.tool_call_regex.findall(
                        stringified_tool_calls
                    )[0]
                    tool_calls = _with_json_arguments(json.loads(bracketed))
                except (IndexError, json.JSONDecodeError):
                    logger.exception("Error in extracting tool call from response.")
                    return ExtractedToolCallInformation(
                        tools_called=False,
                        tool_calls=[],
                        content=stringified_tool_calls,
                    )
            else:
                tool_calls = _with_json_arguments(decoded)
        else:
            # >= v11: content[BOT]tool_name1{args_call1}[BOT]tool_name2{args_call2}
            tool_calls = []
            for raw_tool_call in raw_tool_calls:
                args_start = raw_tool_call.find("{")
                if args_start == -1:
                    continue

                # HF tokenizers may include [ARGS] in the text
                tool_name = raw_tool_call[:args_start].replace("[ARGS]", "")
                tool_calls.append(
                    {"name": tool_name, "arguments": raw_tool_call[args_start:]}
                )

        mistral_tool_calls: list[MistralToolCall] = []
        for tool_call in tool_calls:
            function_call = FunctionCall(
                name=tool_call["name"],
                arguments=tool_call.get("arguments", "{}"),
            )
            mistral_tool_calls.append(
                MistralToolCall(type="function", function=function_call)
            )

        return ExtractedToolCallInformation(
            tools_called=True,
            tool_calls=mistral_tool_calls,
            content=content if len(content) > 0 else None,
        )
546
547
548
549
550
551
552
553
554

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> DeltaMessage | None:
        """Turn one streamed chunk into a content or tool call delta.

        Until the tool call token shows up in the accumulated ids or text, the
        chunk is returned as plain content. Afterwards the chunk is handed to
        the parser matching the tokenizer version.

        Returns:
            The delta to emit, or None when nothing should be emitted or when
            the streaming parser raised (the error is logged).
        """
        # if the tool call token is not in the tokens generated so far,
        # append output to contents since it's not a tool
        if (
            self.bot_token_id not in current_token_ids
            and self.bot_token not in current_text
        ):
            return DeltaMessage(content=delta_text)

        # if the tool call token IS in the tokens generated so far, that
        # means we're parsing as tool calls now
        try:
            streaming_extractor = (
                self._extract_tool_calls_streaming_pre_v11_tokenizer
                if self._is_pre_v11
                else self._extract_tool_calls_streaming
            )
            return streaming_extractor(
                delta_text=delta_text,
                delta_token_ids=delta_token_ids,
            )
        except Exception:
            logger.exception("Error trying to handle streaming tool call.")
            return None
580

581
582
583
584
585
586
587
588
589
590
591
592
593
    def _extract_tool_calls_streaming(
        self,
        delta_text: str,
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """
        Extracts tool calls for Mistral models
        doing tool calls of the following format:
        `[TOOL_CALLS]add{"a": 3.5, "b": 4}`

        Args:
            delta_text: Text of the current chunk.
            delta_token_ids: Token ids of the current chunk (not read here).

        Returns:
            A delta carrying leading content and/or tool call deltas, an
            empty `DeltaMessage` when nothing new is produced while arguments
            are being parsed or a tool is complete, or None while the tool
            name is still being accumulated.
        """
        additional_content: str = ""
        if self.streaming_state == StreamingState.WAITING_FOR_TOOL_START:
            # this is the first tool call
            if self.bot_token not in delta_text:
                return DeltaMessage(content=delta_text)
            if not delta_text.startswith(self.bot_token):
                # Split on the FIRST tool call token only. Joining every
                # piece after a full split would drop the later tokens and
                # merge several tool calls contained in one chunk.
                additional_content, _, after_token = delta_text.partition(
                    self.bot_token
                )
                delta_text = self.bot_token + after_token

        delta_tool_calls = self._generate_delta_tool_call(delta_text)
        if not additional_content and len(delta_tool_calls) == 0:
            if self.streaming_state in [
                StreamingState.PARSING_ARGUMENTS,
                StreamingState.PARSING_ARGUMENTS_COMPLETED,
                StreamingState.TOOL_COMPLETE,
                StreamingState.ALL_TOOLS_COMPLETE,
            ]:
                # Return an empty DeltaMessage once the tool calls are all done
                # so that finish_reason gets set.
                return DeltaMessage()
            else:
                # return None when the tool is not likely to be finished
                # This can occur when the name is being parsed for example
                # and we wait for the name to be complete
                # before sending the function name
                return None

        delta = DeltaMessage()
        if additional_content:
            delta.content = additional_content
        if len(delta_tool_calls) > 0:
            delta.tool_calls = delta_tool_calls

        # HACK: serving_chat.py inspects the internal state of tool parsers
        # when determining its final streaming delta, automatically
        # adding autocompleted JSON.
        # These two lines avoid that nonsense while ensuring finish_reason
        # is set to tool_calls when at least one tool is called.
        if delta_tool_calls and not self.prev_tool_call_arr:
            self.prev_tool_call_arr = [{"arguments": {}}]
        return delta

    def _generate_delta_tool_call(self, delta_text: str) -> list[DeltaToolCall]:
        """Advance the streaming state with `delta_text` and build deltas.

        The text may start a new tool call, extend the tool name, extend the
        arguments, or end the current tool call and begin the next one; in
        the last case the remainder is processed recursively.

        Returns:
            The tool call deltas to emit; empty while the tool name is still
            incomplete or when there is nothing to emit.
        """
        if not delta_text:
            return []

        tool_id = None
        starts_new_tool = self.streaming_state not in (
            StreamingState.PARSING_NAME,
            StreamingState.PARSING_ARGUMENTS,
        ) and delta_text.startswith(self.bot_token)
        if starts_new_tool:
            self.current_tool_id += 1
            self.streaming_state = StreamingState.PARSING_NAME
            delta_text = delta_text[len(self.bot_token) :]

        if self.streaming_state == StreamingState.PARSING_NAME:
            if self.current_tool_name is None:
                self.current_tool_name = ""
            # The name stops where the arguments start
            # And the arguments start with the `{` char
            name_piece, brace, after_brace = delta_text.partition("{")
            if not brace:
                # we want to send the tool name once it's complete
                self.current_tool_name += delta_text
                return []

            tool_id = MistralToolCall.generate_random_id()
            # HF tokenizers may include [ARGS] in the text
            self.current_tool_name = (self.current_tool_name + name_piece).replace(
                "[ARGS]", ""
            )
            delta_text = brace + after_brace
            self.streaming_state = StreamingState.PARSING_ARGUMENTS

        if self.streaming_state != StreamingState.PARSING_ARGUMENTS:
            # Should not happen
            return []

        delta_arguments, found_token, after_token = delta_text.partition(
            self.bot_token
        )
        next_function_text = None
        if found_token:
            # current tool call is over
            next_function_text = found_token + after_token
            self.streaming_state = StreamingState.TOOL_COMPLETE

        emitted: list[DeltaToolCall] = []
        if self.current_tool_name or delta_arguments:
            function_payload = DeltaFunctionCall(
                name=self.current_tool_name, arguments=delta_arguments
            ).model_dump(exclude_none=True)
            emitted.append(
                DeltaToolCall(
                    index=self.current_tool_id,
                    type="function",
                    id=tool_id,
                    function=function_payload,
                )
            )
            # The name is only sent with the first delta of a tool call.
            self.current_tool_name = None
        if next_function_text:
            emitted.extend(self._generate_delta_tool_call(next_function_text))
        return emitted

    @ijson.coroutine
    def update_stream_state_pre_v11_tokenizer(self):
        while True:
            (prefix, event, value) = yield

            if prefix == "item" and event == "start_map":
                self.streaming_state = StreamingState.WAITING_FOR_TOOL_KEY
            if prefix == "item" and event == "map_key" and value == "name":
                self.streaming_state = StreamingState.PARSING_NAME
            if prefix == "item.name" and event == "string":
                self.current_tool_name = value
                self.streaming_state = StreamingState.PARSING_NAME_COMPLETED
            if prefix == "item" and event == "map_key" and value == "arguments":
                self.streaming_state = StreamingState.WAITING_FOR_ARGUMENTS_START
            if prefix == "item.arguments" and event == "start_map":
                self.streaming_state = StreamingState.PARSING_ARGUMENTS
            if prefix == "item.arguments" and event == "end_map":
                self.streaming_state = StreamingState.PARSING_ARGUMENTS_COMPLETED
            if prefix == "item" and event == "end_map":
                self.streaming_state = StreamingState.TOOL_COMPLETE
            if prefix == "" and event == "end_array":
                self.streaming_state = StreamingState.ALL_TOOLS_COMPLETE

    def _extract_tool_calls_streaming_pre_v11_tokenizer(
        self,
        delta_text: str,
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """
        Extracts tool calls for Mistral models
        doing tool calls of the following format:
        `[TOOL_CALLS][{"name": "add", "arguments":{"a": 3.5, "b": 4}}`

        The delta is cut into small pieces with `_split_delta` and, until the
        tool array is closed, each piece is pushed into `self.parse_coro`.
        Comparing `self.streaming_state` before and after each push tells
        which part of the tool call the piece belongs to.

        Args:
            delta_text: Newly generated text for this streaming step.
            delta_token_ids: Token ids for this step; only checked for the
                presence of `self.bot_token_id`.

        Returns:
            A `DeltaMessage` carrying content and/or tool call deltas, an
            empty `DeltaMessage` when there is nothing to emit but all tools
            are complete, or `None` when there is nothing to emit yet.
        """
        assert self.parse_coro is not None
        content = None
        delta_tool_calls: list[DeltaToolCall] = []
        current_tool_call: DeltaToolCall = DeltaToolCall(
            index=self.current_tool_id, type="function"
        )
        current_tool_call_modified = False
        if self.bot_token_id in delta_token_ids or self.bot_token in delta_text:
            # this is the first tool call
            if not delta_text.startswith(self.bot_token):
                # Text preceding the bot token is regular content.
                content = delta_text.split(self.bot_token)[0]
            delta_text = "".join(delta_text.split(self.bot_token)[1:])

        # Cut smartly the delta text to catch the ijson events
        # as ijson does not give us the index in the text at each event.
        # We need to cut so that we know
        # where in the text the events are emitted from.
        while delta_text:
            streaming_state_before_parse = self.streaming_state

            if self.streaming_state == StreamingState.WAITING_FOR_TOOL_START:
                delta_to_be_parsed, delta_text = self._split_delta(
                    delta_text=delta_text,
                    stop_after_opening_curly_braces=1,
                )
            elif self.streaming_state == StreamingState.WAITING_FOR_TOOL_KEY:
                # Wait until another key is sent
                # or the current tool is completed
                delta_to_be_parsed, delta_text = self._split_delta(
                    delta_text=delta_text,
                    stop_after_colon=1,
                    stop_after_opening_curly_braces=1,
                    # if the tool ends, we want to separate
                    # at the start of the next tool
                )
            elif self.streaming_state == StreamingState.PARSING_NAME:
                delta_to_be_parsed, delta_text = self._split_delta(
                    delta_text=delta_text,
                    stop_after_comma=1,
                    stop_after_closing_brackets=1,
                )
            elif self.streaming_state == StreamingState.WAITING_FOR_ARGUMENTS_START:
                delta_to_be_parsed, delta_text = self._split_delta(
                    delta_text=delta_text,
                    stop_after_opening_curly_braces=1,
                )
            elif self.streaming_state == StreamingState.PARSING_ARGUMENTS:
                delta_to_be_parsed, delta_text = self._split_delta(
                    delta_text=delta_text,
                    stop_after_closing_curly_braces=1,
                    # we could be more clever
                    # by listening to item.arguments.* start_map events
                    # and know how many curly braces we can allow
                )
            elif self.streaming_state in [
                StreamingState.PARSING_ARGUMENTS_COMPLETED,
                StreamingState.PARSING_NAME_COMPLETED,
            ]:
                delta_to_be_parsed, delta_text = self._split_delta(
                    delta_text=delta_text,
                    stop_after_closing_curly_braces=1,
                    stop_after_closing_brackets=1,
                )
            elif self.streaming_state == StreamingState.TOOL_COMPLETE:
                delta_to_be_parsed, delta_text = self._split_delta(
                    delta_text=delta_text,
                    stop_after_opening_curly_braces=1,
                    stop_after_closing_brackets=1,
                )
            elif self.streaming_state == StreamingState.ALL_TOOLS_COMPLETE:
                # Everything after the tool array is regular content.
                # Append rather than overwrite, so content found before the
                # bot token earlier in this same delta is not lost.
                content = (content or "") + delta_text
                delta_text = ""
            else:
                delta_to_be_parsed = delta_text
                delta_text = ""

            if self.streaming_state != StreamingState.ALL_TOOLS_COMPLETE:
                self.parse_coro.send(delta_to_be_parsed.encode("utf-8"))

            # Given the parsed text and the possible streaming state change,
            # let's add to the tool delta
            if (
                (streaming_state_before_parse != self.streaming_state)
                and streaming_state_before_parse
                in [StreamingState.WAITING_FOR_TOOL_START, StreamingState.TOOL_COMPLETE]
                and self.streaming_state
                not in [
                    StreamingState.ALL_TOOLS_COMPLETE,
                    StreamingState.TOOL_COMPLETE,
                    StreamingState.WAITING_FOR_TOOL_START,
                ]
            ):
                # starting a new tool call
                if current_tool_call_modified:
                    # Flush the previous tool call before opening the next one.
                    if self.current_tool_mistral_id is not None:
                        current_tool_call.id = self.current_tool_mistral_id
                        self.current_tool_mistral_id = None
                    delta_tool_calls.append(current_tool_call)
                current_tool_call_modified = False
                self.current_tool_id += 1
                self.current_tool_mistral_id = MistralToolCall.generate_random_id()
                current_tool_call = DeltaToolCall(
                    index=self.current_tool_id,
                    type="function",
                )
            if current_tool_call.function is None:
                current_tool_call.function = DeltaFunctionCall()

            if self.current_tool_name is not None:
                # we have the complete tool name
                current_tool_call_modified = True
                current_tool_call.function.name = self.current_tool_name
                self.current_tool_name = None
            if self.streaming_state == StreamingState.PARSING_NAME_COMPLETED:
                self.streaming_state = StreamingState.WAITING_FOR_TOOL_KEY
            if self.streaming_state in [
                StreamingState.PARSING_ARGUMENTS,
                StreamingState.PARSING_ARGUMENTS_COMPLETED,
            ]:
                if self.streaming_state == StreamingState.PARSING_ARGUMENTS_COMPLETED:
                    self.streaming_state = StreamingState.WAITING_FOR_TOOL_KEY
                # the delta_to_be_parsed is part of arguments.
                current_tool_call_modified = True
                if current_tool_call.function.arguments is None:
                    current_tool_call.function.arguments = delta_to_be_parsed
                else:
                    current_tool_call.function.arguments += delta_to_be_parsed
                if streaming_state_before_parse != StreamingState.PARSING_ARGUMENTS:
                    # It's the first chunk of arg. let's lstrip it
                    current_tool_call.function.arguments = (
                        current_tool_call.function.arguments.lstrip()
                    )

        if current_tool_call_modified:
            # The tool id is attached only once, to the first delta emitted
            # for a given tool call.
            if self.current_tool_mistral_id is not None:
                current_tool_call.id = self.current_tool_mistral_id
                self.current_tool_mistral_id = None
            delta_tool_calls.append(current_tool_call)

        # HACK: serving_chat.py inspects the internal state of tool parsers
        # when determining its final streaming delta, automatically
        # adding autocompleted JSON.
        # These two lines avoid that nonsense while ensuring finish_reason
        # is set to tool_calls when at least one tool is called.
        if delta_tool_calls and not self.prev_tool_call_arr:
            self.prev_tool_call_arr = [{"arguments": {}}]

        if content or delta_tool_calls:
            delta_message = DeltaMessage()
            if content:
                delta_message.content = content
            if delta_tool_calls:
                delta_message.tool_calls = delta_tool_calls
            return delta_message

        if self.streaming_state == StreamingState.ALL_TOOLS_COMPLETE:
            return DeltaMessage()
        return None
885

886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
    def _split_delta(
        self,
        delta_text: str,
        stop_after_quotes: int = -1,
        stop_after_opening_curly_braces: int = -1,
        stop_after_closing_curly_braces: int = -1,
        stop_after_closing_brackets: int = -1,
        stop_after_colon: int = -1,
        stop_after_comma: int = -1,
    ) -> tuple[str, str]:
        """Split ``delta_text`` right after the first exhausted delimiter budget.

        Each ``stop_after_*`` argument is a budget for one delimiter kind. The
        text is scanned left to right; every delimiter met decrements its
        budget, and the scan stops just after the delimiter that brings its
        budget to exactly zero. A budget that starts at zero or below never
        reaches zero, so that delimiter never triggers a split.

        The scan is purely character based: delimiters are counted wherever
        they appear in the text.

        Args:
            delta_text: Text to split.
            stop_after_quotes: Budget shared by ``"`` and ``'``.
            stop_after_opening_curly_braces: Budget for ``{``.
            stop_after_closing_curly_braces: Budget for ``}``.
            stop_after_closing_brackets: Budget for ``]``.
            stop_after_colon: Budget for ``:``.
            stop_after_comma: Budget for ``,``.

        Returns:
            ``(head, tail)`` where ``head`` ends with the delimiter that
            triggered the split and ``tail`` is the rest. If no budget is
            exhausted, ``(delta_text, "")``.
        """
        remaining = {
            '"': stop_after_quotes,
            "{": stop_after_opening_curly_braces,
            "}": stop_after_closing_curly_braces,
            "]": stop_after_closing_brackets,
            ":": stop_after_colon,
            ",": stop_after_comma,
        }
        for i, c in enumerate(delta_text):
            # Single and double quotes draw from the same budget.
            delimiter = '"' if c == "'" else c
            if delimiter not in remaining:
                continue
            remaining[delimiter] -= 1
            if remaining[delimiter] == 0:
                return (delta_text[: i + 1], delta_text[i + 1 :])

        return (delta_text, "")