mistral_tool_parser.py 37.4 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
5
from __future__ import annotations

6
import json
7
from collections.abc import Sequence
8
from dataclasses import dataclass
9
from enum import Enum, auto
10
11
from random import choices
from string import ascii_letters, digits
12
from typing import TYPE_CHECKING, Any
13

14
import ijson
15
import regex as re
16
17
18
19
20
21
22
23
24
25
26
27
from mistral_common.protocol.instruct.tool_calls import (
    NamedToolChoice as MistralNamedToolChoice,
)
from mistral_common.protocol.instruct.tool_calls import (
    Tool as MistralTool,
)
from mistral_common.protocol.instruct.tool_calls import (
    ToolChoice as MistralToolChoice,
)
from mistral_common.protocol.instruct.tool_calls import (
    ToolChoiceEnum as MistralToolChoiceEnum,
)
28
from pydantic import Field
29

30
from vllm.entrypoints.openai.chat_completion.protocol import (
31
    ChatCompletionRequest,
32
33
)
from vllm.entrypoints.openai.engine.protocol import (
34
35
36
37
38
39
40
    DeltaFunctionCall,
    DeltaMessage,
    DeltaToolCall,
    ExtractedToolCallInformation,
    FunctionCall,
    ToolCall,
)
41
from vllm.entrypoints.openai.responses.protocol import ResponsesRequest
42
from vllm.logger import init_logger
43
from vllm.reasoning.mistral_reasoning_parser import MistralReasoningParser
44
from vllm.sampling_params import StructuredOutputsParams
45
from vllm.tokenizers import TokenizerLike
46
from vllm.tokenizers.mistral import MistralTokenizer, adapt_inplace_to_mistral_tool
47
from vllm.tool_parsers.abstract_tool_parser import (
48
    Tool,
49
50
    ToolParser,
)
51
from vllm.utils.mistral import is_mistral_tokenizer
52

53
54
55
if TYPE_CHECKING:
    from vllm.reasoning import ReasoningParser

56
57
logger = init_logger(__name__)

58
59
ALPHANUMERIC = ascii_letters + digits

60
61
_DEFAULT_JSON_SCHEMA = {"anyOf": [{"type": "object"}, {"type": "array"}]}

62

63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
class StreamingState(Enum):
    """Phases the streaming tool-call parser moves through."""

    # No tool call has been started yet.
    WAITING_FOR_TOOL_START = auto()
    # Inside a tool object, waiting for the "name" or "arguments" key to be
    # complete.
    WAITING_FOR_TOOL_KEY = auto()
    # The tool name is being received.
    PARSING_NAME = auto()
    # The full tool name has been received.
    PARSING_NAME_COMPLETED = auto()
    # The "arguments" key was seen, its value has not started yet.
    WAITING_FOR_ARGUMENTS_START = auto()
    # The arguments object is being received.
    PARSING_ARGUMENTS = auto()
    # The arguments object has been closed.
    PARSING_ARGUMENTS_COMPLETED = auto()
    # The current tool call is finished.
    TOOL_COMPLETE = auto()
    # The enclosing array of tool calls has been closed.
    ALL_TOOLS_COMPLETE = auto()


79
class MistralToolCall(ToolCall):
    """Tool call whose `id` follows Mistral's 9-character alphanumeric format."""

    id: str = Field(default_factory=lambda: MistralToolCall.generate_random_id())

    @staticmethod
    def generate_random_id() -> str:
        """Return a random 9-character id drawn from ASCII letters and digits."""
        # Mistral Tool Call Ids must be alphanumeric with a length of 9.
        # https://github.com/mistralai/mistral-common/blob/21ee9f6cee3441e9bb1e6ed2d10173f90bd9b94b/src/mistral_common/protocol/instruct/validator.py#L299
        return "".join(choices(ALPHANUMERIC, k=9))

    @staticmethod
    def is_valid_id(id: str) -> bool:
        """Return True when `id` is exactly 9 ASCII letters/digits."""
        # `str.isalnum()` alone also accepts non-ASCII letters and digits
        # (e.g. "é" or "١"), which `generate_random_id` can never produce;
        # restrict the check to ASCII.
        return len(id) == 9 and id.isascii() and id.isalnum()

92

93
def _is_pre_v11_tokeniser(model_tokenizer: TokenizerLike) -> bool:
    """Return False only for a Mistral tokenizer whose version is >= 11.

    Any tokenizer that is not a Mistral tokenizer is treated as pre-v11.
    """
    return not (is_mistral_tokenizer(model_tokenizer) and model_tokenizer.version >= 11)
95
96


97
98
99
100
@dataclass
class MistralStreamingResult:
    r"""Encapsulates the mutable state returned from
    `MistralToolParser.extract_maybe_reasoning_and_tool_streaming`.

    Attributes:
        delta_message: Delta to emit for the current chunk, or `None` when
            there is nothing to emit.
        reasoning_ended: Whether reasoning is over after this chunk.
        tools_called: Whether the returned delta carries tool calls.
        current_text: Accumulated text the caller should carry forward; it
            is rewritten on the chunk where reasoning ends.
        current_token_ids: Accumulated token ids the caller should carry
            forward; they are rewritten on the chunk where reasoning ends.
    """

    delta_message: DeltaMessage | None
    reasoning_ended: bool
    tools_called: bool
    current_text: str
    current_token_ids: list[int]


class MistralToolParser(ToolParser):
    r"""Tool call parser for Mistral models, intended for use with either:

    - `mistral_common <https://github.com/mistralai/mistral-common/>`_
      (recommended)
    - the `examples/tool_chat_template_mistral.jinja` template.

    Used when `--enable-auto-tool-choice --tool-call-parser mistral` are all
    set.
119
120
    """

121
122
123
    # Used to generate correct grammar in `adjust_request`
    model_can_reason: bool = False

124
125
    def __init__(self, tokenizer: TokenizerLike, tools: list[Tool] | None = None):
        """Initialise the streaming state and locate the tool-call token.

        Args:
            tokenizer: Tokenizer of the served model, forwarded to the base
                `ToolParser`.
            tools: Optional tool definitions, forwarded to the base
                `ToolParser`.

        Raises:
            RuntimeError: If `[TOOL_CALLS]` is not part of the vocabulary.
        """
        super().__init__(tokenizer, tools)

        if not is_mistral_tokenizer(self.model_tokenizer):
            logger.info("Non-Mistral tokenizer detected when using a Mistral model...")

        # Evaluate the tokenizer version check once and reuse the result
        # everywhere the output format (pre-v11 vs >= v11) matters.
        self._is_pre_v11 = _is_pre_v11_tokeniser(self.model_tokenizer)

        # initialize properties used for state when parsing tool calls in
        # streaming mode
        self.prev_tool_call_arr: list[dict[str, Any]] = []
        self.current_tool_id: int = -1
        self.streaming_state: StreamingState = StreamingState.WAITING_FOR_TOOL_START

        # Name of the tool call currently being streamed.
        self.current_tool_name: str | None = None
        self.current_tool_mistral_id: str | None = None
        self.starting_new_tool = False

        # Incremental JSON parser, only needed for the pre-v11 format. The
        # attribute is always defined so that the `is not None` assertion in
        # the pre-v11 streaming path fails cleanly instead of raising
        # AttributeError.
        self.parse_coro: Any = None
        if self._is_pre_v11:
            self.parse_coro = ijson.parse_coro(
                self.update_stream_state_pre_v11_tokenizer()
            )

        self.bot_token = "[TOOL_CALLS]"
        self.bot_token_id = self.vocab.get(self.bot_token)
        self.tool_call_regex = re.compile(r"\[{.*}\]", re.DOTALL)

        if self.bot_token_id is None:
            raise RuntimeError(
                "Mistral Tool Parser could not locate the tool call token in "
                "the tokenizer!"
            )

156
157
158
    def adjust_request(
        self, request: ChatCompletionRequest | ResponsesRequest
    ) -> ChatCompletionRequest | ResponsesRequest:
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
        so_non_supported_attributes = [
            "regex",
            "choice",
            "grammar",
            # whitespace_pattern is not a constraint type but an option;
            # Mistral grammar factory does not support it.
            "whitespace_pattern",
            "structural_tag",
        ]
        any_so_non_supported_active = request.structured_outputs is not None and any(
            getattr(request.structured_outputs, attribute) is not None
            for attribute in so_non_supported_attributes
        )
        response_format_non_supported_active = (
            isinstance(request, ResponsesRequest)
            or request.response_format is not None
            and request.response_format.type == "structural_tag"
        )

178
        if (
179
            not is_mistral_tokenizer(self.model_tokenizer)
180
181
182
183
            or isinstance(request, ResponsesRequest)
            or not self.model_tokenizer.supports_grammar
            or any_so_non_supported_active
            or response_format_non_supported_active
184
        ):
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
            request = super().adjust_request(request)
            if request.tools and request.tool_choice != "none":
                # Do not skip special tokens when using chat template
                # with Mistral parser as TOOL_CALL token is needed
                # for tool detection.
                # Note: we don't want skip_special_tokens=False
                # with MistralTokenizer as it is incompatible
                request.skip_special_tokens = False
            return request

        json_schema: dict[str, Any] | None = None
        if request.structured_outputs is not None:
            if request.structured_outputs.json_object is not None:
                json_schema = _DEFAULT_JSON_SCHEMA
            elif request.structured_outputs.json is not None:
                if isinstance(request.structured_outputs.json, str):
                    json_schema = json.loads(request.structured_outputs.json)
                else:
                    json_schema = request.structured_outputs.json
            else:
                raise ValueError(
                    "Unsupported request.structured_outputs for MistralToolParser. "
                    "Only `json` and `json_object` are supported."
                )
        elif (
            request.response_format is not None
            and request.response_format.type != "text"
        ):
            if request.response_format.type == "json_object":
                json_schema = _DEFAULT_JSON_SCHEMA
            elif request.response_format.type == "json_schema":
                if request.response_format.json_schema is not None:
                    json_schema = request.response_format.json_schema.json_schema
                else:
                    json_schema = _DEFAULT_JSON_SCHEMA
            else:
                raise ValueError(
                    "MistralToolParser only accepts `text`, `json_object` or "
                    f"`json_schema`, got {request.response_format=}"
                )
            # Structured Outputs will be defined.
            request.response_format = None

        grammar_factory = self.model_tokenizer.grammar_factory

        # TODO: Once unified parser, improve this.
        # The issue is figuring out when a model is a reasoning one or not.
        template = grammar_factory.select_jinja_template(
            reasoning=self.model_can_reason
        )

236
        mistral_tools = (
237
            [
238
239
240
                MistralTool.model_validate(
                    adapt_inplace_to_mistral_tool(tool.model_dump())
                )
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
                for tool in request.tools
            ]
            if request.tools is not None
            else None
        )

        tool_choice: MistralToolChoice
        match request.tool_choice:
            case "none" | "auto" | "required":
                tool_choice = MistralToolChoiceEnum(request.tool_choice)
            case None:
                tool_choice = MistralToolChoiceEnum.auto
            # _ == Named tool choice
            case _:
                tool_choice = MistralNamedToolChoice.model_validate(
                    {
                        "type": "function",
                        "function": {"name": request.tool_choice.function.name},
                    }
                )

        # Rendering grammar is cached in mistral-common given tools, template and mode.
        match tool_choice, json_schema is not None:
            case MistralToolChoiceEnum.none, True:
                lark_grammar = grammar_factory.get_lark_for_json_schema(
                    template=template, json_schema=json_schema
                )
            case _, _:
                lark_grammar = grammar_factory.get_lark_from_jinja(
                    template=template,
                    mode=tool_choice,
272
                    tools=mistral_tools,
273
274
275
276
277
278
                    json_schema=json_schema,
                    parallel_tool_calls=request.parallel_tool_calls,
                    json_only=False,
                )

        request.structured_outputs = StructuredOutputsParams(grammar=lark_grammar)
279
        request._grammar_from_tool_parser = True
280
281
        return request

282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
    def extract_maybe_reasoning_and_tool_streaming(
        self,
        *,
        reasoning_parser: ReasoningParser | None,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: list[int],
        current_token_ids: list[int],
        output_token_ids: Sequence[int],
        reasoning_ended: bool,
        prompt_is_reasoning_end: bool | None,
        request: ChatCompletionRequest,
    ) -> MistralStreamingResult:
        r"""Streaming extraction with reasoning followed by tool-call parsing.

        This method encapsulates the combined reasoning extraction and
        tool-call streaming logic so that the serving layer only needs a
        thin routing branch.

        The flow is:

        1. If a *reasoning_parser* is present and reasoning has **not** ended,
           extract reasoning tokens.  Pre-v15 models may have pre-filled
           `[THINK]...[/THINK]` in system prompts, so we skip the
           prompt-level reasoning-end check for those.
        2. Once reasoning ends (or if there is no reasoning parser), delegate
           to `extract_tool_calls_streaming` and track whether tools were
           called.

        Args:
            reasoning_parser: Optional reasoning parser instance.
            previous_text: Accumulated text from prior chunks.
            current_text: Full accumulated text including current chunk.
            delta_text: New text in this chunk.
            previous_token_ids: Token ids from prior chunks.
            current_token_ids: Full token ids including current chunk.
            output_token_ids: Raw output token ids from the engine.
            reasoning_ended: Whether reasoning has already ended.
            prompt_is_reasoning_end: Whether the prompt itself ends reasoning.
            request: The originating chat completion request.

        Returns:
            A `MistralStreamingResult` holding the delta to emit, the updated
            `reasoning_ended` flag, whether the delta contains tool calls, and
            the (possibly rewritten) `current_text` / `current_token_ids`.
        """
        delta_message: DeltaMessage | None = None
        tools_called = False
        # Remember the incoming flag to detect the chunk on which reasoning
        # ends.
        reasoning_ended_at_entry = reasoning_ended

        # For MistralReasoningParser, only enter the reasoning block when
        # the model has actually emitted a [THINK] token.  Other reasoning
        # parsers always expect thinking to be present.
        expect_thinking = (
            not isinstance(reasoning_parser, MistralReasoningParser)
            or reasoning_parser.start_token_id in current_token_ids
        )
        if reasoning_parser is not None and not reasoning_ended and expect_thinking:
            # Pre-v15 models may have pre-filled [THINK]...[/THINK] in
            # system prompts, so skip the prompt-level reasoning-end
            # check and wait for the output's own end-of-think.
            is_pre_v15 = (
                isinstance(self.model_tokenizer, MistralTokenizer)
                and self.model_tokenizer.version < 15
            )

            if not is_pre_v15 and prompt_is_reasoning_end:
                reasoning_ended = True
                current_token_ids = list(output_token_ids)
            else:
                delta_message = reasoning_parser.extract_reasoning_streaming(
                    previous_text,
                    current_text,
                    delta_text,
                    previous_token_ids,
                    current_token_ids,
                    output_token_ids,
                )
                if reasoning_parser.is_reasoning_end_streaming(
                    current_token_ids, output_token_ids
                ):
                    reasoning_ended = True
                    current_token_ids = reasoning_parser.extract_content_ids(
                        list(output_token_ids)
                    )
                    # Content produced in the chunk where reasoning ends is
                    # moved into `current_text` so that it is handed to the
                    # tool parser below instead of being emitted here.
                    if delta_message and delta_message.content:
                        current_text = delta_message.content
                        delta_message.content = None
                    else:
                        current_text = ""

            # Still reasoning: return the reasoning delta without running
            # the tool parser.
            if not reasoning_ended:
                return MistralStreamingResult(
                    delta_message=delta_message,
                    reasoning_ended=False,
                    tools_called=False,
                    current_text=current_text,
                    current_token_ids=current_token_ids,
                )

        # By default the tool parser receives the raw engine output ids as
        # its delta.
        delta_token_ids = list(output_token_ids)

        # On the iteration where reasoning just ended, reset the text/token
        # state so the tool parser sees a clean history instead of the
        # accumulated reasoning text.
        if not reasoning_ended_at_entry and reasoning_ended:
            previous_text = ""
            previous_token_ids = []
            delta_text = current_text
            delta_token_ids = current_token_ids

        delta_message = self.extract_tool_calls_streaming(
            previous_text=previous_text,
            current_text=current_text,
            delta_text=delta_text,
            previous_token_ids=previous_token_ids,
            current_token_ids=current_token_ids,
            delta_token_ids=delta_token_ids,
            request=request,
        )
        if delta_message and delta_message.tool_calls:
            tools_called = True

        return MistralStreamingResult(
            delta_message=delta_message,
            reasoning_ended=reasoning_ended,
            tools_called=tools_called,
            current_text=current_text,
            current_token_ids=current_token_ids,
        )

    @staticmethod
    def build_non_streaming_tool_calls(
        tool_calls: list[FunctionCall] | None,
    ) -> list[ToolCall]:
        r"""Wrap each function call in a `MistralToolCall` for non-streaming use.

        A function call that already carries an id keeps it; otherwise the
        `MistralToolCall` default id is used. `None` or an empty list gives
        an empty list.
        """
        wrapped: list[ToolCall] = []
        for function_call in tool_calls or []:
            if function_call.id:
                wrapped.append(
                    MistralToolCall(id=function_call.id, function=function_call)
                )
            else:
                wrapped.append(MistralToolCall(function=function_call))
        return wrapped

424
425
426
427
428
    def extract_tool_calls(
        self,
        model_output: str,
        request: ChatCompletionRequest,
    ) -> ExtractedToolCallInformation:
        """
        Extract the tool calls from a complete model response.

        Content and tool calls formatting depends on the Mistral's tokenizer version
        used to train the model:

        - < v11: `content[BOT] [{tool_call1},{tool_call2}]`
        - >= v11: `content[BOT]tool_name1{args_call1}[BOT]tool_name2{args_call2}`

        with [BOT] the tool call token.

        Note:
            For tokenizer versions >= v11, tool calls with arguments wrongly formatted
            are still returned as tool calls. This is to allow the model to know it
            tried to make a tool call. It reduces chance of another failure and
            prevents that the context is filled with tool calls wrongly placed in
            assistant message contents.

        Args:
            model_output: The complete text generated by the model.
            request: The originating request (not used by this parser).

        Returns:
            The extracted tool calls together with any content that preceded
            the first tool call token. When no tool call token is present, or
            when the pre-v11 payload cannot be decoded, a text-only result is
            returned.

        Raises:
            ValueError: For tokenizer versions < v11, if the output contains
                more than one tool call token.
        """
        # If the tool call token is not present, return a text response
        if self.bot_token not in model_output:
            return ExtractedToolCallInformation(
                tools_called=False, tool_calls=[], content=model_output
            )

        content_and_raw_tool_calls = model_output.split(self.bot_token)
        content = content_and_raw_tool_calls[0]
        raw_tool_calls = content_and_raw_tool_calls[1:]

        # >= v11: content[BOT]tool_name1{args_call1}[BOT]tool_name2{args_call2}
        if not self._is_pre_v11:
            tool_calls = []
            for raw_tool_call in raw_tool_calls:
                if "{" not in raw_tool_call:
                    continue

                # The name stops where the arguments start.
                end_name = raw_tool_call.find("{")
                tool_name, args = (
                    raw_tool_call[:end_name],
                    raw_tool_call[end_name:],
                )

                tool_calls.append({"name": tool_name, "arguments": args})

        # < v11: content[BOT] [{tool_call1},{tool_call2}]
        else:
            if len(raw_tool_calls) != 1:
                raise ValueError(
                    "Only one BOT token should have been outputted, "
                    f"but got {model_output}."
                )
            stringified_tool_calls = raw_tool_calls[0].strip()
            try:
                parsed_tool_calls = json.loads(stringified_tool_calls)
            except json.JSONDecodeError:
                # use a regex to find the part corresponding to the tool call.
                # NOTE: This use case should not happen if the model is trained
                # correctly. It's an easy possible fix so it's included, but
                # can be brittle for very complex / highly nested tool calls
                try:
                    raw_tool_call = self.tool_call_regex.findall(
                        stringified_tool_calls
                    )[0]
                    parsed_tool_calls = json.loads(raw_tool_call)
                except (IndexError, json.JSONDecodeError):
                    logger.exception("Error in extracting tool call from response.")
                    # If raw decoding and decoding post regex rule fails, then just
                    # return content.
                    return ExtractedToolCallInformation(
                        tools_called=False,
                        tool_calls=[],
                        content=stringified_tool_calls,
                    )

            # Serialise the arguments back to a JSON string on BOTH decoding
            # paths. Doing this only in the `else` of the outer `try` would
            # skip it whenever the regex fallback succeeded, handing a dict
            # to `FunctionCall.arguments`.
            tool_calls = [
                {
                    "name": tool_call["name"],
                    "arguments": json.dumps(
                        tool_call["arguments"], ensure_ascii=False
                    ),
                }
                for tool_call in parsed_tool_calls
            ]

        mistral_tool_calls: list[MistralToolCall] = [
            MistralToolCall(
                type="function",
                function=FunctionCall(
                    name=tool_call["name"],
                    arguments=tool_call["arguments"],
                ),
            )
            for tool_call in tool_calls
        ]

        return ExtractedToolCallInformation(
            tools_called=True,
            tool_calls=mistral_tool_calls,
            content=content if len(content) > 0 else None,
        )
529
530
531
532
533
534
535
536
537

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> DeltaMessage | None:
        """Route one streamed chunk to the parser matching the tokenizer version.

        Args:
            previous_text: Accumulated text from prior chunks (unused here).
            current_text: Full accumulated text including this chunk.
            delta_text: New text in this chunk.
            previous_token_ids: Token ids from prior chunks (unused here).
            current_token_ids: Full token ids including this chunk.
            delta_token_ids: Token ids of this chunk.
            request: The originating request (unused here).

        Returns:
            A content-only delta while no tool call token has been generated;
            otherwise whatever the version-specific streaming parser returns.
            `None` is returned when that parser raises; the error is logged.
        """
        has_bot_token = (
            self.bot_token_id in current_token_ids or self.bot_token in current_text
        )
        if not has_bot_token:
            # if the tool call token is not in the tokens generated so far,
            # append output to contents since it's not a tool
            return DeltaMessage(content=delta_text)

        # if the tool call token IS in the tokens generated so far, that
        # means we're parsing as tool calls now
        try:
            # Use the flag cached in `__init__`, as `extract_tool_calls`
            # does, instead of re-evaluating the tokenizer check per chunk.
            if self._is_pre_v11:
                return self._extract_tool_calls_streaming_pre_v11_tokenizer(
                    delta_text=delta_text,
                    delta_token_ids=delta_token_ids,
                )
            return self._extract_tool_calls_streaming(
                delta_text=delta_text, delta_token_ids=delta_token_ids
            )
        except Exception:
            logger.exception("Error trying to handle streaming tool call.")
            return None
563

564
565
566
567
568
569
570
571
572
573
574
575
576
    def _extract_tool_calls_streaming(
        self,
        delta_text: str,
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """
        Extracts tool calls for Mistral models
        doing tool calls of the following format:
        `[TOOL_CALLS]add{"a": 3.5, "b": 4}`

        Args:
            delta_text: New text in this chunk.
            delta_token_ids: Token ids of this chunk (unused here).

        Returns:
            A delta with any content preceding the first tool call token
            and/or the tool-call fragments found in this chunk; an empty
            delta when nothing new is produced while arguments are being
            parsed or tools are complete; `None` while nothing can be emitted
            yet (for example while the tool name is still incomplete).
        """
        additional_content: str = ""
        if self.streaming_state == StreamingState.WAITING_FOR_TOOL_START:
            # this is the first tool call
            if self.bot_token not in delta_text:
                return DeltaMessage(content=delta_text)
            if not delta_text.startswith(self.bot_token):
                # Split on the FIRST tool call token only. Joining every
                # piece of `split(bot_token)` would drop the later tokens
                # and merge several tool calls of one delta into one.
                additional_content, _, remainder = delta_text.partition(
                    self.bot_token
                )
                delta_text = self.bot_token + remainder

        delta_tool_calls = self._generate_delta_tool_call(delta_text)
        if not additional_content and len(delta_tool_calls) == 0:
            if self.streaming_state in [
                StreamingState.PARSING_ARGUMENTS,
                StreamingState.PARSING_ARGUMENTS_COMPLETED,
                StreamingState.TOOL_COMPLETE,
                StreamingState.ALL_TOOLS_COMPLETE,
            ]:
                # Return an empty DeltaMessage once the tool calls are all done
                # so that finish_reason gets set.
                return DeltaMessage()
            else:
                # return None when the tool is not likely to be finished
                # This can occur when the name is being parsed for example
                # and we wait for the name to be complete
                # before sending the function name
                return None

        delta = DeltaMessage()
        if additional_content:
            delta.content = additional_content
        if len(delta_tool_calls) > 0:
            delta.tool_calls = delta_tool_calls

        # HACK: serving_chat.py inspects the internal state of tool parsers
        # when determining its final streaming delta, automatically
        # adding autocompleted JSON.
        # These two lines avoid that nonsense while ensuring finish_reason
        # is set to tool_calls when at least one tool is called.
        if delta_tool_calls and not self.prev_tool_call_arr:
            self.prev_tool_call_arr = [{"arguments": {}}]
        return delta

    def _generate_delta_tool_call(self, delta_text: str) -> list[DeltaToolCall]:
        """Turn a text fragment into tool-call deltas for the >= v11 format.

        The fragment is consumed according to `self.streaming_state`: a
        leading tool call token starts a new tool, the name is buffered until
        the opening `{` of the arguments is seen, and argument text is
        forwarded as it arrives. When the fragment contains another tool call
        token, the remainder is processed recursively.

        Args:
            delta_text: The text fragment to process.

        Returns:
            The tool-call deltas produced by this fragment; empty while the
            tool name is still incomplete or when there is nothing to emit.
        """
        if not delta_text:
            return []
        tool_id = None
        if self.streaming_state not in [
            StreamingState.PARSING_NAME,
            StreamingState.PARSING_ARGUMENTS,
        ] and delta_text.startswith(self.bot_token):
            self.current_tool_id += 1
            self.streaming_state = StreamingState.PARSING_NAME
            delta_text = delta_text.replace(self.bot_token, "", 1)
        if self.streaming_state == StreamingState.PARSING_NAME:
            if self.current_tool_name is None:
                self.current_tool_name = ""
            # The name stops where the arguments start
            # And the arguments start with the `{` char
            if "{" in delta_text:
                tool_id = MistralToolCall.generate_random_id()
                arguments_start = delta_text.index("{")
                self.current_tool_name += delta_text[:arguments_start]
                delta_text = delta_text[arguments_start:]
                self.streaming_state = StreamingState.PARSING_ARGUMENTS
            else:
                # we want to send the tool name once it's complete
                self.current_tool_name += delta_text
                return []
        if self.streaming_state == StreamingState.PARSING_ARGUMENTS:
            # Everything before the next tool call token (if any) belongs to
            # the current tool's arguments.
            delta_arguments, separator, remainder = delta_text.partition(
                self.bot_token
            )
            next_function_text = None
            if separator:
                # current tool call is over
                next_function_text = separator + remainder
                self.streaming_state = StreamingState.TOOL_COMPLETE
            ret: list[DeltaToolCall] = []
            if self.current_tool_name or delta_arguments:
                ret.append(
                    DeltaToolCall(
                        index=self.current_tool_id,
                        type="function",
                        id=tool_id,
                        function=DeltaFunctionCall(
                            name=self.current_tool_name, arguments=delta_arguments
                        ).model_dump(exclude_none=True),
                    )
                )
                # The name is only sent with the first delta of a tool call.
                self.current_tool_name = None
            if next_function_text:
                ret += self._generate_delta_tool_call(next_function_text)
            return ret
        # Should not happen
        return []

    @ijson.coroutine
    def update_stream_state_pre_v11_tokenizer(self):
        while True:
            (prefix, event, value) = yield

            if prefix == "item" and event == "start_map":
                self.streaming_state = StreamingState.WAITING_FOR_TOOL_KEY
            if prefix == "item" and event == "map_key" and value == "name":
                self.streaming_state = StreamingState.PARSING_NAME
            if prefix == "item.name" and event == "string":
                self.current_tool_name = value
                self.streaming_state = StreamingState.PARSING_NAME_COMPLETED
            if prefix == "item" and event == "map_key" and value == "arguments":
                self.streaming_state = StreamingState.WAITING_FOR_ARGUMENTS_START
            if prefix == "item.arguments" and event == "start_map":
                self.streaming_state = StreamingState.PARSING_ARGUMENTS
            if prefix == "item.arguments" and event == "end_map":
                self.streaming_state = StreamingState.PARSING_ARGUMENTS_COMPLETED
            if prefix == "item" and event == "end_map":
                self.streaming_state = StreamingState.TOOL_COMPLETE
            if prefix == "" and event == "end_array":
                self.streaming_state = StreamingState.ALL_TOOLS_COMPLETE

    def _extract_tool_calls_streaming_pre_v11_tokenizer(
        self,
        delta_text: str,
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """
        Extracts tool calls for Mistral models
        doing tool calls of the following format:
        `[TOOL_CALLS][{"name": "add", "arguments":{"a": 3.5, "b": 4}}`

        The delta is cut into small pieces at JSON punctuation (see
        `_split_delta`) and each piece is sent to `self.parse_coro`.
        Comparing `self.streaming_state` before and after each piece tells
        us which part of the tool call that piece belongs to.

        Args:
            delta_text: The newly generated text for this streaming step.
            delta_token_ids: The token ids generated in this streaming step.

        Returns:
            A `DeltaMessage` carrying `content` and/or `tool_calls` when
            there is something to emit; an empty `DeltaMessage` when there
            is nothing to emit but all tool calls are already complete;
            `None` otherwise.
        """
        assert self.parse_coro is not None
        content = None
        delta_tool_calls: list[DeltaToolCall] = []
        current_tool_call: DeltaToolCall = DeltaToolCall(
            index=self.current_tool_id, type="function"
        )
        current_tool_call_modified = False
        if self.bot_token_id in delta_token_ids or self.bot_token in delta_text:
            # this is the first tool call
            # Text before the bot token is regular content; everything after
            # it (with any further bot tokens dropped) is tool-call JSON.
            bot_token_parts = delta_text.split(self.bot_token)
            if not delta_text.startswith(self.bot_token):
                content = bot_token_parts[0]
            delta_text = "".join(bot_token_parts[1:])

        # Cut smartly the delta text to catch the ijson events
        # as ijson does not give us the index in the text at each event.
        # We need to cut so that we know
        # where in the text the events are emitted from.
        while delta_text:
            streaming_state_before_parse = self.streaming_state
            # Bound on every branch so later reads are always well-defined.
            delta_to_be_parsed = ""

            if self.streaming_state == StreamingState.WAITING_FOR_TOOL_START:
                delta_to_be_parsed, delta_text = self._split_delta(
                    delta_text=delta_text,
                    stop_after_opening_curly_braces=1,
                )
            elif self.streaming_state == StreamingState.WAITING_FOR_TOOL_KEY:
                # Wait until another key is sent
                # or the current tool is completed
                delta_to_be_parsed, delta_text = self._split_delta(
                    delta_text=delta_text,
                    stop_after_colon=1,
                    stop_after_opening_curly_braces=1,
                    # if the tool ends, we want to separate
                    # at the start of the next tool
                )
            elif self.streaming_state == StreamingState.PARSING_NAME:
                delta_to_be_parsed, delta_text = self._split_delta(
                    delta_text=delta_text,
                    stop_after_comma=1,
                    stop_after_closing_brackets=1,
                )
            elif self.streaming_state == StreamingState.WAITING_FOR_ARGUMENTS_START:
                delta_to_be_parsed, delta_text = self._split_delta(
                    delta_text=delta_text,
                    stop_after_opening_curly_braces=1,
                )
            elif self.streaming_state == StreamingState.PARSING_ARGUMENTS:
                delta_to_be_parsed, delta_text = self._split_delta(
                    delta_text=delta_text,
                    stop_after_closing_curly_braces=1,
                    # we could be more clever
                    # by listening to item.arguments.* start_map events
                    # and know how many curly braces we can allow
                )
            elif self.streaming_state in [
                StreamingState.PARSING_ARGUMENTS_COMPLETED,
                StreamingState.PARSING_NAME_COMPLETED,
            ]:
                delta_to_be_parsed, delta_text = self._split_delta(
                    delta_text=delta_text,
                    stop_after_closing_curly_braces=1,
                    stop_after_closing_brackets=1,
                )
            elif self.streaming_state == StreamingState.TOOL_COMPLETE:
                delta_to_be_parsed, delta_text = self._split_delta(
                    delta_text=delta_text,
                    stop_after_opening_curly_braces=1,
                    stop_after_closing_brackets=1,
                )
            elif self.streaming_state == StreamingState.ALL_TOOLS_COMPLETE:
                # Everything after the closing bracket is plain content.
                # NOTE: this replaces any content captured before the bot
                # token earlier in this same delta.
                content = delta_text
                delta_text = ""
            else:
                delta_to_be_parsed = delta_text
                delta_text = ""

            if self.streaming_state != StreamingState.ALL_TOOLS_COMPLETE:
                self.parse_coro.send(delta_to_be_parsed.encode("utf-8"))

            # Given the parsed text and the possible streaming state change,
            # let's add to the tool delta
            if (
                (streaming_state_before_parse != self.streaming_state)
                and streaming_state_before_parse
                in [StreamingState.WAITING_FOR_TOOL_START, StreamingState.TOOL_COMPLETE]
                and self.streaming_state
                not in [
                    StreamingState.ALL_TOOLS_COMPLETE,
                    StreamingState.TOOL_COMPLETE,
                    StreamingState.WAITING_FOR_TOOL_START,
                ]
            ):
                # starting a new tool call
                # Flush the previous tool call first if it accumulated data.
                if current_tool_call_modified:
                    if self.current_tool_mistral_id is not None:
                        current_tool_call.id = self.current_tool_mistral_id
                        self.current_tool_mistral_id = None
                    delta_tool_calls.append(current_tool_call)
                current_tool_call_modified = False
                self.current_tool_id += 1
                self.current_tool_mistral_id = MistralToolCall.generate_random_id()
                current_tool_call = DeltaToolCall(
                    index=self.current_tool_id,
                    type="function",
                )
            if current_tool_call.function is None:
                current_tool_call.function = DeltaFunctionCall()

            if self.current_tool_name is not None:
                # we have the complete tool name
                current_tool_call_modified = True
                current_tool_call.function.name = self.current_tool_name
                self.current_tool_name = None
            if self.streaming_state == StreamingState.PARSING_NAME_COMPLETED:
                self.streaming_state = StreamingState.WAITING_FOR_TOOL_KEY
            if self.streaming_state in [
                StreamingState.PARSING_ARGUMENTS,
                StreamingState.PARSING_ARGUMENTS_COMPLETED,
            ]:
                if self.streaming_state == StreamingState.PARSING_ARGUMENTS_COMPLETED:
                    self.streaming_state = StreamingState.WAITING_FOR_TOOL_KEY
                # the delta_to_be_parsed is part of arguments.
                current_tool_call_modified = True
                if current_tool_call.function.arguments is None:
                    current_tool_call.function.arguments = delta_to_be_parsed
                else:
                    current_tool_call.function.arguments += delta_to_be_parsed
                if streaming_state_before_parse != StreamingState.PARSING_ARGUMENTS:
                    # It's the first chunk of arg. let's lstrip it
                    current_tool_call.function.arguments = (
                        current_tool_call.function.arguments.lstrip()
                    )

        # Flush the tool call that was still being built when the delta ended.
        if current_tool_call_modified:
            if self.current_tool_mistral_id is not None:
                current_tool_call.id = self.current_tool_mistral_id
                self.current_tool_mistral_id = None
            delta_tool_calls.append(current_tool_call)

        # HACK: serving_chat.py inspects the internal state of tool parsers
        # when determining its final streaming delta, automatically
        # adding autocompleted JSON.
        # These two lines avoid that nonsense while ensuring finish_reason
        # is set to tool_calls when at least one tool is called.
        if delta_tool_calls and not self.prev_tool_call_arr:
            self.prev_tool_call_arr = [{"arguments": {}}]

        if content or delta_tool_calls:
            delta_message = DeltaMessage()
            if content:
                delta_message.content = content
            if delta_tool_calls:
                delta_message.tool_calls = delta_tool_calls
            return delta_message

        if self.streaming_state == StreamingState.ALL_TOOLS_COMPLETE:
            return DeltaMessage()
        return None

    def _split_delta(
        self,
        delta_text: str,
        stop_after_quotes: int = -1,
        stop_after_opening_curly_braces: int = -1,
        stop_after_closing_curly_braces: int = -1,
        stop_after_closing_brackets: int = -1,
        stop_after_colon: int = -1,
        stop_after_comma=-1,
    ) -> tuple[str, str]:
        delta_to_be_parsed = ""
        for i, c in enumerate(delta_text):
            if c in ['"', "'"]:
                delta_to_be_parsed += c
                stop_after_quotes -= 1
                if stop_after_quotes == 0:
                    return (delta_to_be_parsed, delta_text[i + 1 :])
            elif c == "{":
                delta_to_be_parsed += c
                stop_after_opening_curly_braces -= 1
                if stop_after_opening_curly_braces == 0:
                    return (delta_to_be_parsed, delta_text[i + 1 :])
            elif c == "}":
                delta_to_be_parsed += c
                stop_after_closing_curly_braces -= 1
                if stop_after_closing_curly_braces == 0:
                    return (delta_to_be_parsed, delta_text[i + 1 :])
            elif c == "]":
                delta_to_be_parsed += c
                stop_after_closing_brackets -= 1
                if stop_after_closing_brackets == 0:
                    return (delta_to_be_parsed, delta_text[i + 1 :])
            elif c == ":":
                delta_to_be_parsed += c
                stop_after_colon -= 1
                if stop_after_colon == 0:
                    return (delta_to_be_parsed, delta_text[i + 1 :])
            elif c == ",":
                delta_to_be_parsed += c
                stop_after_comma -= 1
                if stop_after_comma == 0:
                    return (delta_to_be_parsed, delta_text[i + 1 :])
            else:
                delta_to_be_parsed += c
911

912
        return (delta_to_be_parsed, "")