# mistral_tool_parser.py

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from __future__ import annotations

import json
from collections.abc import Sequence
from dataclasses import dataclass
from enum import Enum, auto
from random import choices
from string import ascii_letters, digits
from typing import TYPE_CHECKING, Any

import ijson
import regex as re
from mistral_common.protocol.instruct.tool_calls import (
    NamedToolChoice as MistralNamedToolChoice,
)
from mistral_common.protocol.instruct.tool_calls import (
    Tool as MistralTool,
)
from mistral_common.protocol.instruct.tool_calls import (
    ToolChoice as MistralToolChoice,
)
from mistral_common.protocol.instruct.tool_calls import (
    ToolChoiceEnum as MistralToolChoiceEnum,
)
from pydantic import Field

from vllm.entrypoints.openai.chat_completion.protocol import (
    ChatCompletionRequest,
)
from vllm.entrypoints.openai.engine.protocol import (
    DeltaFunctionCall,
    DeltaMessage,
    DeltaToolCall,
    ExtractedToolCallInformation,
    FunctionCall,
    ToolCall,
)
from vllm.entrypoints.openai.responses.protocol import ResponsesRequest
from vllm.logger import init_logger
from vllm.reasoning.mistral_reasoning_parser import MistralReasoningParser
from vllm.sampling_params import StructuredOutputsParams
from vllm.tokenizers import TokenizerLike
from vllm.tokenizers.mistral import MistralTokenizer, adapt_inplace_to_mistral_tool
from vllm.tool_parsers.abstract_tool_parser import (
    Tool,
    ToolParser,
)
from vllm.utils.mistral import is_mistral_tokenizer

if TYPE_CHECKING:
    from vllm.reasoning import ReasoningParser

# Module-level logger.
logger = init_logger(__name__)

# Alphabet (ASCII letters and digits) that random tool call ids are drawn from.
ALPHANUMERIC = ascii_letters + digits

# Schema used by `MistralToolParser.adjust_request` when JSON output is
# requested without an explicit schema: any JSON object or array.
_DEFAULT_JSON_SCHEMA = {"anyOf": [{"type": "object"}, {"type": "array"}]}


class StreamingState(Enum):
    """States of the incremental (streaming) tool call parser."""

    # Initial state: no tool call has been started yet.
    WAITING_FOR_TOOL_START = auto()
    # Waiting for the "name" or "arguments" key to be complete.
    WAITING_FOR_TOOL_KEY = auto()
    # The tool name is being read.
    PARSING_NAME = auto()
    # The tool name has been read entirely.
    PARSING_NAME_COMPLETED = auto()
    # The "arguments" key was seen, its value has not started yet.
    WAITING_FOR_ARGUMENTS_START = auto()
    # The arguments payload is being read.
    PARSING_ARGUMENTS = auto()
    # The arguments payload has been read entirely.
    PARSING_ARGUMENTS_COMPLETED = auto()
    # The current tool call is finished.
    TOOL_COMPLETE = auto()
    # The whole list of tool calls is finished.
    ALL_TOOLS_COMPLETE = auto()


class MistralToolCall(ToolCall):
    """`ToolCall` whose `id` defaults to a randomly generated Mistral-style id."""

    id: str = Field(default_factory=lambda: MistralToolCall.generate_random_id())

    @staticmethod
    def generate_random_id() -> str:
        """Return a random id of 9 characters drawn from `ALPHANUMERIC`."""
        # Mistral Tool Call Ids must be alphanumeric with a length of 9.
        # https://github.com/mistralai/mistral-common/blob/21ee9f6cee3441e9bb1e6ed2d10173f90bd9b94b/src/mistral_common/protocol/instruct/validator.py#L299
        return "".join(choices(ALPHANUMERIC, k=9))

    @staticmethod
    def is_valid_id(id: str) -> bool:
        """Return whether `id` is exactly 9 ASCII letters or digits.

        `str.isalnum()` on its own also accepts non-ASCII letters and digits
        (for example "é" or "٣"), which are outside the alphabet that
        `generate_random_id` draws from, so ASCII-only content is required
        as well.
        """
        return len(id) == 9 and id.isascii() and id.isalnum()


def _is_pre_v11_tokeniser(model_tokenizer: TokenizerLike) -> bool:
    """Return whether `model_tokenizer` uses the pre-v11 tool call format.

    Mistral tokenizers are classified by their `version`.  Any other
    tokenizer is treated as v11+ when its vocabulary contains the `[ARGS]`
    token; a tokenizer without a `get_vocab` attribute is treated as having
    an empty vocabulary, hence as pre-v11.
    """
    if not is_mistral_tokenizer(model_tokenizer):
        # For HF tokenizers, the presence of [ARGS] in the vocab indicates
        # a v11+ equivalent tokenizer.
        try:
            get_vocab = model_tokenizer.get_vocab
        except AttributeError:
            vocab: dict[str, int] = {}
        else:
            vocab = get_vocab()
        return "[ARGS]" not in vocab

    return model_tokenizer.version < 11


@dataclass
class MistralStreamingResult:
    r"""Result bundle returned by
    `MistralToolParser.extract_maybe_reasoning_and_tool_streaming`.

    It carries the delta to emit for the current chunk together with the
    possibly updated reasoning flag, text and token ids.
    """

    # Delta to emit for this chunk, or None when there is nothing to emit.
    delta_message: DeltaMessage | None
    # Whether reasoning is over once this chunk has been processed.
    reasoning_ended: bool
    # True when the delta produced for this chunk contains tool calls.
    tools_called: bool
    # Accumulated text; replaced by the post-reasoning content (or "") on the
    # chunk where reasoning ends.
    current_text: str
    # Accumulated token ids; may be replaced when reasoning ends on this chunk.
    current_token_ids: list[int]


class MistralToolParser(ToolParser):
    r"""Tool call parser for Mistral models, intended for use with either:

    - `mistral_common <https://github.com/mistralai/mistral-common/>`_
      (recommended)
    - the `examples/tool_chat_template_mistral.jinja` template.

    Used when `--enable-auto-tool-choice --tool-call-parser mistral` are both
    set.

    Two output layouts are handled, selected from the tokenizer version
    (`[BOT]` stands for the tool call token):

    - < v11: `content[BOT] [{tool_call1},{tool_call2}]`
    - >= v11: `content[BOT]tool_name1{args_call1}[BOT]tool_name2{args_call2}`
    """

    IS_MISTRAL_TOOL_PARSER = True  # used by vllm.utils.mistral

    # Used to generate correct grammar in `adjust_request`
    # (selects the reasoning or non-reasoning grammar template).
    model_can_reason: bool = False

    def __init__(self, tokenizer: TokenizerLike, tools: list[Tool] | None = None):
        super().__init__(tokenizer, tools)

        if not is_mistral_tokenizer(self.model_tokenizer):
            logger.info("Non-Mistral tokenizer detected when using a Mistral model...")

        # initialize properties used for state when parsing tool calls in
        # streaming mode
        self.prev_tool_call_arr: list[dict[str, Any]] = []
        self.current_tool_id: int = -1
        self.streaming_state: StreamingState = StreamingState.WAITING_FOR_TOOL_START

        # For streaming pre v11 tokenizer tool calls
        self.current_tool_name: str | None = None
        self.current_tool_mistral_id: str | None = None
        self.starting_new_tool = False
        self._is_pre_v11 = _is_pre_v11_tokeniser(self.model_tokenizer)
        if self._is_pre_v11:
            self.parse_coro = ijson.parse_coro(
                self.update_stream_state_pre_v11_tokenizer()
            )

        self.bot_token = "[TOOL_CALLS]"
        self.bot_token_id = self.vocab.get(self.bot_token)
        self.tool_call_regex = re.compile(r"\[{.*}\]", re.DOTALL)

        if self.bot_token_id is None:
            raise RuntimeError(
                "Mistral Tool Parser could not locate the tool call token in "
                "the tokenizer!"
            )

    def adjust_request(
        self, request: ChatCompletionRequest | ResponsesRequest
    ) -> ChatCompletionRequest | ResponsesRequest:
        so_non_supported_attributes = [
            "regex",
            "choice",
            "grammar",
            # whitespace_pattern is not a constraint type but an option;
            # Mistral grammar factory does not support it.
            "whitespace_pattern",
            "structural_tag",
        ]
        any_so_non_supported_active = request.structured_outputs is not None and any(
            getattr(request.structured_outputs, attribute) is not None
            for attribute in so_non_supported_attributes
        )
        response_format_non_supported_active = (
            isinstance(request, ResponsesRequest)
            or request.response_format is not None
            and request.response_format.type == "structural_tag"
        )

        if (
            not is_mistral_tokenizer(self.model_tokenizer)
            or isinstance(request, ResponsesRequest)
            or not self.model_tokenizer.supports_grammar
            or any_so_non_supported_active
            or response_format_non_supported_active
        ):
            request = super().adjust_request(request)
            if request.tools and request.tool_choice != "none":
                # Do not skip special tokens when using chat template
                # with Mistral parser as TOOL_CALL token is needed
                # for tool detection.
                # Note: we don't want skip_special_tokens=False
                # with MistralTokenizer as it is incompatible
                request.skip_special_tokens = False
            return request

        json_schema: dict[str, Any] | None = None
        if request.structured_outputs is not None:
            if request.structured_outputs.json_object is not None:
                json_schema = _DEFAULT_JSON_SCHEMA
            elif request.structured_outputs.json is not None:
                if isinstance(request.structured_outputs.json, str):
                    json_schema = json.loads(request.structured_outputs.json)
                else:
                    json_schema = request.structured_outputs.json
            else:
                raise ValueError(
                    "Unsupported request.structured_outputs for MistralToolParser. "
                    "Only `json` and `json_object` are supported."
                )
        elif (
            request.response_format is not None
            and request.response_format.type != "text"
        ):
            if request.response_format.type == "json_object":
                json_schema = _DEFAULT_JSON_SCHEMA
            elif request.response_format.type == "json_schema":
                if request.response_format.json_schema is not None:
                    json_schema = request.response_format.json_schema.json_schema
                else:
                    json_schema = _DEFAULT_JSON_SCHEMA
            else:
                raise ValueError(
                    "MistralToolParser only accepts `text`, `json_object` or "
                    f"`json_schema`, got {request.response_format=}"
                )
            # Structured Outputs will be defined.
            request.response_format = None

        grammar_factory = self.model_tokenizer.grammar_factory

        # TODO: Once unified parser, improve this.
        # The issue is figuring out when a model is a reasoning one or not.
        template = grammar_factory.select_jinja_template(
            reasoning=self.model_can_reason
        )

        mistral_tools = (
            [
                MistralTool.model_validate(
                    adapt_inplace_to_mistral_tool(tool.model_dump())
                )
                for tool in request.tools
            ]
            if request.tools is not None
            else None
        )

        tool_choice: MistralToolChoice
        match request.tool_choice:
            case "none" | "auto" | "required":
                tool_choice = MistralToolChoiceEnum(request.tool_choice)
            case None:
                tool_choice = MistralToolChoiceEnum.auto
            # _ == Named tool choice
            case _:
                tool_choice = MistralNamedToolChoice.model_validate(
                    {
                        "type": "function",
                        "function": {"name": request.tool_choice.function.name},
                    }
                )

        # Rendering grammar is cached in mistral-common given tools, template and mode.
        match tool_choice, json_schema is not None:
            case MistralToolChoiceEnum.none, True:
                lark_grammar = grammar_factory.get_lark_for_json_schema(
                    template=template, json_schema=json_schema
                )
            case _, _:
                lark_grammar = grammar_factory.get_lark_from_jinja(
                    template=template,
                    mode=tool_choice,
                    tools=mistral_tools,
                    json_schema=json_schema,
                    parallel_tool_calls=request.parallel_tool_calls,
                    json_only=False,
                )

        request.structured_outputs = StructuredOutputsParams(grammar=lark_grammar)
        request._grammar_from_tool_parser = True
        return request

    def extract_maybe_reasoning_and_tool_streaming(
        self,
        *,
        reasoning_parser: ReasoningParser | None,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: list[int],
        current_token_ids: list[int],
        output_token_ids: Sequence[int],
        reasoning_ended: bool,
        prompt_is_reasoning_end: bool | None,
        request: ChatCompletionRequest,
    ) -> MistralStreamingResult:
        r"""Streaming extraction with reasoning followed by tool-call parsing.

        This method encapsulates the combined reasoning extraction and
        tool-call streaming logic so that the serving layer only needs a
        thin routing branch.

        The flow is:

        1. If a *reasoning_parser* is present and reasoning has **not** ended,
           extract reasoning tokens.  Pre-v15 models may have pre-filled
           `[THINK]...[/THINK]` in system prompts, so we skip the
           prompt-level reasoning-end check for those.
        2. Once reasoning ends (or if there is no reasoning parser), delegate
           to `extract_tool_calls_streaming` and track whether tools were
           called.

        Args:
            reasoning_parser: Optional reasoning parser instance.
            previous_text: Accumulated text from prior chunks.
            current_text: Full accumulated text including current chunk.
            delta_text: New text in this chunk.
            previous_token_ids: Token ids from prior chunks.
            current_token_ids: Full token ids including current chunk.
            output_token_ids: Raw output token ids from the engine.
            reasoning_ended: Whether reasoning has already ended.
            prompt_is_reasoning_end: Whether the prompt itself ends reasoning.
            request: The originating chat completion request.
        """
        delta_message: DeltaMessage | None = None
        tools_called = False
        reasoning_ended_at_entry = reasoning_ended

        # For MistralReasoningParser, only enter the reasoning block when
        # the model has actually emitted a [THINK] token.  Other reasoning
        # parsers always expect thinking to be present.
        expect_thinking = (
            not isinstance(reasoning_parser, MistralReasoningParser)
            or reasoning_parser.start_token_id in current_token_ids
        )
        if reasoning_parser is not None and not reasoning_ended and expect_thinking:
            # Pre-v15 models may have pre-filled [THINK]...[/THINK] in
            # system prompts, so skip the prompt-level reasoning-end
            # check and wait for the output's own end-of-think.
            is_pre_v15 = (
                isinstance(self.model_tokenizer, MistralTokenizer)
                and self.model_tokenizer.version < 15
            )

            if not is_pre_v15 and prompt_is_reasoning_end:
                reasoning_ended = True
                current_token_ids = list(output_token_ids)
            else:
                delta_message = reasoning_parser.extract_reasoning_streaming(
                    previous_text,
                    current_text,
                    delta_text,
                    previous_token_ids,
                    current_token_ids,
                    output_token_ids,
                )
                if reasoning_parser.is_reasoning_end_streaming(
                    current_token_ids, output_token_ids
                ):
                    reasoning_ended = True
                    current_token_ids = reasoning_parser.extract_content_ids(
                        list(output_token_ids)
                    )
                    if delta_message and delta_message.content:
                        current_text = delta_message.content
                        delta_message.content = None
                    else:
                        current_text = ""

            if not reasoning_ended:
                return MistralStreamingResult(
                    delta_message=delta_message,
                    reasoning_ended=False,
                    tools_called=False,
                    current_text=current_text,
                    current_token_ids=current_token_ids,
                )

        delta_token_ids = list(output_token_ids)

        # On the iteration where reasoning just ended, reset the text/token
        # state so the tool parser sees a clean history instead of the
        # accumulated reasoning text.
        if not reasoning_ended_at_entry and reasoning_ended:
            previous_text = ""
            previous_token_ids = []
            delta_text = current_text
            delta_token_ids = current_token_ids

        delta_message = self.extract_tool_calls_streaming(
            previous_text=previous_text,
            current_text=current_text,
            delta_text=delta_text,
            previous_token_ids=previous_token_ids,
            current_token_ids=current_token_ids,
            delta_token_ids=delta_token_ids,
            request=request,
        )
        if delta_message and delta_message.tool_calls:
            tools_called = True

        return MistralStreamingResult(
            delta_message=delta_message,
            reasoning_ended=reasoning_ended,
            tools_called=tools_called,
            current_text=current_text,
            current_token_ids=current_token_ids,
        )

    @staticmethod
    def build_non_streaming_tool_calls(
        tool_calls: list[FunctionCall] | None,
    ) -> list[ToolCall]:
        r"""Wrap function calls into `MistralToolCall` items.

        A function call that carries a truthy `id` keeps it; otherwise the
        `MistralToolCall` default id is used.  `None` or an empty list gives
        an empty list.
        """
        wrapped: list[ToolCall] = []
        for function_call in tool_calls or []:
            if function_call.id:
                tool_call = MistralToolCall(id=function_call.id, function=function_call)
            else:
                tool_call = MistralToolCall(function=function_call)
            wrapped.append(tool_call)
        return wrapped

    def extract_tool_calls(
        self,
        model_output: str,
        request: ChatCompletionRequest,
    ) -> ExtractedToolCallInformation:
        """
        Extract the tool calls from a complete model response.

        Content and tool calls formatting depends on the Mistral's tokenizer version
        used to train the model:

        - < v11: `content[BOT] [{tool_call1},{tool_call2}]`
        - >= v11: `content[BOT]tool_name1{args_call1}[BOT]tool_name2{args_call2}`

        with [BOT] the tool call token.

        Note:
            For tokenizer versions >= v11, tool calls with arguments wrongly formatted
            are still returned as tool calls. This is to allow the model to know it
            tried to make a tool call. It reduces chance of another failure and
            prevents that the context is filled with tool calls wrongly placed in
            assistant message contents.
        """

        # If the tool call token is not present, return a text response
        if self.bot_token not in model_output:
            return ExtractedToolCallInformation(
                tools_called=False, tool_calls=[], content=model_output
            )

        content_and_raw_tool_calls = model_output.split(self.bot_token)
        content = content_and_raw_tool_calls[0]
        raw_tool_calls = content_and_raw_tool_calls[1:]

        # >= v11: content[BOT]tool_name1{args_call1}[BOT]tool_name2{args_call2}
        if not self._is_pre_v11:
            tool_calls = []
            for raw_tool_call in raw_tool_calls:
                if "{" not in raw_tool_call:
                    continue

                end_name = raw_tool_call.find("{")
                tool_name, args = (
                    raw_tool_call[:end_name],
                    raw_tool_call[end_name:],
                )

                # HF tokenizers may include [ARGS] in the text
                tool_name = tool_name.replace("[ARGS]", "")
                tool_calls.append({"name": tool_name, "arguments": args})

        # < v11: content[BOT] [{tool_call1},{tool_call2}]
        else:
            if len(raw_tool_calls) != 1:
                raise ValueError(
                    "Only one BOT token should have been outputted, "
                    f"but got {model_output}."
                )
            stringified_tool_calls = raw_tool_calls[0].strip()
            try:
                # Use raw_decode to parse the first valid JSON value,
                # ignoring trailing tokens the model may emit after
                # the tool call array.
                tool_calls, _ = json.JSONDecoder().raw_decode(stringified_tool_calls)
            except json.JSONDecodeError:
                try:
                    raw_tool_call = self.tool_call_regex.findall(
                        stringified_tool_calls
                    )[0]
                    tool_calls = json.loads(raw_tool_call)
                    tool_calls = [
                        {
                            "name": tool_call["name"],
                            "arguments": json.dumps(
                                tool_call.get("arguments", {}),
                                ensure_ascii=False,
                            ),
                        }
                        for tool_call in tool_calls
                    ]
                except (IndexError, json.JSONDecodeError):
                    logger.exception("Error in extracting tool call from response.")
                    return ExtractedToolCallInformation(
                        tools_called=False,
                        tool_calls=[],
                        content=stringified_tool_calls,
                    )
            else:
                tool_calls = [
                    {
                        "name": tool_call["name"],
                        "arguments": json.dumps(
                            tool_call.get("arguments", {}),
                            ensure_ascii=False,
                        ),
                    }
                    for tool_call in tool_calls
                ]

        mistral_tool_calls: list[MistralToolCall] = [
            MistralToolCall(
                type="function",
                function=FunctionCall(
                    name=tool_call["name"],
                    arguments=tool_call.get("arguments", "{}"),
                ),
            )
            for tool_call in tool_calls
        ]

        return ExtractedToolCallInformation(
            tools_called=True,
            tool_calls=mistral_tool_calls,
            content=content if len(content) > 0 else None,
        )

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> DeltaMessage | None:
        has_bot_token = (
            self.bot_token_id in current_token_ids or self.bot_token in current_text
        )
        if not has_bot_token:
            # if the tool call token is not in the tokens generated so far,
            # append output to contents since it's not a tool
            return DeltaMessage(content=delta_text)

        # if the tool call token IS in the tokens generated so far, that
        # means we're parsing as tool calls now
        try:
            if self._is_pre_v11:
                return self._extract_tool_calls_streaming_pre_v11_tokenizer(
                    delta_text=delta_text,
                    delta_token_ids=delta_token_ids,
                )
            else:
                return self._extract_tool_calls_streaming(
                    delta_text=delta_text, delta_token_ids=delta_token_ids
                )
        except Exception:
            logger.exception("Error trying to handle streaming tool call.")
            return None

    def _extract_tool_calls_streaming(
        self,
        delta_text: str,
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """
        Extracts tool calls for Mistral models
        doing tool calls of the following format:
        `[TOOL_CALLS]add{"a": 3.5, "b": 4}`
        """
        additional_content: str = ""
        if self.streaming_state == StreamingState.WAITING_FOR_TOOL_START:
            # this is the first tool call
            if self.bot_token not in delta_text:
                return DeltaMessage(content=delta_text)
            if not delta_text.startswith(self.bot_token):
                additional_content += delta_text.split(self.bot_token)[0]
                delta_text = self.bot_token + "".join(
                    delta_text.split(self.bot_token)[1:]
                )

        delta_tool_calls = self._generate_delta_tool_call(delta_text)
        if not additional_content and len(delta_tool_calls) == 0:
            if self.streaming_state in [
                StreamingState.PARSING_ARGUMENTS,
                StreamingState.PARSING_ARGUMENTS_COMPLETED,
                StreamingState.TOOL_COMPLETE,
                StreamingState.ALL_TOOLS_COMPLETE,
            ]:
                # Return an empty DeltaMessage once the tool calls are all done
                # so that finish_reason gets set.
                return DeltaMessage()
            else:
                # return None when the tool is not likely to be finished
                # This can occur when the name is being parsed for example
                # and we wait for the name to be complete
                # before sending the function name
                return None

        delta = DeltaMessage()
        if additional_content:
            delta.content = additional_content
        if len(delta_tool_calls) > 0:
            delta.tool_calls = delta_tool_calls

        # HACK: serving_chat.py inspects the internal state of tool parsers
        # when determining its final streaming delta, automatically
        # adding autocompleted JSON.
        # These two lines avoid that nonsense while ensuring finish_reason
        # is set to tool_calls when at least one tool is called.
        if delta_tool_calls and not self.prev_tool_call_arr:
            self.prev_tool_call_arr = [{"arguments": {}}]
        return delta

    def _generate_delta_tool_call(self, delta_text: str) -> list[DeltaToolCall]:
        if delta_text == "" or delta_text is None:
            return []
        delta_function_name = None
        tool_id = None
        if self.streaming_state not in [
            StreamingState.PARSING_NAME,
            StreamingState.PARSING_ARGUMENTS,
        ] and delta_text.startswith(self.bot_token):
            self.current_tool_id += 1
            self.streaming_state = StreamingState.PARSING_NAME
            delta_text = delta_text.replace(self.bot_token, "", 1)
        if self.streaming_state == StreamingState.PARSING_NAME:
            if self.current_tool_name is None:
                self.current_tool_name = ""
            # The name stops where the arguments start
            # And the arguments start with the `{` char
            if "{" in delta_text:
                tool_id = MistralToolCall.generate_random_id()
                delta_function_name = delta_text.split("{")[0]
                self.current_tool_name += delta_function_name
                # HF tokenizers may include [ARGS] in the text
                self.current_tool_name = self.current_tool_name.replace("[ARGS]", "")
                delta_text = delta_text[len(delta_function_name) :]
                self.streaming_state = StreamingState.PARSING_ARGUMENTS
            else:
                # we want to send the tool name once it's complete
                self.current_tool_name += delta_text
                return []
        if self.streaming_state == StreamingState.PARSING_ARGUMENTS:
            next_function_text = None
            if self.bot_token in delta_text:
                # current tool call is over
                delta_arguments = ""
                delta_arguments += delta_text.split(self.bot_token)[0]
                next_function_text = delta_text[len(delta_arguments) :]
                self.streaming_state = StreamingState.TOOL_COMPLETE
            else:
                delta_arguments = delta_text
            ret = []
            if self.current_tool_name or delta_arguments:
                ret += [
                    DeltaToolCall(
                        index=self.current_tool_id,
                        type="function",
                        id=tool_id,
                        function=DeltaFunctionCall(
                            name=self.current_tool_name, arguments=delta_arguments
                        ).model_dump(exclude_none=True),
                    )
                ]
                self.current_tool_name = None
            if next_function_text:
                ret += self._generate_delta_tool_call(next_function_text)
            return ret
        # Should not happen
        return []

    @ijson.coroutine
    def update_stream_state_pre_v11_tokenizer(self):
        while True:
            (prefix, event, value) = yield

            if prefix == "item" and event == "start_map":
                self.streaming_state = StreamingState.WAITING_FOR_TOOL_KEY
            if prefix == "item" and event == "map_key" and value == "name":
                self.streaming_state = StreamingState.PARSING_NAME
            if prefix == "item.name" and event == "string":
                self.current_tool_name = value
                self.streaming_state = StreamingState.PARSING_NAME_COMPLETED
            if prefix == "item" and event == "map_key" and value == "arguments":
                self.streaming_state = StreamingState.WAITING_FOR_ARGUMENTS_START
            if prefix == "item.arguments" and event == "start_map":
                self.streaming_state = StreamingState.PARSING_ARGUMENTS
            if prefix == "item.arguments" and event == "end_map":
                self.streaming_state = StreamingState.PARSING_ARGUMENTS_COMPLETED
            if prefix == "item" and event == "end_map":
                self.streaming_state = StreamingState.TOOL_COMPLETE
            if prefix == "" and event == "end_array":
                self.streaming_state = StreamingState.ALL_TOOLS_COMPLETE

    def _extract_tool_calls_streaming_pre_v11_tokenizer(
        self,
        delta_text: str,
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """
        Extracts tool calls for Mistral models
        doing tool calls of the following format:
        `[TOOL_CALLS][{"name": "add", "arguments":{"a": 3.5, "b": 4}}`
        """
        assert self.parse_coro is not None
        content = None
        delta_tool_calls: list[DeltaToolCall] = []
        current_tool_call: DeltaToolCall = DeltaToolCall(
            index=self.current_tool_id, type="function"
        )
        current_tool_call_modified = False
        if self.bot_token_id in delta_token_ids or self.bot_token in delta_text:
            # this is the first tool call
            if not delta_text.startswith(self.bot_token):
                content = delta_text.split(self.bot_token)[0]
            delta_text = "".join(delta_text.split(self.bot_token)[1:])

        # Cut smartly the delta text to catch the ijson events
        # as ijson does not give us the index in the text at each event.
        # We need to cut so that we know
        # where in the text the events are emitted from.
        while len(delta_text) > 0:
            streaming_state_before_parse = self.streaming_state

            if self.streaming_state == StreamingState.WAITING_FOR_TOOL_START:
                delta_to_be_parsed, delta_text = self._split_delta(
                    delta_text=delta_text,
                    stop_after_opening_curly_braces=1,
                )
            elif self.streaming_state == StreamingState.WAITING_FOR_TOOL_KEY:
                # Wait until another key is sent
                # or the current tool is completed
                delta_to_be_parsed, delta_text = self._split_delta(
                    delta_text=delta_text,
                    stop_after_colon=1,
                    stop_after_opening_curly_braces=1,
                    # if the tool ends, we want to separate
                    # at the start of the next tool
                )
            elif self.streaming_state == StreamingState.PARSING_NAME:
                delta_to_be_parsed, delta_text = self._split_delta(
                    delta_text=delta_text,
                    stop_after_comma=1,
                    stop_after_closing_brackets=1,
                )
            elif self.streaming_state == StreamingState.WAITING_FOR_ARGUMENTS_START:
                delta_to_be_parsed, delta_text = self._split_delta(
                    delta_text=delta_text,
                    stop_after_opening_curly_braces=1,
                )
            elif self.streaming_state == StreamingState.PARSING_ARGUMENTS:
                delta_to_be_parsed, delta_text = self._split_delta(
                    delta_text=delta_text,
                    stop_after_closing_curly_braces=1,
                    # we could be more clever
                    # by listening to item.arguments.* start_map events
                    # and know how many curly braces we can allow
                )
            elif self.streaming_state in [
                StreamingState.PARSING_ARGUMENTS_COMPLETED,
                StreamingState.PARSING_NAME_COMPLETED,
            ]:
                delta_to_be_parsed, delta_text = self._split_delta(
                    delta_text=delta_text,
                    stop_after_closing_curly_braces=1,
                    stop_after_closing_brackets=1,
                )
            elif self.streaming_state == StreamingState.TOOL_COMPLETE:
                delta_to_be_parsed, delta_text = self._split_delta(
                    delta_text=delta_text,
                    stop_after_opening_curly_braces=1,
                    stop_after_closing_brackets=1,
                )
            elif self.streaming_state == StreamingState.ALL_TOOLS_COMPLETE:
                content = delta_text
                delta_text = ""
            else:
                delta_to_be_parsed = delta_text
                delta_text = ""

            if self.streaming_state != StreamingState.ALL_TOOLS_COMPLETE:
                self.parse_coro.send(delta_to_be_parsed.encode("utf-8"))

            # Given the parsed text and the possible streaming state change,
            # let's add to the tool delta
            if (
                (streaming_state_before_parse != self.streaming_state)
                and streaming_state_before_parse
                in [StreamingState.WAITING_FOR_TOOL_START, StreamingState.TOOL_COMPLETE]
                and self.streaming_state
                not in [
                    StreamingState.ALL_TOOLS_COMPLETE,
                    StreamingState.TOOL_COMPLETE,
                    StreamingState.WAITING_FOR_TOOL_START,
                ]
            ):
                # starting a new tool call
                if current_tool_call_modified:
                    if self.current_tool_mistral_id is not None:
                        current_tool_call.id = self.current_tool_mistral_id
                        self.current_tool_mistral_id = None
                    delta_tool_calls.append(current_tool_call)
                current_tool_call_modified = False
                self.current_tool_id += 1
                self.current_tool_mistral_id = MistralToolCall.generate_random_id()
                current_tool_call = DeltaToolCall(
                    index=self.current_tool_id,
                    type="function",
                )
            if current_tool_call.function is None:
                current_tool_call.function = DeltaFunctionCall()

            if self.current_tool_name is not None:
                # we have the complete tool name
                current_tool_call_modified = True
                current_tool_call.function.name = self.current_tool_name
                self.current_tool_name = None
            if self.streaming_state == StreamingState.PARSING_NAME_COMPLETED:
                self.streaming_state = StreamingState.WAITING_FOR_TOOL_KEY
            if self.streaming_state in [
                StreamingState.PARSING_ARGUMENTS,
                StreamingState.PARSING_ARGUMENTS_COMPLETED,
            ]:
                if self.streaming_state == StreamingState.PARSING_ARGUMENTS_COMPLETED:
                    self.streaming_state = StreamingState.WAITING_FOR_TOOL_KEY
                # the delta_to_be_parsed is part of arguments.
                current_tool_call_modified = True
                if current_tool_call.function.arguments is None:
                    current_tool_call.function.arguments = delta_to_be_parsed
                else:
                    current_tool_call.function.arguments += delta_to_be_parsed
                if streaming_state_before_parse != StreamingState.PARSING_ARGUMENTS:
                    # It's the first chunk of arg. let's lstrip it
                    current_tool_call.function.arguments = (
                        current_tool_call.function.arguments.lstrip()
                    )

        if current_tool_call_modified:
            if self.current_tool_mistral_id is not None:
                current_tool_call.id = self.current_tool_mistral_id
                self.current_tool_mistral_id = None
            delta_tool_calls.append(current_tool_call)

        # HACK: serving_chat.py inspects the internal state of tool parsers
        # when determining its final streaming delta, automatically
        # adding autocompleted JSON.
        # These two lines avoid that nonsense while ensuring finish_reason
        # is set to tool_calls when at least one tool is called.
        if delta_tool_calls and not self.prev_tool_call_arr:
            self.prev_tool_call_arr = [{"arguments": {}}]

        if content or len(delta_tool_calls) > 0:
            delta_message = DeltaMessage()
            if content:
                delta_message.content = content
            if len(delta_tool_calls) > 0:
                delta_message.tool_calls = delta_tool_calls
            return delta_message
        else:
            if self.streaming_state == StreamingState.ALL_TOOLS_COMPLETE:
                return DeltaMessage()
            else:
                return None

    def _split_delta(
        self,
        delta_text: str,
        stop_after_quotes: int = -1,
        stop_after_opening_curly_braces: int = -1,
        stop_after_closing_curly_braces: int = -1,
        stop_after_closing_brackets: int = -1,
        stop_after_colon: int = -1,
        stop_after_comma=-1,
    ) -> tuple[str, str]:
        delta_to_be_parsed = ""
        for i, c in enumerate(delta_text):
            if c in ['"', "'"]:
                delta_to_be_parsed += c
                stop_after_quotes -= 1
                if stop_after_quotes == 0:
                    return (delta_to_be_parsed, delta_text[i + 1 :])
            elif c == "{":
                delta_to_be_parsed += c
                stop_after_opening_curly_braces -= 1
                if stop_after_opening_curly_braces == 0:
                    return (delta_to_be_parsed, delta_text[i + 1 :])
            elif c == "}":
                delta_to_be_parsed += c
                stop_after_closing_curly_braces -= 1
                if stop_after_closing_curly_braces == 0:
                    return (delta_to_be_parsed, delta_text[i + 1 :])
            elif c == "]":
                delta_to_be_parsed += c
                stop_after_closing_brackets -= 1
                if stop_after_closing_brackets == 0:
                    return (delta_to_be_parsed, delta_text[i + 1 :])
            elif c == ":":
                delta_to_be_parsed += c
                stop_after_colon -= 1
                if stop_after_colon == 0:
                    return (delta_to_be_parsed, delta_text[i + 1 :])
            elif c == ",":
                delta_to_be_parsed += c
                stop_after_comma -= 1
                if stop_after_comma == 0:
                    return (delta_to_be_parsed, delta_text[i + 1 :])
            else:
                delta_to_be_parsed += c

        return (delta_to_be_parsed, "")