[BUG]: fix HF tokenizer concurrent borrow in tool parsers (#40059)

Signed-off-by: Yifan <yzong@redhat.com>
Co-authored-by: timon0305 <timon0305@outlook.com>
Co-authored-by: sfeng33 <4florafeng@gmail.com>

[BUG]: fix HF tokenizer concurrent borrow in tool parsers (#40059)
Signed-off-by: Yifan <yzong@redhat.com>
Co-authored-by: timon0305 <timon0305@outlook.com>
Co-authored-by: sfeng33 <4florafeng@gmail.com>
c9bf77df · yzong-rh · GitHub · 30413442 · c9bf77df · c9bf77df
Unverified Commit c9bf77df authored Apr 23, 2026 by yzong-rh Committed by GitHub Apr 23, 2026
3 changed files
--- a/tests/tool_parsers/test_llama3_json_tool_parser.py
+++ b/tests/tool_parsers/test_llama3_json_tool_parser.py
@@ -4,15 +4,22 @@
 from unittest.mock import MagicMock, patch

 import pytest
+from transformers import AutoTokenizer

 from vllm.entrypoints.openai.engine.protocol import ExtractedToolCallInformation
-from vllm.tokenizers import TokenizerLike
 from vllm.tool_parsers.llama_tool_parser import Llama3JsonToolParser

+LLAMA_MODEL = "meta-llama/Llama-3.2-1B-Instruct"
+
+
+@pytest.fixture(scope="module")
+def llama_tokenizer():
+    return AutoTokenizer.from_pretrained(LLAMA_MODEL)
+

 @pytest.fixture
-def parser(default_tokenizer: TokenizerLike):
-    return Llama3JsonToolParser(default_tokenizer)
+def parser(llama_tokenizer):
+    return Llama3JsonToolParser(llama_tokenizer)


 def test_extract_tool_calls_simple(parser):

--- a/vllm/tool_parsers/functiongemma_tool_parser.py
+++ b/vllm/tool_parsers/functiongemma_tool_parser.py
@@ -34,41 +34,29 @@ class FunctionGemmaToolParser(ToolParser):
    <start_function_call>call:func_name{param:<escape>value<escape>}<end_function_call>
    """

-    def __init__(self, tokenizer: TokenizerLike, tools: list[Tool] | None = None):
-        super().__init__(tokenizer, tools)
-
-        # Streaming state
-        self.current_tool_name_sent: bool = False
-        self.prev_tool_call_arr: list[dict] = []
-        self.current_tool_id: int = -1
-        self.streamed_args_for_tool: list[str] = []
-
    # FunctionGemma tokens
-        self.tool_call_start_token: str = "<start_function_call>"
-        self.tool_call_end_token: str = "<end_function_call>"
+    tool_call_start_token: str = "<start_function_call>"
+    tool_call_end_token: str = "<end_function_call>"

    # Regex patterns
-        self.tool_call_regex = re.compile(
+    tool_call_regex: re.Pattern = re.compile(
        r"<start_function_call>call:(\w+)\{(.*?)\}<end_function_call>"
        r"|<start_function_call>call:(\w+)\{(.*)",
        re.DOTALL,
    )
-        self.arg_regex = re.compile(
+    arg_regex: re.Pattern = re.compile(
        r"(\w+):<escape>(.*?)<escape>",
        re.DOTALL,
    )

-        if self.model_tokenizer:
-            self.tool_call_start_token_ids = self.model_tokenizer.encode(
-                self.tool_call_start_token, add_special_tokens=False
-            )
-            self.tool_call_end_token_ids = self.model_tokenizer.encode(
-                self.tool_call_end_token, add_special_tokens=False
-            )
-        else:
-            self.tool_call_start_token_ids = []
-            self.tool_call_end_token_ids = []
+    def __init__(self, tokenizer: TokenizerLike, tools: list[Tool] | None = None):
+        super().__init__(tokenizer, tools)

+        # Streaming state
+        self.current_tool_name_sent: bool = False
+        self.prev_tool_call_arr: list[dict] = []
+        self.current_tool_id: int = -1
+        self.streamed_args_for_tool: list[str] = []
        self.buffered_delta_text = ""

    def _parse_arguments(self, args_str: str) -> dict:

--- a/vllm/tool_parsers/llama_tool_parser.py
+++ b/vllm/tool_parsers/llama_tool_parser.py
@@ -45,6 +45,12 @@ class Llama3JsonToolParser(ToolParser):
    llama4_json are set.
    """

+    bot_token: str = "<|python_tag|>"
+    # Simple regex to find opening braces - we'll use JSON decoder for parsing
+    # This handles arbitrary nesting depth correctly
+    tool_call_start_regex: re.Pattern = re.compile(r"\{")
+    json_decoder: json.JSONDecoder = json.JSONDecoder()
+
    def __init__(
        self,
        tokenizer: PreTrainedTokenizerBase,
@@ -60,14 +66,12 @@ class Llama3JsonToolParser(ToolParser):
        self.streamed_args_for_tool: list[
            str
        ] = []  # map what has been streamed for each tool so far to a list
-        self.bot_token = "<|python_tag|>"
-        self.bot_token_id = tokenizer.encode(self.bot_token, add_special_tokens=False)[
-            0
-        ]
-        # Simple regex to find opening braces - we'll use JSON decoder for parsing
-        # This handles arbitrary nesting depth correctly
-        self.tool_call_start_regex = re.compile(r"\{")
-        self.json_decoder = json.JSONDecoder()
+        self.bot_token_id = self.vocab.get(self.bot_token)
+        if self.bot_token_id is None:
+            raise RuntimeError(
+                "Llama3JsonToolParser could not locate the bot token "
+                f"'{self.bot_token}' in the tokenizer."
+            )

    def extract_tool_calls(
        self, model_output: str, request: ChatCompletionRequest