[BugFix] Support custom tool parsers when tool_choice is `required` and named function (#39870)

Signed-off-by: JaredforReal <w13431838023@gmail.com>
Signed-off-by: sfeng33 <4florafeng@gmail.com>
Co-authored-by: sfeng33 <4florafeng@gmail.com>

[BugFix] Support custom tool parsers when tool_choice is `required` and named function (#39870)
Signed-off-by: JaredforReal <w13431838023@gmail.com>
Signed-off-by: sfeng33 <4florafeng@gmail.com>
Co-authored-by: sfeng33 <4florafeng@gmail.com>
ceade195 · Jared Wen · GitHub · 747256bb · ceade195 · ceade195
Unverified Commit ceade195 authored Apr 18, 2026 by Jared Wen Committed by GitHub Apr 17, 2026
5 changed files
--- a/vllm/entrypoints/openai/chat_completion/serving.py
+++ b/vllm/entrypoints/openai/chat_completion/serving.py
@@ -557,6 +557,20 @@ class OpenAIServingChat(OpenAIServing):
            and self._should_stream_with_auto_tool_parsing(request)
        )

+        # Determine whether required/named tool_choice should fall back to
+        # the auto tool_parser path instead of the standard JSON-based parsing.
+        # This happens when the parser declares supports_required_and_named=False
+        # (e.g. GLM models that output XML instead of JSON).
+        tool_choice_uses_parser = (
+            self.tool_parser is not None
+            and not self.tool_parser.supports_required_and_named
+            and request.tools
+            and (
+                request.tool_choice == "required"
+                or isinstance(request.tool_choice, ChatCompletionNamedToolChoiceParam)
+            )
+        )
+
        all_previous_token_ids: list[list[int]] | None
        function_name_returned = [False] * num_choices
        if self.tool_call_id_type == "kimi_k2":
@@ -569,7 +583,12 @@ class OpenAIServingChat(OpenAIServing):

        # Only one of these will be used, thus previous_texts and
        # all_previous_token_ids will not be used twice in the same iteration.
-        if is_mistral_grammar_path or tool_choice_auto or reasoning_parser:
+        if (
+            is_mistral_grammar_path
+            or tool_choice_auto
+            or tool_choice_uses_parser
+            or reasoning_parser
+        ):
            # These are only required in "auto" tool choice case
            all_previous_token_ids = [[] for _ in range(num_choices)]
            reasoning_end_arr = [False] * num_choices
@@ -764,7 +783,12 @@ class OpenAIServingChat(OpenAIServing):
                    delta_message: DeltaMessage | None

                    # just update previous_texts and previous_token_ids
-                    if is_mistral_grammar_path or tool_choice_auto or reasoning_parser:
+                    if (
+                        is_mistral_grammar_path
+                        or tool_choice_auto
+                        or tool_choice_uses_parser
+                        or reasoning_parser
+                    ):
                        assert previous_texts is not None
                        assert all_previous_token_ids is not None
                        previous_text = previous_texts[i]
@@ -813,7 +837,9 @@ class OpenAIServingChat(OpenAIServing):
                        if result.tools_called:
                            tools_streamed[i] = True
                    # handle streaming deltas for tools with named tool_choice
-                    elif tool_choice_function_name:
+                    # Skip when tool_choice_uses_parser so it falls through
+                    # to the auto tool_parser branches below.
+                    elif tool_choice_function_name and not tool_choice_uses_parser:
                        # When encountering think end id in prompt_token_ids
                        # i.e {"enable_thinking": False},
                        # check BEFORE calling the parser to avoid a spurious
@@ -851,7 +877,6 @@ class OpenAIServingChat(OpenAIServing):
                            ):
                                reasoning_end_arr[i] = True
                                if delta_message and delta_message.content:
-                                    # This need to be added to next `delta_text`
                                    current_text = delta_message.content
                                    delta_message.content = None
                                else:
@@ -896,7 +921,12 @@ class OpenAIServingChat(OpenAIServing):
                            )
                            tools_streamed[i] = True

-                    elif request.tool_choice == "required":
+                    # Skip when tool_choice_uses_parser so it falls through
+                    # to the auto tool_parser branches below.
+                    elif (
+                        request.tool_choice == "required"
+                        and not tool_choice_uses_parser
+                    ):
                        assert previous_texts is not None
                        previous_text = previous_texts[i]
                        current_text = previous_text + delta_text
@@ -966,7 +996,10 @@ class OpenAIServingChat(OpenAIServing):

                    # update the previous values for the next iteration
                    if (
-                        is_mistral_grammar_path or tool_choice_auto or reasoning_parser
+                        is_mistral_grammar_path
+                        or tool_choice_auto
+                        or tool_choice_uses_parser
+                        or reasoning_parser
                    ) and not self.use_harmony:
                        assert previous_texts is not None
                        assert all_previous_token_ids is not None

--- a/vllm/entrypoints/openai/engine/serving.py
+++ b/vllm/entrypoints/openai/engine/serving.py
@@ -627,7 +627,7 @@ class OpenAIServing:
            and isinstance(request.tool_choice, ToolChoiceFunction)
        ):
            assert content is not None
-            # Forced Function Call
+            # Forced Function Call (Responses API)
            function_calls.append(
                FunctionCall(name=request.tool_choice.name, arguments=content)
            )
@@ -636,14 +636,20 @@ class OpenAIServing:
            not use_mistral_tool_parser
            and request.tool_choice
            and isinstance(request.tool_choice, ChatCompletionNamedToolChoiceParam)
+            and (tool_parser_cls is None or tool_parser_cls.supports_required_and_named)
        ):
+            # Named function with standard JSON-based parsing
            assert content is not None
-            # Forced Function Call
            function_calls.append(
                FunctionCall(name=request.tool_choice.function.name, arguments=content)
            )
            content = None  # Clear content since tool is called.
-        elif not use_mistral_tool_parser and request.tool_choice == "required":
+        elif (
+            not use_mistral_tool_parser
+            and request.tool_choice == "required"
+            and (tool_parser_cls is None or tool_parser_cls.supports_required_and_named)
+        ):
+            # "required" with standard JSON-based parsing
            tool_calls = []
            with contextlib.suppress(ValidationError):
                content = content or ""
@@ -662,15 +668,30 @@ class OpenAIServing:
            use_mistral_tool_parser
            or (
                enable_auto_tools
-                and (request.tool_choice == "auto" or request.tool_choice is None)
+                and (
+                    request.tool_choice == "auto"
+                    or request.tool_choice is None
+                    or (
+                        not tool_parser_cls.supports_required_and_named
+                        and request.tools
+                        and (
+                            request.tool_choice == "required"
+                            or isinstance(
+                                request.tool_choice,
+                                ChatCompletionNamedToolChoiceParam,
+                            )
+                        )
+                    )
+                )
            )
        ):
+            # Automatic Tool Call Parsing (also used as fallback for
+            # required/named when supports_required_and_named=False)
            if tokenizer is None:
                raise ValueError(
                    "Tokenizer not available when `skip_tokenizer_init=True`"
                )

-            # Automatic Tool Call Parsing
            try:
                tool_parser = tool_parser_cls(tokenizer, request.tools)
            except RuntimeError as e:

--- a/vllm/tool_parsers/abstract_tool_parser.py
+++ b/vllm/tool_parsers/abstract_tool_parser.py
@@ -44,6 +44,17 @@ class ToolParser:
    derived classes.
    """

+    # When True (default), the serving layer uses the standard JSON-based
+    # parsing for tool_choice="required" and named function tool_choice,
+    # which works for models where guided decoding produces well-formed
+    # JSON output (e.g. Hermes).
+    # Subclasses set False when the standard parsing does not work for
+    # their model's output format (e.g. GLM models that use XML).  When
+    # False, the serving layer falls back to the tool_parser's
+    # extract_tool_calls / extract_tool_calls_streaming methods for
+    # required/named tool_choice, treating them the same as "auto".
+    supports_required_and_named: bool = True
+
    def __init__(
        self,
        tokenizer: TokenizerLike,

--- a/vllm/tool_parsers/glm47_moe_tool_parser.py
+++ b/vllm/tool_parsers/glm47_moe_tool_parser.py
@@ -23,6 +23,8 @@ logger = init_logger(__name__)


 class Glm47MoeModelToolParser(Glm4MoeModelToolParser):
+    supports_required_and_named = False
+
    def __init__(self, tokenizer: TokenizerLike, tools: list[Tool] | None = None):
        super().__init__(tokenizer, tools)
        # GLM-4.7 format: <tool_call>func_name[<arg_key>...]*</tool_call>

--- a/vllm/tool_parsers/glm4_moe_tool_parser.py
+++ b/vllm/tool_parsers/glm4_moe_tool_parser.py
@@ -20,6 +20,7 @@ import regex as re

 from vllm.entrypoints.chat_utils import make_tool_call_id
 from vllm.entrypoints.openai.chat_completion.protocol import (
+    ChatCompletionNamedToolChoiceParam,
    ChatCompletionRequest,
 )
 from vllm.entrypoints.openai.engine.protocol import (
@@ -50,6 +51,8 @@ class Glm4MoeModelToolParser(ToolParser):
    call, and diffs against what was previously sent to emit only new content.
    """

+    supports_required_and_named = False
+
    def __init__(self, tokenizer: TokenizerLike, tools: list[Tool] | None = None):
        super().__init__(tokenizer, tools)
        # Stateful streaming fields
@@ -156,7 +159,25 @@ class Glm4MoeModelToolParser(ToolParser):
    def adjust_request(
        self, request: ChatCompletionRequest | ResponsesRequest
    ) -> ChatCompletionRequest | ResponsesRequest:
-        """Adjust request parameters for tool call token handling."""
+        """Adjust request parameters for tool call token handling.
+
+        For required/named tool_choice, skip setting structured_outputs
+        because GLM models output tool calls in XML format (per chat
+        template).  Guided decoding would force JSON output, conflicting
+        with the XML format and causing parsing failures.
+        """
+        if request.tools:
+            tc = request.tool_choice
+            if tc == "required" or isinstance(tc, ChatCompletionNamedToolChoiceParam):
+                # Do NOT call super().adjust_request() for required/named,
+                # because it would set structured_outputs and force JSON
+                # output via guided decoding.  GLM models use XML tool-call
+                # syntax (defined in the chat template), so guided decoding
+                # must be skipped to let the model output XML freely.
+                # The tool_parser handles extraction from XML output.
+                if request.tool_choice != "none":
+                    request.skip_special_tokens = False
+                return request
        request = super().adjust_request(request)
        if request.tools and request.tool_choice != "none":
            # Ensure tool call tokens (<tool_call>, </tool_call>) are not skipped