[Refactor][Parser] Migrate chat completion auto-tool/reasoning/plain streaming...

[Refactor][Parser] Migrate chat completion auto-tool/reasoning/plain streaming to parse_delta (#39446)

Signed-off-by: sfeng33 <4florafeng@gmail.com>

[Refactor][Parser] Migrate chat completion auto-tool/reasoning/plain streaming...
[Refactor][Parser] Migrate chat completion auto-tool/reasoning/plain streaming to parse_delta (#39446) Signed-off-by: sfeng33 <4florafeng@gmail.com>
db8a6d66 · Flora Feng · GitHub · d2130a47 · db8a6d66 · db8a6d66
Unverified Commit db8a6d66 authored Apr 14, 2026 by Flora Feng Committed by GitHub Apr 14, 2026
Showing 2 changed files with 35 additions and 116 deletions

vllm/entrypoints/openai/chat_completion/serving.py +29 -114

vllm/parser/abstract_parser.py +6 -2

No files found.
--- a/vllm/entrypoints/openai/chat_completion/serving.py
+++ b/vllm/entrypoints/openai/chat_completion/serving.py
@@ -68,11 +68,11 @@ from vllm.logger import init_logger
 from vllm.logprobs import Logprob
 from vllm.outputs import CompletionOutput, RequestOutput
 from vllm.parser import ParserManager
+from vllm.parser.abstract_parser import Parser
 from vllm.reasoning import ReasoningParser
 from vllm.renderers import ChatParams
 from vllm.sampling_params import BeamSearchParams, SamplingParams
 from vllm.tokenizers import TokenizerLike
-from vllm.tool_parsers import ToolParser
 from vllm.tool_parsers.mistral_tool_parser import MistralToolCall
 from vllm.tool_parsers.utils import partial_json_loads
 from vllm.utils.collection_utils import as_list
@@ -134,6 +134,12 @@ class OpenAIServingChat(OpenAIServing):
            enable_auto_tools=enable_auto_tools,
            model_name=self.model_config.model,
        )
+        self.parser_cls = ParserManager.get_parser(
+            tool_parser_name=tool_parser,
+            reasoning_parser_name=reasoning_parser,
+            enable_auto_tools=enable_auto_tools,
+            model_name=self.model_config.model,
+        )
        self.exclude_tools_when_tool_choice_none = exclude_tools_when_tool_choice_none

        self.enable_prompt_tokens_details = enable_prompt_tokens_details
@@ -216,13 +222,12 @@ class OpenAIServingChat(OpenAIServing):
        # Streaming response
        tokenizer = self.renderer.tokenizer
        assert tokenizer is not None
-        reasoning_parser: ReasoningParser | None = None
-        if self.reasoning_parser_cls:
-            # Pass the same chat template kwargs as used in tokenization
        chat_template_kwargs = self._prepare_extra_chat_template_kwargs(
            request.chat_template_kwargs,
            self.default_chat_template_kwargs,
        )
+        reasoning_parser: ReasoningParser | None = None
+        if self.reasoning_parser_cls:
            reasoning_parser = self.reasoning_parser_cls(
                tokenizer,
                chat_template_kwargs=chat_template_kwargs,  # type: ignore[call-arg]
@@ -338,6 +343,7 @@ class OpenAIServingChat(OpenAIServing):
                tokenizer,
                request_metadata,
                reasoning_parser,
+                chat_template_kwargs=chat_template_kwargs,
            )

        return await self.chat_completion_full_generator(
@@ -505,6 +511,7 @@ class OpenAIServingChat(OpenAIServing):
        tokenizer: TokenizerLike,
        request_metadata: RequestResponseMetadata,
        reasoning_parser: ReasoningParser | None = None,
+        chat_template_kwargs: dict[str, Any] | None = None,
    ) -> AsyncGenerator[str, None]:
        created_time = int(time.time())
        chunk_object_type: Final = "chat.completion.chunk"
@@ -549,29 +556,29 @@ class OpenAIServingChat(OpenAIServing):
        if tool_choice_auto or reasoning_parser:
            # These are only required in "auto" tool choice case
            all_previous_token_ids = [[] for _ in range(num_choices)]
-            # For reasoning parser and tool call all enabled
-            added_content_delta_arr = [False] * num_choices
            reasoning_end_arr = [False] * num_choices
            prompt_is_reasoning_end_arr: list[bool | None] = [None] * num_choices
        else:
            all_previous_token_ids = None

-        # Prepare the tool parser if it's needed
        try:
-            if tool_choice_auto and self.tool_parser:
+            if self.parser_cls is not None:
                if tokenizer is None:
                    raise ValueError(
                        "Tokenizer not available when `skip_tokenizer_init=True`"
                    )
-
-                tool_parsers: list[ToolParser | None] = [
-                    self.tool_parser(tokenizer, request.tools)
+                parsers: list[Parser | None] = [
+                    self.parser_cls(
+                        tokenizer,
+                        request.tools,
+                        chat_template_kwargs=chat_template_kwargs,
+                    )
                    for _ in range(num_choices)
                ]
            else:
-                tool_parsers = [None] * num_choices
+                parsers = [None] * num_choices
        except Exception as e:
-            logger.exception("Error in tool parser creation.")
+            logger.exception("Error in parser creation.")
            data = self.create_streaming_error_response(e)
            yield f"data: {data}\n\n"
            yield "data: [DONE]\n\n"
@@ -675,7 +682,8 @@ class OpenAIServingChat(OpenAIServing):

                for output in res.outputs:
                    i = output.index
-                    tool_parser = tool_parsers[i]
+                    parser = parsers[i]
+                    tool_parser = parser.tool_parser if parser is not None else None

                    if (
                        reasoning_parser
@@ -903,109 +911,16 @@ class OpenAIServingChat(OpenAIServing):
                                history_tool_call_cnt += 1
                                tools_streamed[i] = True

-                    # handle streaming deltas for tools with "auto" tool choice
-                    # and reasoning parser
-                    elif tool_choice_auto and reasoning_parser:
-                        assert tool_parser is not None
-                        assert added_content_delta_arr is not None
-                        assert reasoning_end_arr is not None
-                        output_token_ids = as_list(output.token_ids)
-                        if not reasoning_end_arr[i]:
-                            # When encountering think end id in prompt_token_ids
-                            # i.e {"enable_thinking": False},
-                            # set reasoning status to end.
-                            if prompt_is_reasoning_end_arr[i]:
-                                reasoning_end_arr[i] = True
-                                current_token_ids = output_token_ids
-                                # Don't update current_text, keep it as is from delta
-                            else:
-                                delta_message = (
-                                    reasoning_parser.extract_reasoning_streaming(
-                                        previous_text,
-                                        current_text,
-                                        delta_text,
-                                        previous_token_ids,
-                                        current_token_ids,
-                                        output_token_ids,
-                                    )
-                                )
-
-                                # When encountering think end id in delta_token_ids,
-                                # set reasoning status to end.
-                                # Remove the text and token ids related
-                                # to 'reasoning'.
-                                if reasoning_parser.is_reasoning_end(output_token_ids):
-                                    reasoning_end_arr[i] = True
-                                    current_token_ids = (
-                                        reasoning_parser.extract_content_ids(
-                                            output_token_ids
-                                        )
-                                    )
-                                    if delta_message and delta_message.content:
-                                        current_text = delta_message.content
-                                        delta_message.content = None
-                                    else:
-                                        current_text = ""
-
-                        # handle tool calls only after reasoning is done,
-                        if reasoning_end_arr[i]:
-                            delta_token_ids = output_token_ids
-                            # First time to tool call,
-                            # add the remaining text and token ids
-                            # to delta from previous
-                            if not added_content_delta_arr[i]:
-                                added_content_delta_arr[i] = True
-                                previous_text = ""
-                                previous_token_ids = []
-                                delta_text = current_text
-                                delta_token_ids = current_token_ids
-
-                            delta_message = tool_parser.extract_tool_calls_streaming(
-                                previous_text=previous_text,
-                                current_text=current_text,
-                                delta_text=delta_text,
-                                previous_token_ids=previous_token_ids,
-                                current_token_ids=current_token_ids,
-                                delta_token_ids=delta_token_ids,
-                                request=request,
-                            )
-                            if delta_message and delta_message.tool_calls:
-                                tools_streamed[i] = True
-                    # when only tool calls
-                    elif tool_choice_auto:
-                        assert tool_parser is not None
-                        delta_message = tool_parser.extract_tool_calls_streaming(
-                            previous_text=previous_text,
-                            current_text=current_text,
+                    elif parser is not None:
+                        delta_message = parser.parse_delta(
                            delta_text=delta_text,
-                            previous_token_ids=previous_token_ids,
-                            current_token_ids=current_token_ids,
-                            delta_token_ids=output.token_ids,
+                            delta_token_ids=as_list(output.token_ids),
                            request=request,
+                            prompt_token_ids=res.prompt_token_ids,
                        )
                        if delta_message and delta_message.tool_calls:
                            tools_streamed[i] = True
-
-                    # when only reasoning
-                    elif reasoning_parser:
-                        # When encountering think end id in prompt_token_ids
-                        # i.e {"enable_thinking": False},
-                        # set reasoning status to end.
-                        # Route all generated tokens as content directly.
-                        if prompt_is_reasoning_end_arr[i]:
-                            delta_message = DeltaMessage(content=delta_text)
-                        else:
-                            delta_message = (
-                                reasoning_parser.extract_reasoning_streaming(
-                                    previous_text,
-                                    current_text,
-                                    delta_text,
-                                    previous_token_ids,
-                                    current_token_ids,
-                                    output.token_ids,
-                                )
-                            )
-                    # handle streaming just a content delta
+                    # handle streaming just a content delta (no parsers)
                    else:
                        delta_message = DeltaMessage(content=delta_text)


--- a/vllm/parser/abstract_parser.py
+++ b/vllm/parser/abstract_parser.py
@@ -665,10 +665,14 @@ class _WrappedParser(DelegatingParser):
    reasoning_parser_cls: type[ReasoningParser] | None = None
    tool_parser_cls: type[ToolParser] | None = None

-    def __init__(self, tokenizer: TokenizerLike, tools: list[Tool] | None = None):
+    def __init__(
+        self, tokenizer: TokenizerLike, tools: list[Tool] | None = None, **kwargs
+    ):
        super().__init__(tokenizer)
        # Instantiate the underlying parsers from class attributes
        if self.__class__.reasoning_parser_cls is not None:
-            self._reasoning_parser = self.__class__.reasoning_parser_cls(tokenizer)
+            self._reasoning_parser = self.__class__.reasoning_parser_cls(
+                tokenizer, **kwargs
+            )
        if self.__class__.tool_parser_cls is not None:
            self._tool_parser = self.__class__.tool_parser_cls(tokenizer, tools)