Merge tag 'v0.10.2rc1' into v0.10.2rc1-dev

a99300bd · zhuwenwen · cc3e01c7 · 5438967f · a99300bd · a99300bd
Commit a99300bd authored Sep 09, 2025 by zhuwenwen
20 changed files
--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@@ -4,11 +4,11 @@
 import asyncio
 import json
 import time
-from collections.abc import AsyncGenerator, AsyncIterator
+from collections.abc import AsyncGenerator, AsyncIterator, Sequence
 from contextlib import AsyncExitStack
 from copy import copy
 from http import HTTPStatus
-from typing import Any, Callable, Final, Optional, Union
+from typing import Callable, Final, Optional, Union

 import jinja2
 import openai.types.responses as openai_responses_types
@@ -25,6 +25,8 @@ from openai.types.responses import (ResponseCreatedEvent,
                                    ResponseReasoningItem,
                                    ResponseReasoningTextDeltaEvent,
                                    ResponseReasoningTextDoneEvent)
+from openai.types.responses.response_output_text import (Logprob,
+                                                         LogprobTopLogprob)
 # yapf: enable
 from openai.types.responses.response_reasoning_item import (
    Content as ResponseReasoningTextContent)
@@ -59,6 +61,8 @@ from vllm.logger import init_logger
 from vllm.outputs import CompletionOutput
 from vllm.reasoning import ReasoningParser, ReasoningParserManager
 from vllm.sampling_params import SamplingParams
+from vllm.sequence import Logprob as SampleLogprob
+from vllm.sequence import SampleLogprobs
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils import random_uuid

@@ -84,6 +88,7 @@ class OpenAIServingResponses(OpenAIServing):
        enable_prompt_tokens_details: bool = False,
        enable_force_include_usage: bool = False,
        enable_log_outputs: bool = False,
+        log_error_stack: bool = False,
    ) -> None:
        super().__init__(
            engine_client=engine_client,
@@ -92,6 +97,7 @@ class OpenAIServingResponses(OpenAIServing):
            request_logger=request_logger,
            return_tokens_as_token_ids=return_tokens_as_token_ids,
            enable_force_include_usage=enable_force_include_usage,
+            log_error_stack=log_error_stack,
        )

        self.chat_template = chat_template
@@ -201,6 +207,12 @@ class OpenAIServingResponses(OpenAIServing):
            # (i.e., their request's `store=True` just because it's the default
            # value).
            request.store = False
+        if self.use_harmony and request.is_include_output_logprobs():
+            return self.create_error_response(
+                err_type="invalid_request_error",
+                message="logprobs are not supported with gpt-oss models",
+                status_code=HTTPStatus.BAD_REQUEST,
+            )

        # Handle the previous response ID.
        prev_response_id = request.previous_response_id
@@ -238,10 +250,10 @@ class OpenAIServingResponses(OpenAIServing):
            raw_request.state.request_metadata = request_metadata

        if self.tool_server is not None and isinstance(
-                self.tool_server, MCPToolServer
-        ) and (request.background or request.stream) and request.tools and any(
-                tool.type in ["web_search_preview", "code_interpreter"]
-                for tool in request.tools):
+                self.tool_server,
+                MCPToolServer) and request.stream and request.tools and any(
+                    tool.type in ["web_search_preview", "code_interpreter"]
+                    for tool in request.tools):
            return self.create_error_response(
                "MCP tool server is not supported in background mode and "
                "streaming mode")
@@ -255,114 +267,70 @@ class OpenAIServingResponses(OpenAIServing):
                builtin_tool_list.append("browser")
            if self.tool_server.has_tool("python"):
                builtin_tool_list.append("python")
-        async with AsyncExitStack() as exit_stack:
-            try:
-                if self.tool_server is not None:
-                    # TODO: initialize tool sessions lazily when the session
-                    # is actually used.
-                    tool_session_ctxs: dict[str, Any] = {
-                        tool_name:
-                        exit_stack.enter_async_context(
-                            self.tool_server.new_session(tool_name))
-                        for tool_name in builtin_tool_list
-                    }
-                    tool_sessions = {}
-                    for tool_name in builtin_tool_list:
-                        tool_sessions[tool_name] = (
-                            await tool_session_ctxs[tool_name])
-                else:
-                    assert len(builtin_tool_list) == 0
-                    tool_sessions = {}
-                for i, engine_prompt in enumerate(engine_prompts):
-                    default_max_tokens = self.max_model_len - len(
-                        engine_prompt["prompt_token_ids"])
-                    sampling_params = request.to_sampling_params(
-                        default_max_tokens, self.default_sampling_params)
-
-                    trace_headers = (None if raw_request is None else await
-                                     self._get_trace_headers(
-                                         raw_request.headers))
-
-                    context: ConversationContext
-                    if self.use_harmony:
-                        if request.stream:
-                            context = StreamingHarmonyContext(
-                                messages, tool_sessions)
-                        else:
-                            context = HarmonyContext(messages, tool_sessions)
-                    else:
-                        context = SimpleContext()
-                    generator = self._generate_with_builtin_tools(
-                        request_id=request.request_id,
-                        request_prompt=request_prompts[i],
-                        engine_prompt=engine_prompt,
-                        sampling_params=sampling_params,
-                        context=context,
-                        lora_request=lora_request,
-                        priority=request.priority,
-                        trace_headers=trace_headers,
-                    )
-                    generators.append(generator)
-            except ValueError as e:
-                # TODO: Use a vllm-specific Validation Error
-                return self.create_error_response(str(e))

-            assert len(generators) == 1
-            result_generator, = generators
-
-            # Store the input messages.
-            if request.store:
-                self.msg_store[request.request_id] = messages
-
-            if request.background:
-                created_time = int(time.time())
-                response = ResponsesResponse.from_request(
-                    request,
-                    sampling_params,
-                    model_name=model_name,
-                    created_time=created_time,
-                    output=[],
-                    status="queued",
-                    usage=None,
+        if self.tool_server is not None:
+            available_tools = builtin_tool_list
+        else:
+            assert len(builtin_tool_list) == 0
+            available_tools = []
+        try:
+            for i, engine_prompt in enumerate(engine_prompts):
+                default_max_tokens = self.max_model_len - len(
+                    engine_prompt["prompt_token_ids"])
+                sampling_params = request.to_sampling_params(
+                    default_max_tokens, self.default_sampling_params)
+
+                trace_headers = (None if raw_request is None else await
+                                 self._get_trace_headers(raw_request.headers))
+
+                context: ConversationContext
+                if self.use_harmony:
+                    if request.stream:
+                        context = StreamingHarmonyContext(
+                            messages, available_tools)
+                    else:
+                        context = HarmonyContext(messages, available_tools)
+                else:
+                    context = SimpleContext()
+                generator = self._generate_with_builtin_tools(
+                    request_id=request.request_id,
+                    request_prompt=request_prompts[i],
+                    engine_prompt=engine_prompt,
+                    sampling_params=sampling_params,
+                    context=context,
+                    lora_request=lora_request,
+                    priority=request.priority,
+                    trace_headers=trace_headers,
                )
-                async with self.response_store_lock:
-                    self.response_store[response.id] = response
+                generators.append(generator)
+        except ValueError as e:
+            # TODO: Use a vllm-specific Validation Error
+            return self.create_error_response(str(e))

-                # Run the request in the background.
-                task = asyncio.create_task(
-                    self._run_background_request(
-                        request,
-                        sampling_params,
-                        result_generator,
-                        context,
-                        model_name,
-                        tokenizer,
-                        request_metadata,
-                        created_time,
-                    ),
-                    name=f"create_{response.id}",
-                )
+        assert len(generators) == 1
+        result_generator, = generators

-                # For cleanup.
-                response_id = response.id
-                self.background_tasks[response_id] = task
-                task.add_done_callback(
-                    lambda _: self.background_tasks.pop(response_id, None))
-                return response
+        # Store the input messages.
+        if request.store:
+            self.msg_store[request.request_id] = messages

-            if request.stream:
-                return self.responses_stream_generator(
-                    request,
-                    sampling_params,
-                    result_generator,
-                    context,
-                    model_name,
-                    tokenizer,
-                    request_metadata,
-                )
+        if request.background:
+            created_time = int(time.time())
+            response = ResponsesResponse.from_request(
+                request,
+                sampling_params,
+                model_name=model_name,
+                created_time=created_time,
+                output=[],
+                status="queued",
+                usage=None,
+            )
+            async with self.response_store_lock:
+                self.response_store[response.id] = response

-            try:
-                return await self.responses_full_generator(
+            # Run the request in the background.
+            task = asyncio.create_task(
+                self._run_background_request(
                    request,
                    sampling_params,
                    result_generator,
@@ -370,10 +338,41 @@ class OpenAIServingResponses(OpenAIServing):
                    model_name,
                    tokenizer,
                    request_metadata,
-                )
-            except Exception as e:
-                return self.create_error_response(str(e))
-        return self.create_error_response("Should not reach here")
+                    created_time,
+                ),
+                name=f"create_{response.id}",
+            )
+
+            # For cleanup.
+            response_id = response.id
+            self.background_tasks[response_id] = task
+            task.add_done_callback(
+                lambda _: self.background_tasks.pop(response_id, None))
+            return response
+
+        if request.stream:
+            return self.responses_stream_generator(
+                request,
+                sampling_params,
+                result_generator,
+                context,
+                model_name,
+                tokenizer,
+                request_metadata,
+            )
+
+        try:
+            return await self.responses_full_generator(
+                request,
+                sampling_params,
+                result_generator,
+                context,
+                model_name,
+                tokenizer,
+                request_metadata,
+            )
+        except Exception as e:
+            return self.create_error_response(str(e))

    async def _make_request(
        self,
@@ -408,6 +407,11 @@ class OpenAIServingResponses(OpenAIServing):
            request, prev_response)
        prompt_token_ids = render_for_completion(messages)
        engine_prompt = EngineTokensPrompt(prompt_token_ids=prompt_token_ids)
+
+        # Add cache_salt if provided in the request
+        if request.cache_salt is not None:
+            engine_prompt["cache_salt"] = request.cache_salt
+
        return messages, [prompt_token_ids], [engine_prompt]

    async def responses_full_generator(
@@ -424,14 +428,16 @@ class OpenAIServingResponses(OpenAIServing):
        if created_time is None:
            created_time = int(time.time())

-        try:
-            async for _ in result_generator:
-                pass
-        except asyncio.CancelledError:
-            return self.create_error_response("Client disconnected")
-        except ValueError as e:
-            # TODO: Use a vllm-specific Validation Error
-            return self.create_error_response(str(e))
+        async with AsyncExitStack() as exit_stack:
+            try:
+                await context.init_tool_sessions(self.tool_server, exit_stack)
+                async for _ in result_generator:
+                    pass
+            except asyncio.CancelledError:
+                return self.create_error_response("Client disconnected")
+            except ValueError as e:
+                # TODO: Use a vllm-specific Validation Error
+                return self.create_error_response(str(e))

        if self.use_harmony:
            assert isinstance(context, HarmonyContext)
@@ -486,6 +492,51 @@ class OpenAIServingResponses(OpenAIServing):
                    self.response_store[response.id] = response
        return response

+    def _topk_logprobs(self, logprobs: dict[int,
+                                            SampleLogprob], top_logprobs: int,
+                       tokenizer: AnyTokenizer) -> list[LogprobTopLogprob]:
+        """Returns the top-k logprobs from the logprobs dictionary."""
+        out = []
+        for i, (token_id, _logprob) in enumerate(logprobs.items()):
+            if i >= top_logprobs:
+                break
+            text = _logprob.decoded_token if _logprob.decoded_token \
+                is not None else tokenizer.decode([token_id])
+            out.append(
+                LogprobTopLogprob(
+                    token=text,
+                    logprob=max(_logprob.logprob, -9999.0),
+                    bytes=list(text.encode("utf-8", errors="replace")),
+                ))
+        return out
+
+    def _create_response_logprobs(
+            self,
+            token_ids: Sequence[int],
+            logprobs: Optional[SampleLogprobs],
+            tokenizer: AnyTokenizer,
+            top_logprobs: Optional[int] = None) -> list[Logprob]:
+        assert logprobs is not None, "logprobs must be provided"
+        assert len(token_ids) == len(logprobs), (
+            "token_ids and logprobs.token_ids must have the same length")
+        out = []
+        for i, token_id in enumerate(token_ids):
+            logprob = logprobs[i]
+            token_logprob = logprob[token_id]
+            text = token_logprob.decoded_token if token_logprob.decoded_token \
+                is not None else tokenizer.decode([token_id])
+            out.append(
+                Logprob(
+                    token=text,
+                    logprob=max(token_logprob.logprob, -9999.0),
+                    bytes=list(text.encode("utf-8", errors="replace")),
+                    top_logprobs=self._topk_logprobs(logprob,
+                                                     top_logprobs=top_logprobs,
+                                                     tokenizer=tokenizer)
+                    if top_logprobs else [],
+                ))
+        return out
+
    def _make_response_output_items(
        self,
        request: ResponsesRequest,
@@ -542,7 +593,12 @@ class OpenAIServingResponses(OpenAIServing):
                text=content,
                annotations=[],  # TODO
                type="output_text",
-                logprobs=None,  # TODO
+                logprobs=self._create_response_logprobs(
+                    token_ids=final_output.token_ids,
+                    logprobs=final_output.logprobs,
+                    tokenizer=tokenizer,
+                    top_logprobs=request.top_logprobs,
+                ) if request.is_include_output_logprobs() else None,
            )
            message = ResponseOutputMessage(
                id=f"msg_{random_uuid()}",
@@ -773,7 +829,7 @@ class OpenAIServingResponses(OpenAIServing):
            status_code=HTTPStatus.BAD_REQUEST,
        )

-    async def responses_stream_generator(
+    async def _process_streaming_events(
        self,
        request: ResponsesRequest,
        sampling_params: SamplingParams,
@@ -782,18 +838,8 @@ class OpenAIServingResponses(OpenAIServing):
        model_name: str,
        tokenizer: AnyTokenizer,
        request_metadata: RequestResponseMetadata,
-        created_time: Optional[int] = None,
+        created_time: int,
    ) -> AsyncGenerator[str, None]:
-        # TODO:
-        # 1. Handle disconnect
-
-        if not isinstance(context, StreamingHarmonyContext):
-            raise NotImplementedError(
-                "Streaming is not supported for responses API without Harmony."
-            )
-
-        created_time = created_time or int(time.time())
-
        sequence_number = 0

        def _send_event(event: BaseModel):
@@ -1004,7 +1050,48 @@ class OpenAIServingResponses(OpenAIServing):
                            delta=ctx.parser.last_content_delta,
                            sequence_number=-1,
                        ))
-
+                # Built-in tools are normally triggered on the analysis
+                # channel; occasionally, however, the model still emits
+                # them on the commentary channel, so both are handled here.
+                elif (ctx.parser.current_channel == "commentary"
+                      or ctx.parser.current_channel == "analysis"
+                      ) and ctx.parser.current_recipient == "python":
+                    if not sent_output_item_added:
+                        sent_output_item_added = True
+                        yield _send_event(
+                            openai_responses_types.
+                            ResponseOutputItemAddedEvent(
+                                type="response.output_item.added",
+                                sequence_number=-1,
+                                output_index=current_output_index,
+                                item=openai_responses_types.
+                                ResponseCodeInterpreterToolCallParam(
+                                    type="code_interpreter_call",
+                                    id=current_item_id,
+                                    code=None,
+                                    container_id="auto",
+                                    outputs=None,
+                                    status="in_progress",
+                                ),
+                            ))
+                        yield _send_event(
+                            openai_responses_types.
+                            ResponseCodeInterpreterCallInProgressEvent(
+                                type=
+                                "response.code_interpreter_call.in_progress",
+                                sequence_number=-1,
+                                output_index=current_output_index,
+                                item_id=current_item_id,
+                            ))
+                    yield _send_event(
+                        openai_responses_types.
+                        ResponseCodeInterpreterCallCodeDeltaEvent(
+                            type="response.code_interpreter_call_code.delta",
+                            sequence_number=-1,
+                            output_index=current_output_index,
+                            item_id=current_item_id,
+                            delta=ctx.parser.last_content_delta,
+                        ))
            if ctx.is_assistant_action_turn() and len(ctx.parser.messages) > 0:
                previous_item = ctx.parser.messages[-1]
                if (self.tool_server is not None
@@ -1100,30 +1187,6 @@ class OpenAIServingResponses(OpenAIServing):
                        and self.tool_server.has_tool("python")
                        and previous_item.recipient is not None
                        and previous_item.recipient.startswith("python")):
-                    yield _send_event(
-                        openai_responses_types.ResponseOutputItemAddedEvent(
-                            type="response.output_item.added",
-                            sequence_number=-1,
-                            output_index=current_output_index,
-                            item=openai_responses_types.
-                            ResponseCodeInterpreterToolCallParam(
-                                type="code_interpreter_call",
-                                id=current_item_id,
-                                code="",
-                                container_id="auto",
-                                outputs=[],
-                                status="in_progress",
-                            ),
-                        ))
-                    yield _send_event(
-                        openai_responses_types.
-                        ResponseCodeInterpreterCallInProgressEvent(
-                            type="response.code_interpreter_call.in_progress",
-                            sequence_number=-1,
-                            output_index=current_output_index,
-                            item_id=current_item_id,
-                        ))
-                    # TODO: do we need to add delta event here?
                    yield _send_event(
                        openai_responses_types.
                        ResponseCodeInterpreterCallCodeDoneEvent(
@@ -1131,7 +1194,8 @@ class OpenAIServingResponses(OpenAIServing):
                            sequence_number=-1,
                            output_index=current_output_index,
                            item_id=current_item_id,
-                            code=previous_item.content[0].text))
+                            code=previous_item.content[0].text,
+                        ))
                    yield _send_event(
                        openai_responses_types.
                        ResponseCodeInterpreterCallInterpretingEvent(
@@ -1187,3 +1251,31 @@ class OpenAIServingResponses(OpenAIServing):
                sequence_number=-1,
                response=final_response.model_dump(),
            ))
+
+    async def responses_stream_generator(
+        self,
+        request: ResponsesRequest,
+        sampling_params: SamplingParams,
+        result_generator: AsyncIterator[Optional[ConversationContext]],
+        context: ConversationContext,
+        model_name: str,
+        tokenizer: AnyTokenizer,
+        request_metadata: RequestResponseMetadata,
+        created_time: Optional[int] = None,
+    ) -> AsyncGenerator[str, None]:
+        # TODO:
+        # 1. Handle disconnect
+
+        if not isinstance(context, StreamingHarmonyContext):
+            raise NotImplementedError(
+                "Streaming is not supported for responses API without Harmony."
+            )
+
+        created_time = created_time or int(time.time())
+
+        async with AsyncExitStack() as exit_stack:
+            await context.init_tool_sessions(self.tool_server, exit_stack)
+            async for event_data in self._process_streaming_events(
+                    request, sampling_params, result_generator, context,
+                    model_name, tokenizer, request_metadata, created_time):
+                yield event_data
--- a/vllm/entrypoints/openai/serving_score.py
+++ b/vllm/entrypoints/openai/serving_score.py
@@ -7,7 +7,6 @@ from typing import Any, Optional, Union

 from fastapi import Request

-from vllm import envs
 from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.logger import RequestLogger
@@ -47,11 +46,13 @@ class ServingScores(OpenAIServing):
        models: OpenAIServingModels,
        *,
        request_logger: Optional[RequestLogger],
+        log_error_stack: bool = False,
    ) -> None:
        super().__init__(engine_client=engine_client,
                         model_config=model_config,
                         models=models,
-                         request_logger=request_logger)
+                         request_logger=request_logger,
+                         log_error_stack=log_error_stack)

    async def _embedding_score(
        self,
@@ -227,8 +228,7 @@ class ServingScores(OpenAIServing):
                             params=default_pooling_params,
                             lora_request=lora_request)

-            if envs.VLLM_USE_V1 and (token_type_ids := engine_prompt.pop(
-                    "token_type_ids", None)):
+            if (token_type_ids := engine_prompt.pop("token_type_ids", None)):
                pooling_params = default_pooling_params.clone()
                compressed = compress_token_type_ids(token_type_ids)
                pooling_params.extra_kwargs = {
@@ -266,12 +266,14 @@ class ServingScores(OpenAIServing):
        request: Union[ScoreRequest, RerankRequest],
        request_id: str,
        raw_request: Optional[Request] = None,
-        truncate_prompt_tokens: Optional[int] = None,
    ) -> Union[list[PoolingRequestOutput], ErrorResponse]:
        lora_request = self._maybe_get_adapters(request)

        tokenizer = await self.engine_client.get_tokenizer(lora_request)

+        truncate_prompt_tokens = getattr(request, "truncate_prompt_tokens",
+                                         None)
+
        tokenization_kwargs: dict[str, Any] = {}
        _validate_truncation_size(self.max_model_len, truncate_prompt_tokens,
                                  tokenization_kwargs)
@@ -343,7 +345,6 @@ class ServingScores(OpenAIServing):
                request,
                request_id,
                raw_request,
-                request.truncate_prompt_tokens,
            )
            if isinstance(final_res_batch, ErrorResponse):
                return final_res_batch
@@ -391,7 +392,6 @@ class ServingScores(OpenAIServing):
                request,
                request_id,
                raw_request,
-                request.truncate_prompt_tokens,
            )
            if isinstance(final_res_batch, ErrorResponse):
                return final_res_batch

--- a/vllm/entrypoints/openai/serving_tokenization.py
+++ b/vllm/entrypoints/openai/serving_tokenization.py
@@ -39,11 +39,13 @@ class OpenAIServingTokenization(OpenAIServing):
        request_logger: Optional[RequestLogger],
        chat_template: Optional[str],
        chat_template_content_format: ChatTemplateContentFormatOption,
+        log_error_stack: bool = False,
    ) -> None:
        super().__init__(engine_client=engine_client,
                         model_config=model_config,
                         models=models,
-                         request_logger=request_logger)
+                         request_logger=request_logger,
+                         log_error_stack=log_error_stack)

        self.chat_template = chat_template
        self.chat_template_content_format: Final = chat_template_content_format

--- a/vllm/entrypoints/openai/serving_transcription.py
+++ b/vllm/entrypoints/openai/serving_transcription.py
@@ -32,13 +32,15 @@ class OpenAIServingTranscription(OpenAISpeechToText):
        *,
        request_logger: Optional[RequestLogger],
        return_tokens_as_token_ids: bool = False,
+        log_error_stack: bool = False,
    ):
        super().__init__(engine_client=engine_client,
                         model_config=model_config,
                         models=models,
                         request_logger=request_logger,
                         return_tokens_as_token_ids=return_tokens_as_token_ids,
-                         task_type="transcribe")
+                         task_type="transcribe",
+                         log_error_stack=log_error_stack)

    async def create_transcription(
        self, audio_data: bytes, request: TranscriptionRequest,
@@ -88,13 +90,15 @@ class OpenAIServingTranslation(OpenAISpeechToText):
        *,
        request_logger: Optional[RequestLogger],
        return_tokens_as_token_ids: bool = False,
+        log_error_stack: bool = False,
    ):
        super().__init__(engine_client=engine_client,
                         model_config=model_config,
                         models=models,
                         request_logger=request_logger,
                         return_tokens_as_token_ids=return_tokens_as_token_ids,
-                         task_type="translate")
+                         task_type="translate",
+                         log_error_stack=log_error_stack)

    async def create_translation(
        self, audio_data: bytes, request: TranslationRequest,

--- a/vllm/entrypoints/openai/speech_to_text.py
+++ b/vllm/entrypoints/openai/speech_to_text.py
@@ -53,12 +53,14 @@ class OpenAISpeechToText(OpenAIServing):
        request_logger: Optional[RequestLogger],
        return_tokens_as_token_ids: bool = False,
        task_type: Literal["transcribe", "translate"] = "transcribe",
+        log_error_stack: bool = False,
    ):
        super().__init__(engine_client=engine_client,
                         model_config=model_config,
                         models=models,
                         request_logger=request_logger,
-                         return_tokens_as_token_ids=return_tokens_as_token_ids)
+                         return_tokens_as_token_ids=return_tokens_as_token_ids,
+                         log_error_stack=log_error_stack)

        self.default_sampling_params = (
            self.model_config.get_diff_sampling_param())
@@ -200,7 +202,22 @@ class OpenAISpeechToText(OpenAIServing):
            for result_generator in list_result_generator:
                async for op in result_generator:
                    text += op.outputs[0].text
-            return cast(T, response_class(text=text))
+
+            if self.task_type == "transcribe":
+                # Add usage to the TranscriptionResponse.
+                usage = {
+                    "type": "duration",
+                    # Rounded up, as per the OpenAI spec.
+                    "seconds": int(math.ceil(duration_s)),
+                }
+                final_response = cast(T, response_class(text=text,
+                                                        usage=usage))
+            else:
+                # no usage in response for translation task
+                final_response = cast(
+                    T, response_class(text=text))  # type: ignore[call-arg]
+
+            return final_response
        except asyncio.CancelledError:
            return self.create_error_response("Client disconnected")
        except ValueError as e:

--- a/vllm/entrypoints/openai/tool_parsers/__init__.py
+++ b/vllm/entrypoints/openai/tool_parsers/__init__.py
@@ -3,6 +3,7 @@

 from .abstract_tool_parser import ToolParser, ToolParserManager
 from .deepseekv3_tool_parser import DeepSeekV3ToolParser
+from .deepseekv31_tool_parser import DeepSeekV31ToolParser
 from .glm4_moe_tool_parser import Glm4MoeModelToolParser
 from .granite_20b_fc_tool_parser import Granite20bFCToolParser
 from .granite_tool_parser import GraniteToolParser
@@ -18,6 +19,7 @@ from .mistral_tool_parser import MistralToolParser
 from .phi4mini_tool_parser import Phi4MiniJsonToolParser
 from .pythonic_tool_parser import PythonicToolParser
 from .qwen3coder_tool_parser import Qwen3CoderToolParser
+from .seed_oss_tool_parser import SeedOssToolParser
 from .step3_tool_parser import Step3ToolParser
 from .xlam_tool_parser import xLAMToolParser

@@ -35,11 +37,13 @@ __all__ = [
    "PythonicToolParser",
    "Phi4MiniJsonToolParser",
    "DeepSeekV3ToolParser",
+    "DeepSeekV31ToolParser",
    "xLAMToolParser",
    "MinimaxToolParser",
    "KimiK2ToolParser",
    "HunyuanA13BToolParser",
    "Glm4MoeModelToolParser",
    "Qwen3CoderToolParser",
+    "SeedOssToolParser",
    "Step3ToolParser",
 ]
--- a/vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from collections.abc import Sequence
+from typing import Union
+
+import regex as re
+
+from vllm.entrypoints.chat_utils import make_tool_call_id
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              DeltaFunctionCall, DeltaMessage,
+                                              DeltaToolCall,
+                                              ExtractedToolCallInformation,
+                                              FunctionCall, ToolCall)
+from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
+    ToolParser, ToolParserManager)
+from vllm.logger import init_logger
+from vllm.transformers_utils.tokenizer import AnyTokenizer
+
+logger = init_logger(__name__)
+
+
+@ToolParserManager.register_module("deepseek_v31")
+class DeepSeekV31ToolParser(ToolParser):
+
+    def __init__(self, tokenizer: AnyTokenizer):
+        """Set up streaming state, sentinel tokens and parsing regexes.
+
+        Args:
+            tokenizer: Tokenizer forwarded to the ``ToolParser`` base class.
+
+        Raises:
+            ValueError: If ``self.model_tokenizer`` is falsy after the base
+                class initializer has run.
+            RuntimeError: If the ``tool_calls`` begin/end sentinel tokens
+                cannot be found in ``self.vocab``.
+        """
+        super().__init__(tokenizer)
+
+        # Streaming state, mutated by extract_tool_calls_streaming().
+        self.current_tool_name_sent: bool = False
+        self.prev_tool_call_arr: list[dict] = []
+        self.current_tool_id: int = -1
+        # What has been streamed so far for each tool call, by tool index.
+        self.streamed_args_for_tool: list[str] = []
+
+        # Sentinels wrapping the whole tool-calls section ...
+        self.tool_calls_start_token: str = "<｜tool▁calls▁begin｜>"
+        self.tool_calls_end_token: str = "<｜tool▁calls▁end｜>"
+
+        # ... and the ones wrapping each individual call.
+        self.tool_call_start_token: str = "<｜tool▁call▁begin｜>"
+        self.tool_call_end_token: str = "<｜tool▁call▁end｜>"
+
+        # Both groups are non-greedy on purpose: several tool calls can sit
+        # on the same line, and a greedy ``.*`` would merge all of them into
+        # one match whose "name" swallows the preceding calls.
+        self.tool_call_regex = re.compile(
+            r"<｜tool▁call▁begin｜>(?P<function_name>.*?)<｜tool▁sep｜>"
+            r"(?P<function_arguments>.*?)<｜tool▁call▁end｜>")
+
+        # Streaming variants are applied to the text after the last
+        # tool-call begin token. The name stops at the first separator;
+        # the arguments take the remainder of the line.
+        self.stream_tool_call_portion_regex = re.compile(
+            r"(?P<function_name>.*?)<｜tool▁sep｜>(?P<function_arguments>.*)")
+
+        self.stream_tool_call_name_regex = re.compile(
+            r"(?P<function_name>.*?)<｜tool▁sep｜>")
+
+        if not self.model_tokenizer:
+            raise ValueError(
+                "The model tokenizer must be passed to the ToolParser "
+                "constructor during construction.")
+        self.tool_calls_start_token_id = self.vocab.get(
+            self.tool_calls_start_token)
+        self.tool_calls_end_token_id = self.vocab.get(
+            self.tool_calls_end_token)
+
+        # NOTE(review): the per-call begin/end token ids below are not
+        # validated; presumably they are None when missing from the vocab -
+        # confirm whether that should be an error as well.
+        self.tool_call_start_token_id = self.vocab.get(
+            self.tool_call_start_token)
+        self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token)
+
+        if (self.tool_calls_start_token_id is None
+                or self.tool_calls_end_token_id is None):
+            raise RuntimeError(
+                "DeepSeek-V3.1 Tool parser could not locate tool call "
+                "start/end tokens in the tokenizer!")
+
+    def extract_tool_calls(
+        self,
+        model_output: str,
+        request: ChatCompletionRequest,
+    ) -> ExtractedToolCallInformation:
+        """Extract all tool calls from a complete (non-streamed) response.
+
+        Args:
+            model_output: Full text generated by the model.
+            request: The originating chat completion request (not read
+                by this method).
+
+        Returns:
+            ``tools_called=True`` with the parsed calls and the text that
+            precedes the tool-calls start token (``None`` when that text is
+            empty). If the start token is absent, or parsing raises, the
+            unmodified output is returned as content with
+            ``tools_called=False``.
+        """
+        # fast path: without the opening sentinel there is nothing to parse
+        if self.tool_calls_start_token not in model_output:
+            return ExtractedToolCallInformation(tools_called=False,
+                                                tool_calls=[],
+                                                content=model_output)
+
+        try:
+            # the regex has two groups, so findall() yields one
+            # (function_name, function_arguments) tuple per tool call
+            tool_calls = [
+                ToolCall(
+                    type="function",
+                    function=FunctionCall(name=name, arguments=arguments),
+                ) for name, arguments in self.tool_call_regex.findall(
+                    model_output)
+            ]
+
+            # whatever precedes the tool-calls section is regular content
+            preamble, _, _ = model_output.partition(
+                self.tool_calls_start_token)
+            return ExtractedToolCallInformation(
+                tools_called=True,
+                tool_calls=tool_calls,
+                content=preamble or None,
+            )
+
+        except Exception:
+            logger.exception("Error in extracting tool call from response.")
+            return ExtractedToolCallInformation(tools_called=False,
+                                                tool_calls=[],
+                                                content=model_output)
+
+    def extract_tool_calls_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+        request: ChatCompletionRequest,
+    ) -> Union[DeltaMessage, None]:
+        """Turn one streamed chunk into a content or tool-call delta.
+
+        The position inside the tool-call markup is derived by counting the
+        per-call begin/end token ids in the previous and current token
+        sequences. The function name is sent once per tool call; afterwards
+        only the newly appended part of the raw argument text is sent.
+
+        Args:
+            previous_text: Text generated before this chunk (not read here).
+            current_text: Text generated so far.
+            delta_text: Text of this chunk.
+            previous_token_ids: Token ids generated before this chunk.
+            current_token_ids: Token ids generated so far.
+            delta_token_ids: Token ids of this chunk.
+            request: The originating request (not read by this method).
+
+        Returns:
+            A ``DeltaMessage`` carrying plain content or a tool-call delta,
+            or ``None`` when nothing should be emitted for this chunk
+            (including when an exception was caught and logged).
+        """
+
+        logger.debug("delta_text: %s", delta_text)
+        logger.debug("delta_token_ids: %s", delta_token_ids)
+        # check to see if we should be streaming a tool call - is there a
+        # tool-calls start token anywhere in the tokens generated so far?
+        if self.tool_calls_start_token_id not in current_token_ids:
+            logger.debug("No tool call tokens found!")
+            return DeltaMessage(content=delta_text)
+        # the outer section sentinels are never part of any emitted delta
+        delta_text = delta_text.replace(self.tool_calls_start_token,
+                                        "").replace(self.tool_calls_end_token,
+                                                    "")
+        try:
+
+            # figure out where we are in the parsing by counting tool call
+            # start & end tags
+            prev_tool_start_count = previous_token_ids.count(
+                self.tool_call_start_token_id)
+            prev_tool_end_count = previous_token_ids.count(
+                self.tool_call_end_token_id)
+            cur_tool_start_count = current_token_ids.count(
+                self.tool_call_start_token_id)
+            cur_tool_end_count = current_token_ids.count(
+                self.tool_call_end_token_id)
+            tool_call_portion = None
+            text_portion = None
+
+            # case: if we're generating text, OR rounding out a tool call
+            if (cur_tool_start_count == cur_tool_end_count
+                    and prev_tool_end_count == cur_tool_end_count
+                    and self.tool_call_end_token not in delta_text):
+                logger.debug("Generating text content! skipping tool parsing.")
+                return DeltaMessage(content=delta_text)
+
+            if self.tool_call_end_token in delta_text:
+                logger.debug("tool_call_end_token in delta_text")
+                # NOTE(review): current_text presumably already ends with
+                # delta_text, which would make this concatenation
+                # redundant - confirm against the caller.
+                full_text = current_text + delta_text
+                tool_call_portion = full_text.split(
+                    self.tool_call_start_token)[-1].split(
+                        self.tool_call_end_token)[0].rstrip()
+                delta_text = delta_text.split(
+                    self.tool_call_end_token)[0].rstrip()
+                # delta_text was truncated just above, so text_portion only
+                # re-splits the truncated text; below it is only ever tested
+                # against None.
+                text_portion = delta_text.split(
+                    self.tool_call_end_token)[-1].lstrip()
+
+            # case -- we're starting a new tool call
+            if (cur_tool_start_count > cur_tool_end_count
+                    and cur_tool_start_count > prev_tool_start_count):
+                if len(delta_token_ids) > 1:
+                    tool_call_portion = current_text.split(
+                        self.tool_call_start_token)[-1]
+                else:
+                    tool_call_portion = None
+
+                text_portion = None
+
+                # set cursors and state appropriately
+                self.current_tool_id += 1
+                self.current_tool_name_sent = False
+                self.streamed_args_for_tool.append("")
+                logger.debug("Starting on a new tool %s", self.current_tool_id)
+
+            # case -- we're updating an existing tool call
+            elif (cur_tool_start_count > cur_tool_end_count
+                  and cur_tool_start_count == prev_tool_start_count):
+
+                # get the portion of the text that's the tool call
+                tool_call_portion = current_text.split(
+                    self.tool_call_start_token)[-1]
+                text_portion = None
+
+            # case -- the current tool call is being closed.
+            elif (cur_tool_start_count == cur_tool_end_count
+                  and cur_tool_end_count >= prev_tool_end_count):
+                if self.prev_tool_call_arr is None or len(
+                        self.prev_tool_call_arr) == 0:
+                    logger.debug(
+                        "attempting to close tool call, but no tool call")
+                    return None
+                # Only used to tell whether any arguments were recorded for
+                # this tool. (A former ``diff is str`` branch here was always
+                # False and its result was overwritten below, so it is gone.)
+                recorded_arguments = self.prev_tool_call_arr[
+                    self.current_tool_id].get("arguments")
+                if recorded_arguments:
+                    if '"}' not in delta_text:
+                        return None
+                    end_loc = delta_text.rindex('"}')
+                    diff = delta_text[:end_loc] + '"}'
+                    logger.debug(
+                        "Finishing tool and found diff that had not "
+                        "been streamed yet: %s",
+                        diff,
+                    )
+                    self.streamed_args_for_tool[self.current_tool_id] += diff
+                    return DeltaMessage(tool_calls=[
+                        DeltaToolCall(
+                            index=self.current_tool_id,
+                            function=DeltaFunctionCall(
+                                arguments=diff).model_dump(exclude_none=True),
+                        )
+                    ])
+
+            # case -- otherwise we're just generating text
+            else:
+                text = delta_text.replace(self.tool_call_start_token, "")
+                text = text.replace(self.tool_call_end_token, "")
+                delta = DeltaMessage(tool_calls=[], content=text)
+                return delta
+
+            # split the raw tool call text into name and argument string
+            current_tool_call: dict[str, str] = {}
+            if tool_call_portion:
+                current_tool_call_matches = (
+                    self.stream_tool_call_portion_regex.match(
+                        tool_call_portion))
+                if current_tool_call_matches:
+                    tool_name, tool_args = current_tool_call_matches.groups()
+                    current_tool_call["name"] = tool_name
+                    current_tool_call["arguments"] = tool_args
+                else:
+                    current_tool_call_name_matches = (
+                        self.stream_tool_call_name_regex.match(
+                            tool_call_portion))
+                    if current_tool_call_name_matches:
+                        # group() returns the name string; groups() would
+                        # have stored a 1-tuple as the function name
+                        tool_name = current_tool_call_name_matches.group(
+                            "function_name")
+                        current_tool_call["name"] = tool_name
+                        current_tool_call["arguments"] = ""
+                    else:
+                        logger.debug("Not enough token")
+                        return None
+
+            # case - we haven't sent the tool name yet. If it's available, send
+            #   it. otherwise, wait until it's available.
+            if not self.current_tool_name_sent:
+                function_name: Union[str, None] = current_tool_call.get("name")
+                if function_name:
+                    self.current_tool_name_sent = True
+                    return DeltaMessage(tool_calls=[
+                        DeltaToolCall(
+                            index=self.current_tool_id,
+                            type="function",
+                            id=make_tool_call_id(),
+                            function=DeltaFunctionCall(
+                                name=function_name).model_dump(
+                                    exclude_none=True),
+                        )
+                    ])
+                else:
+                    return None
+
+            # case -- otherwise, send the tool call delta
+
+            # if the tool call portion is None, send the delta as text
+            if tool_call_portion is None:
+                # if there's text but not tool calls, send that -
+                # otherwise None to skip chunk
+                delta = (DeltaMessage(
+                    content=delta_text) if text_portion is not None else None)
+                return delta
+
+            # now, the nitty-gritty of tool calls
+            # now we have the portion to parse as tool call.
+
+            logger.debug("Trying to parse current tool call with ID %s",
+                         self.current_tool_id)
+
+            # if we're starting a new tool call, push an empty object in as
+            #   a placeholder for the arguments
+            if len(self.prev_tool_call_arr) <= self.current_tool_id:
+                self.prev_tool_call_arr.append({})
+
+            # main logic for tool parsing here - compare the argument text
+            #   recorded on the previous chunk with the current argument text
+            prev_arguments = self.prev_tool_call_arr[self.current_tool_id].get(
+                "arguments")
+            cur_arguments = current_tool_call.get("arguments")
+
+            logger.debug("diffing old arguments: %s", prev_arguments)
+            logger.debug("against new ones: %s", cur_arguments)
+
+            # case -- no arguments have been created yet. skip sending a delta.
+            if not cur_arguments and not prev_arguments:
+                logger.debug("Skipping text %s - no arguments", delta_text)
+                delta = None
+
+            # case -- prev arguments are defined, but none are now.
+            #   probably impossible, but not a fatal error - just keep going
+            elif not cur_arguments and prev_arguments:
+                logger.error("should be impossible to have arguments reset "
+                             "mid-call. skipping streaming anything.")
+                delta = None
+
+            # case -- this is the first argument text seen for this tool
+            #   call; send all of it
+            elif cur_arguments and not prev_arguments:
+
+                delta = DeltaMessage(tool_calls=[
+                    DeltaToolCall(
+                        index=self.current_tool_id,
+                        function=DeltaFunctionCall(
+                            arguments=cur_arguments).model_dump(
+                                exclude_none=True),
+                    )
+                ])
+                self.streamed_args_for_tool[
+                    self.current_tool_id] = cur_arguments
+
+            # last case -- we have an update to existing arguments.
+            elif cur_arguments and prev_arguments:
+                # only a strict extension of the previous text can be
+                # streamed as a suffix; anything else is skipped
+                if (len(cur_arguments) > len(prev_arguments)
+                        and cur_arguments.startswith(prev_arguments)):
+                    delta_arguments = cur_arguments[len(prev_arguments):]
+                    logger.debug("got diff %s", delta_text)
+
+                    delta = DeltaMessage(tool_calls=[
+                        DeltaToolCall(
+                            index=self.current_tool_id,
+                            function=DeltaFunctionCall(
+                                arguments=delta_arguments).model_dump(
+                                    exclude_none=True),
+                        )
+                    ])
+                    self.streamed_args_for_tool[
+                        self.current_tool_id] = cur_arguments
+                else:
+                    delta = None
+
+            # handle saving the state for the current tool into
+            # the "prev" list for use in diffing for the next iteration
+            if self.current_tool_id == len(self.prev_tool_call_arr) - 1:
+                self.prev_tool_call_arr[
+                    self.current_tool_id] = current_tool_call
+            else:
+                self.prev_tool_call_arr.append(current_tool_call)
+
+            return delta
+
+        except Exception:
+            logger.exception("Error trying to handle streaming tool call.")
+            return None  # do not stream a delta. skip this token ID.
--- a/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py
@@ -6,7 +6,7 @@ from typing import Union

 import regex as re

-from vllm.entrypoints.chat_utils import random_tool_call_id
+from vllm.entrypoints.chat_utils import make_tool_call_id
 from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                              DeltaFunctionCall, DeltaMessage,
                                              DeltaToolCall,
@@ -267,7 +267,7 @@ class DeepSeekV3ToolParser(ToolParser):
                        DeltaToolCall(
                            index=self.current_tool_id,
                            type="function",
-                            id=random_tool_call_id(),
+                            id=make_tool_call_id(),
                            function=DeltaFunctionCall(
                                name=function_name).model_dump(
                                    exclude_none=True),

--- a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
@@ -10,7 +10,7 @@ import partial_json_parser
 import regex as re
 from partial_json_parser.core.options import Allow

-from vllm.entrypoints.chat_utils import random_tool_call_id
+from vllm.entrypoints.chat_utils import make_tool_call_id
 from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                              DeltaFunctionCall, DeltaMessage,
                                              DeltaToolCall,
@@ -203,7 +203,7 @@ class Granite20bFCToolParser(ToolParser):
                    delta = DeltaMessage(tool_calls=[
                        DeltaToolCall(index=self.current_tool_id,
                                      type="function",
-                                      id=random_tool_call_id(),
+                                      id=make_tool_call_id(),
                                      function=DeltaFunctionCall(
                                          name=function_name).model_dump(
                                              exclude_none=True))

--- a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
@@ -8,7 +8,7 @@ from typing import Union
 import partial_json_parser
 from partial_json_parser.core.options import Allow

-from vllm.entrypoints.chat_utils import random_tool_call_id
+from vllm.entrypoints.chat_utils import make_tool_call_id
 from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                              DeltaFunctionCall, DeltaMessage,
                                              DeltaToolCall,
@@ -185,7 +185,7 @@ class GraniteToolParser(ToolParser):
                    delta = DeltaMessage(tool_calls=[
                        DeltaToolCall(index=self.current_tool_id,
                                      type="function",
-                                      id=random_tool_call_id(),
+                                      id=make_tool_call_id(),
                                      function=DeltaFunctionCall(
                                          name=function_name).model_dump(
                                              exclude_none=True))

--- a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
@@ -9,7 +9,7 @@ import partial_json_parser
 import regex as re
 from partial_json_parser.core.options import Allow

-from vllm.entrypoints.chat_utils import random_tool_call_id
+from vllm.entrypoints.chat_utils import make_tool_call_id
 from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                              DeltaFunctionCall, DeltaMessage,
                                              DeltaToolCall,
@@ -307,7 +307,7 @@ class Hermes2ProToolParser(ToolParser):
                    return DeltaMessage(tool_calls=[
                        DeltaToolCall(index=self.current_tool_id,
                                      type="function",
-                                      id=random_tool_call_id(),
+                                      id=make_tool_call_id(),
                                      function=DeltaFunctionCall(
                                          name=function_name).model_dump(
                                              exclude_none=True))

--- a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
@@ -8,7 +8,7 @@ from typing import Union
 import partial_json_parser
 from partial_json_parser.core.options import Allow

-from vllm.entrypoints.chat_utils import random_tool_call_id
+from vllm.entrypoints.chat_utils import make_tool_call_id
 from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                              DeltaFunctionCall, DeltaMessage,
                                              DeltaToolCall,
@@ -107,7 +107,7 @@ class Internlm2ToolParser(ToolParser):
                    delta = DeltaMessage(tool_calls=[
                        DeltaToolCall(index=self.current_tool_id,
                                      type="function",
-                                      id=random_tool_call_id(),
+                                      id=make_tool_call_id(),
                                      function=DeltaFunctionCall(
                                          name=function_name).model_dump(
                                              exclude_none=True))

--- a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py
@@ -9,7 +9,7 @@ import partial_json_parser
 import regex as re
 from partial_json_parser.core.options import Allow

-from vllm.entrypoints.chat_utils import random_tool_call_id
+from vllm.entrypoints.chat_utils import make_tool_call_id
 from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                              DeltaFunctionCall, DeltaMessage,
                                              DeltaToolCall,
@@ -222,7 +222,7 @@ class JambaToolParser(ToolParser):
                    delta = DeltaMessage(tool_calls=[
                        DeltaToolCall(index=self.current_tool_id,
                                      type="function",
-                                      id=random_tool_call_id(),
+                                      id=make_tool_call_id(),
                                      function=DeltaFunctionCall(
                                          name=function_name).model_dump(
                                              exclude_none=True))

--- a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
@@ -10,7 +10,7 @@ import regex as re
 from partial_json_parser.core.options import Allow
 from transformers import PreTrainedTokenizerBase

-from vllm.entrypoints.chat_utils import random_tool_call_id
+from vllm.entrypoints.chat_utils import make_tool_call_id
 from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                              DeltaFunctionCall, DeltaMessage,
                                              DeltaToolCall,
@@ -213,7 +213,7 @@ class Llama3JsonToolParser(ToolParser):
                    delta = DeltaMessage(tool_calls=[
                        DeltaToolCall(index=self.current_tool_id,
                                      type="function",
-                                      id=random_tool_call_id(),
+                                      id=make_tool_call_id(),
                                      function=DeltaFunctionCall(
                                          name=function_name).model_dump(
                                              exclude_none=True))

--- a/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py
@@ -7,7 +7,7 @@ from typing import Any, Optional, Union

 import regex as re

-from vllm.entrypoints.chat_utils import random_tool_call_id
+from vllm.entrypoints.chat_utils import make_tool_call_id
 from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                              DeltaFunctionCall, DeltaMessage,
                                              DeltaToolCall,
@@ -394,7 +394,7 @@ class MinimaxToolParser(ToolParser):
            sent_tools.append({
                "sent_name": False,
                "sent_arguments": "",
-                "id": random_tool_call_id(),
+                "id": make_tool_call_id(),
            })

        while len(tool_ids) < tool_count:
@@ -461,7 +461,8 @@ class MinimaxToolParser(ToolParser):
                i += 1
        return boundaries

-    def _extract_tool_args(self, tool_content: str, args_match) -> str:
+    def _extract_tool_args(self, tool_content: str,
+                           args_match: re.Match[str]) -> str:
        """
        Extract tool arguments from tool content.
        

--- a/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py
@@ -8,7 +8,7 @@ from typing import Any, Optional
 import regex as re
 from transformers import PreTrainedTokenizerBase

-from vllm.entrypoints.chat_utils import random_tool_call_id
+from vllm.entrypoints.chat_utils import make_tool_call_id
 from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                              DeltaMessage,
                                              ExtractedToolCallInformation,
@@ -74,7 +74,7 @@ class Phi4MiniJsonToolParser(ToolParser):

            tool_calls: list[ToolCall] = [
                ToolCall(
-                    id=random_tool_call_id(),
+                    id=make_tool_call_id(),
                    type="function",
                    function=FunctionCall(
                        name=raw_function_call["name"],

--- a/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
+import ast
 import json
 import uuid
 from collections.abc import Sequence
@@ -22,7 +22,7 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer
 logger = init_logger(__name__)


-@ToolParserManager.register_module(["qwen3_coder"])
+@ToolParserManager.register_module("qwen3_coder")
 class Qwen3CoderToolParser(ToolParser):

    def __init__(self, tokenizer: AnyTokenizer):
@@ -30,6 +30,8 @@ class Qwen3CoderToolParser(ToolParser):

        self.current_tool_name_sent: bool = False
        self.prev_tool_call_arr: list[dict] = []
+        # Override base class type - we use string IDs for tool calls
+        self.current_tool_id: Optional[str] = None  # type: ignore
        self.streamed_args_for_tool: list[str] = []

        # Sentinel tokens for streaming mode
@@ -42,20 +44,6 @@ class Qwen3CoderToolParser(ToolParser):
        self.is_tool_call_started: bool = False
        self.failed_count: int = 0

-        # Streaming state variables
-        self.current_tool_index: int = 0
-        self.header_sent: bool = False
-        self.current_tool_string_id: Optional[str] = None
-        self.current_function_name: Optional[str] = None
-        self.current_param_name: Optional[str] = None
-        self.current_param_value: str = ""
-        self.param_count: int = 0
-        self.in_param: bool = False
-        self.in_function: bool = False
-        self.accumulated_text: str = ""
-        self.json_started: bool = False
-        self.json_closed: bool = False
-
        # Enhanced streaming state - reset for each new message
        self._reset_streaming_state()

@@ -67,7 +55,8 @@ class Qwen3CoderToolParser(ToolParser):
        self.tool_call_function_regex = re.compile(
            r"<function=(.*?)</function>|<function=(.*)$", re.DOTALL)
        self.tool_call_parameter_regex = re.compile(
-            r"<parameter=(.*?)</parameter>|<parameter=(.*?)$", re.DOTALL)
+            r"<parameter=(.*?)(?:</parameter>|(?=<parameter=)|(?=</function>)|$)",
+            re.DOTALL)

        if not self.model_tokenizer:
            raise ValueError(
@@ -84,8 +73,8 @@ class Qwen3CoderToolParser(ToolParser):
                "Qwen3 XML Tool parser could not locate tool call start/end "
                "tokens in the tokenizer!")

-        logger.debug("vLLM Successfully import tool parser %s !",
-                     self.__class__.__name__)
+        logger.info("vLLM Successfully import tool parser %s !",
+                    self.__class__.__name__)

    def _generate_tool_call_id(self) -> str:
        """Generate a unique tool call ID."""
@@ -96,7 +85,7 @@ class Qwen3CoderToolParser(ToolParser):
        self.current_tool_index = 0
        self.is_tool_call_started = False
        self.header_sent = False
-        self.current_tool_string_id = None
+        self.current_tool_id = None
        self.current_function_name = None
        self.current_param_name = None
        self.current_param_value = ""
@@ -106,122 +95,122 @@ class Qwen3CoderToolParser(ToolParser):
        self.accumulated_text = ""
        self.json_started = False
        self.json_closed = False
-
-    def _parse_xml_function_call(
-            self, function_call_str: str,
-            tools: Optional[list[ChatCompletionToolsParam]]
-    ) -> Optional[ToolCall]:
-
-        def get_arguments_config(func_name: str) -> dict:
-            if tools is None:
-                return {}
-            for config in tools:
-                if not hasattr(config, "type") or not (
-                        hasattr(config, "function")
-                        and hasattr(config.function, "name")):
-                    continue
-                if (config.type == "function"
-                        and config.function.name == func_name):
-                    if not hasattr(config.function, "parameters"):
-                        return {}
-                    params = config.function.parameters
-                    if isinstance(params, dict) and "properties" in params:
-                        return params["properties"]
-                    elif isinstance(params, dict):
-                        return params
-                    else:
-                        return {}
-            logger.warning("Tool '%s' is not defined in the tools list.",
-                           func_name)
+        # Store accumulated parameters for type conversion
+        self.accumulated_params = {}
+        self.streaming_request = None
+
+    def _get_arguments_config(
+            self, func_name: str,
+            tools: Optional[list[ChatCompletionToolsParam]]) -> dict:
+        """Extract argument configuration for a function."""
+        if tools is None:
            return {}
+        for config in tools:
+            if not hasattr(config, "type") or not (hasattr(
+                    config, "function") and hasattr(config.function, "name")):
+                continue
+            if config.type == "function" and config.function.name == func_name:
+                if not hasattr(config.function, "parameters"):
+                    return {}
+                params = config.function.parameters
+                if isinstance(params, dict) and "properties" in params:
+                    return params["properties"]
+                elif isinstance(params, dict):
+                    return params
+                else:
+                    return {}
+        logger.warning("Tool '%s' is not defined in the tools list.",
+                       func_name)
+        return {}
+
+    def _convert_param_value(self, param_value: str, param_name: str,
+                             param_config: dict, func_name: str) -> Any:
+        """Convert parameter value based on its type in the schema."""
+        # Handle null value for any type
+        if param_value.lower() == "null":
+            return None

-        def convert_param_value(param_value: str, param_name: str,
-                                param_config: dict, func_name: str) -> Any:
-            # Handle null value for any type
-            if param_value.lower() == "null":
-                return None
-
-            converted_value: Any
-
-            if param_name not in param_config:
-                if param_config != {}:
-                    logger.warning(
-                        "Parsed parameter '%s' is not defined in the tool "
-                        "parameters for tool '%s', directly returning the "
-                        "string value.", param_name, func_name)
-                return param_value
-
-            if (isinstance(param_config[param_name], dict)
-                    and "type" in param_config[param_name]):
-                param_type = str(
-                    param_config[param_name]["type"]).strip().lower()
-            else:
-                param_type = "string"
-            if param_type in [
-                    "string", "str", "text", "varchar", "char", "enum"
-            ]:
+        if param_name not in param_config:
+            if param_config != {}:
+                logger.warning(
+                    "Parsed parameter '%s' is not defined in the tool "
+                    "parameters for tool '%s', directly returning the "
+                    "string value.", param_name, func_name)
+            return param_value
+
+        if isinstance(param_config[param_name],
+                      dict) and "type" in param_config[param_name]:
+            param_type = str(param_config[param_name]["type"]).strip().lower()
+        else:
+            param_type = "string"
+        if param_type in ["string", "str", "text", "varchar", "char", "enum"]:
+            return param_value
+        elif param_type.startswith("int") or param_type.startswith(
+                "uint") or param_type.startswith(
+                    "long") or param_type.startswith(
+                        "short") or param_type.startswith("unsigned"):
+            try:
+                return int(param_value)
+            except (ValueError, TypeError):
+                logger.warning(
+                    "Parsed value '%s' of parameter '%s' is not an "
+                    "integer in tool '%s', degenerating to string.",
+                    param_value, param_name, func_name)
                return param_value
-            elif (param_type.startswith("int") or param_type.startswith("uint")
-                  or param_type.startswith("long")
-                  or param_type.startswith("short")
-                  or param_type.startswith("unsigned")):
-                try:
-                    converted_value = int(param_value)
-                    return converted_value
-                except ValueError:
-                    logger.warning(
-                        "Parsed value '%s' of parameter '%s' is not an "
-                        "integer in tool '%s', degenerating to string.",
-                        param_value, param_name, func_name)
+        elif param_type.startswith("num") or param_type.startswith("float"):
+            try:
+                float_param_value = float(param_value)
+                return float_param_value if float_param_value - int(
+                    float_param_value) != 0 else int(float_param_value)
+            except (ValueError, TypeError):
+                logger.warning(
+                    "Parsed value '%s' of parameter '%s' is not a float "
+                    "in tool '%s', degenerating to string.", param_value,
+                    param_name, func_name)
                return param_value
-            elif (param_type.startswith("num")
-                  or param_type.startswith("float")):
+        elif param_type in ["boolean", "bool", "binary"]:
+            param_value = param_value.lower()
+            if param_value not in ["true", "false"]:
+                logger.warning(
+                    "Parsed value '%s' of parameter '%s' is not a boolean "
+                    "(`true` or `false`) in tool '%s', degenerating to "
+                    "false.", param_value, param_name, func_name)
+            return param_value == "true"
+        else:
+            if param_type in ["object", "array", "arr"
+                              ] or param_type.startswith(
+                                  "dict") or param_type.startswith("list"):
                try:
-                    float_param_value = float(param_value)
-                    converted_value = (float_param_value if float_param_value -
-                                       int(float_param_value) != 0 else
-                                       int(float_param_value))
-                    return converted_value
-                except ValueError:
+                    param_value = json.loads(param_value)
+                    return param_value
+                except (json.JSONDecodeError, TypeError, ValueError):
                    logger.warning(
-                        "Parsed value '%s' of parameter '%s' is not a float "
-                        "in tool '%s', degenerating to string.", param_value,
-                        param_name, func_name)
-                return param_value
-            elif param_type in ["boolean", "bool", "binary"]:
-                param_value = param_value.lower()
-                if param_value not in ["true", "false"]:
-                    logger.warning(
-                        "Parsed value '%s' of parameter '%s' is not a "
-                        "boolean (`true` of `false`) in tool '%s', "
-                        "degenerating to false.", param_value, param_name,
+                        "Parsed value '%s' of parameter '%s' cannot be "
+                        "parsed with json.loads in tool '%s', will try "
+                        "other methods to parse it.", param_value, param_name,
                        func_name)
-                return param_value == "true"
-            else:
-                if param_type == "object" or param_type.startswith("dict"):
-                    try:
-                        converted_value = json.loads(param_value)
-                        return converted_value
-                    except json.JSONDecodeError:
-                        logger.warning(
-                            "Parsed value '%s' of parameter '%s' is not a "
-                            "valid JSON object in tool '%s', will try other "
-                            "methods to parse it.", param_value, param_name,
-                            func_name)
+            try:
+                param_value = ast.literal_eval(param_value)  # safer
+            except (ValueError, SyntaxError, TypeError):
                logger.warning(
-                    "Parameter '%s' has unknown type '%s'. "
-                    "The value will be treated as a string.", param_name,
-                    param_type)
-                return param_value
+                    "Parsed value '%s' of parameter '%s' cannot be "
+                    "converted via Python `ast.literal_eval()` in tool "
+                    "'%s', degenerating to string.", param_value, param_name,
+                    func_name)
+            return param_value
+
+    def _parse_xml_function_call(
+            self, function_call_str: str,
+            tools: Optional[list[ChatCompletionToolsParam]]
+    ) -> Optional[ToolCall]:

        # Extract function name
        end_index = function_call_str.index(">")
        function_name = function_call_str[:end_index]
-        param_config = get_arguments_config(function_name)
+        param_config = self._get_arguments_config(function_name, tools)
        parameters = function_call_str[end_index + 1:]
        param_dict = {}
-        for match in self.tool_call_parameter_regex.findall(parameters):
-            match_text = match[0] if match[0] else match[1]
+        for match_text in self.tool_call_parameter_regex.findall(parameters):
            idx = match_text.index(">")
            param_name = match_text[:idx]
            param_value = str(match_text[idx + 1:])
@@ -231,7 +220,7 @@ class Qwen3CoderToolParser(ToolParser):
            if param_value.endswith("\n"):
                param_value = param_value[:-1]

-            param_dict[param_name] = convert_param_value(
+            param_dict[param_name] = self._convert_param_value(
                param_value, param_name, param_config, function_name)
        return ToolCall(
            type="function",
@@ -284,8 +273,7 @@ class Qwen3CoderToolParser(ToolParser):
                for function_call_str in function_calls
            ]

-            # Populate prev_tool_call_arr for serving layer to set
-            # finish_reason
+            # Populate prev_tool_call_arr for serving layer to set finish_reason
            self.prev_tool_call_arr.clear()  # Clear previous calls
            for tool_call in tool_calls:
                if tool_call:
@@ -298,8 +286,8 @@ class Qwen3CoderToolParser(ToolParser):

            # Extract content before tool calls
            content_index = model_output.find(self.tool_call_start_token)
-            content_index = (content_index if content_index >= 0 else
-                             model_output.find(self.tool_call_prefix))
+            idx = model_output.find(self.tool_call_prefix)
+            content_index = content_index if content_index >= 0 else idx
            content = model_output[:content_index]  # .rstrip()

            return ExtractedToolCallInformation(
@@ -324,13 +312,16 @@ class Qwen3CoderToolParser(ToolParser):
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> Union[DeltaMessage, None]:
-        # If no delta text, return None unless it's an EOS token after tool
-        # calls
+        # Store request for type conversion
+        if not previous_text:
+            self._reset_streaming_state()
+            self.streaming_request = request
+
+        # If no delta text, return None unless it's an EOS token after tools
        if not delta_text:
            # Check if this is an EOS token after all tool calls are complete
-            # We check for tool calls in the text even if is_tool_call_started
-            # is False because it might have been reset after processing all
-            # tools
+            # Check for tool calls in text even if is_tool_call_started
+            # is False (might have been reset after processing all tools)
            if (delta_token_ids
                    and self.tool_call_end_token_id not in delta_token_ids):
                # Count complete tool calls
@@ -339,24 +330,19 @@ class Qwen3CoderToolParser(ToolParser):

                # If we have completed tool calls and populated
                # prev_tool_call_arr
-                if (complete_calls > 0 and len(self.prev_tool_call_arr) > 0):
+                if complete_calls > 0 and len(self.prev_tool_call_arr) > 0:
                    # Check if all tool calls are closed
-                    open_calls = (
-                        current_text.count(self.tool_call_start_token) -
-                        current_text.count(self.tool_call_end_token))
+                    open_calls = current_text.count(
+                        self.tool_call_start_token) - current_text.count(
+                            self.tool_call_end_token)
                    if open_calls == 0:
-                        # Return empty delta message to allow finish_reason
-                        # processing
+                        # Return empty delta for finish_reason processing
                        return DeltaMessage(content="")
                elif not self.is_tool_call_started and current_text:
                    # This is a regular content response that's now complete
                    return DeltaMessage(content="")
            return None

-        # Check if this is the first call (reset state if needed)
-        if not previous_text:
-            self._reset_streaming_state()
-
        # Update accumulated text
        self.accumulated_text = current_text

@@ -371,11 +357,11 @@ class Qwen3CoderToolParser(ToolParser):
                self.param_count = 0
                self.json_started = False
                self.json_closed = False
+                self.accumulated_params = {}

                # Check if there are more tool calls
-                tool_starts_count = current_text.count(
-                    self.tool_call_start_token)
-                if self.current_tool_index >= tool_starts_count:
+                tool_starts = current_text.count(self.tool_call_start_token)
+                if self.current_tool_index >= tool_starts:
                    # No more tool calls
                    self.is_tool_call_started = False
                # Continue processing next tool
@@ -412,20 +398,20 @@ class Qwen3CoderToolParser(ToolParser):

        # We're in a tool call, find the current tool call portion
        # Need to find the correct tool call based on current_tool_index
-        tool_starts: list[int] = []
+        tool_start_positions: list[int] = []
        idx = 0
        while True:
            idx = current_text.find(self.tool_call_start_token, idx)
            if idx == -1:
                break
-            tool_starts.append(idx)
+            tool_start_positions.append(idx)
            idx += len(self.tool_call_start_token)

-        if self.current_tool_index >= len(tool_starts):
+        if self.current_tool_index >= len(tool_start_positions):
            # No more tool calls to process yet
            return None

-        tool_start_idx = tool_starts[self.current_tool_index]
+        tool_start_idx = tool_start_positions[self.current_tool_index]
        # Find where this tool call ends (or current position if not ended yet)
        tool_end_idx = current_text.find(self.tool_call_end_token,
                                         tool_start_idx)
@@ -438,19 +424,19 @@ class Qwen3CoderToolParser(ToolParser):
        # Looking for function header
        if not self.header_sent:
            if self.tool_call_prefix in tool_text:
-                func_start = (tool_text.find(self.tool_call_prefix) +
-                              len(self.tool_call_prefix))
+                func_start = tool_text.find(self.tool_call_prefix) + len(
+                    self.tool_call_prefix)
                func_end = tool_text.find(">", func_start)

                if func_end != -1:
                    # Found complete function name
                    self.current_function_name = tool_text[func_start:func_end]
-                    self.current_tool_string_id = self._generate_tool_call_id()
+                    self.current_tool_id = self._generate_tool_call_id()
                    self.header_sent = True
                    self.in_function = True

-                    # IMPORTANT: Add to prev_tool_call_arr immediately when we
-                    # detect a tool call. This ensures
+                    # IMPORTANT: Add to prev_tool_call_arr immediately when
+                    # we detect a tool call. This ensures
                    # finish_reason="tool_calls" even if parsing isn't complete
                    already_added = any(
                        tool.get("name") == self.current_function_name
@@ -466,7 +452,7 @@ class Qwen3CoderToolParser(ToolParser):
                    return DeltaMessage(tool_calls=[
                        DeltaToolCall(
                            index=self.current_tool_index,
-                            id=self.current_tool_string_id,
+                            id=self.current_tool_id,
                            function=DeltaFunctionCall(
                                name=self.current_function_name, arguments=""),
                            type="function",
@@ -496,10 +482,11 @@ class Qwen3CoderToolParser(ToolParser):
                # Close JSON
                self.json_closed = True

-                # Extract the complete tool call to update prev_tool_call_arr
-                # with final arguments. Find the function content
-                func_start = (tool_text.find(self.tool_call_prefix) +
-                              len(self.tool_call_prefix))
+                # Extract complete tool call to update
+                # prev_tool_call_arr with final arguments
+                # Find the function content
+                func_start = tool_text.find(self.tool_call_prefix) + len(
+                    self.tool_call_prefix)
                func_content_end = tool_text.find(self.function_end_token,
                                                  func_start)
                if func_content_end != -1:
@@ -507,15 +494,17 @@ class Qwen3CoderToolParser(ToolParser):
                    # Parse to get the complete arguments
                    try:
                        parsed_tool = self._parse_xml_function_call(
-                            func_content, request.tools if request else None)
+                            func_content, self.streaming_request.tools
+                            if self.streaming_request else None)
                        if parsed_tool:
-                            # Update existing entry in prev_tool_call_arr with
-                            # complete arguments
+                            # Update existing entry in
+                            # prev_tool_call_arr with complete args
                            for i, tool in enumerate(self.prev_tool_call_arr):
-                                if (tool.get("name") ==
-                                        parsed_tool.function.name):
-                                    self.prev_tool_call_arr[i]["arguments"] = (
-                                        parsed_tool.function.arguments)
+                                if tool.get(
+                                        "name") == parsed_tool.function.name:
+                                    args = parsed_tool.function.arguments
+                                    self.prev_tool_call_arr[i][
+                                        "arguments"] = args
                                    break
                    except Exception:
                        pass  # Ignore parsing errors during streaming
@@ -530,73 +519,110 @@ class Qwen3CoderToolParser(ToolParser):
                # Reset state for next tool
                self.in_function = False
                self.json_closed = True
+                self.accumulated_params = {}

                return result

            # Look for parameters
-            # Count how many complete parameters we have processed
-            complete_params = tool_text.count(self.parameter_end_token)
+            # Find all parameter starts
+            param_starts = []
+            idx = 0
+            while True:
+                idx = tool_text.find(self.parameter_prefix, idx)
+                if idx == -1:
+                    break
+                param_starts.append(idx)
+                idx += len(self.parameter_prefix)

            # Check if we should start a new parameter
-            if not self.in_param and self.param_count < complete_params:
-                # Find the unprocessed parameter
-                # Count parameter starts
-                param_starts = []
-                idx = 0
-                while True:
-                    idx = tool_text.find(self.parameter_prefix, idx)
-                    if idx == -1:
-                        break
-                    param_starts.append(idx)
-                    idx += len(self.parameter_prefix)
-
-                if len(param_starts) > self.param_count:
-                    # Process the next parameter
-                    param_idx = param_starts[self.param_count]
-                    param_start = param_idx + len(self.parameter_prefix)
-                    remaining = tool_text[param_start:]
-
-                    if ">" in remaining:
-                        # We have the complete parameter name
-                        name_end = remaining.find(">")
-                        self.current_param_name = remaining[:name_end]
-
-                        # Find the parameter value
-                        value_start = param_start + name_end + 1
-                        value_text = tool_text[value_start:]
-                        if value_text.startswith("\n"):
-                            value_text = value_text[1:]
-
-                        # Find where this parameter ends
-                        param_end_idx = value_text.find(
-                            self.parameter_end_token)
-                        if param_end_idx != -1:
-                            # Complete parameter found
-                            param_value = value_text[:param_end_idx]
-                            if param_value.endswith("\n"):
-                                param_value = param_value[:-1]
-
-                            # Build complete JSON fragment for this parameter
-                            if self.param_count == 0:
-                                json_fragment = (
-                                    '"' + self.current_param_name + '": "' +
-                                    json.dumps(param_value)[1:-1] + '"')
+            if (not self.in_param and self.param_count < len(param_starts)
+                    and len(param_starts) > self.param_count):
+                # Process the next parameter
+                param_idx = param_starts[self.param_count]
+                param_start = param_idx + len(self.parameter_prefix)
+                remaining = tool_text[param_start:]
+
+                if ">" in remaining:
+                    # We have the complete parameter name
+                    name_end = remaining.find(">")
+                    self.current_param_name = remaining[:name_end]
+
+                    # Find the parameter value
+                    value_start = param_start + name_end + 1
+                    value_text = tool_text[value_start:]
+                    if value_text.startswith("\n"):
+                        value_text = value_text[1:]
+
+                    # Find where this parameter ends
+                    param_end_idx = value_text.find(self.parameter_end_token)
+                    if param_end_idx == -1:
+                        # No closing tag, look for next parameter or
+                        # function end
+                        next_param_idx = value_text.find(self.parameter_prefix)
+                        func_end_idx = value_text.find(self.function_end_token)
+
+                        if next_param_idx != -1 and (func_end_idx == -1
+                                                     or next_param_idx
+                                                     < func_end_idx):
+                            param_end_idx = next_param_idx
+                        elif func_end_idx != -1:
+                            param_end_idx = func_end_idx
+                        else:
+                            # Neither found, check if tool call is complete
+                            if self.tool_call_end_token in tool_text:
+                                # Tool call is complete, so parameter
+                                # must be complete too. Use all
+                                # remaining text before function end
+                                param_end_idx = len(value_text)
                            else:
-                                json_fragment = (
-                                    ', "' + self.current_param_name + '": "' +
-                                    json.dumps(param_value)[1:-1] + '"')
-
-                            self.param_count += 1
-
-                            return DeltaMessage(tool_calls=[
-                                DeltaToolCall(
-                                    index=self.current_tool_index,
-                                    function=DeltaFunctionCall(
-                                        arguments=json_fragment),
-                                )
-                            ])
-
-            # Continue parameter value
+                                # Still streaming, wait for more content
+                                return None
+
+                    if param_end_idx != -1:
+                        # Complete parameter found
+                        param_value = value_text[:param_end_idx]
+                        if param_value.endswith("\n"):
+                            param_value = param_value[:-1]
+
+                        # Store raw value for later processing
+                        self.accumulated_params[
+                            self.current_param_name] = param_value
+
+                        # Get parameter configuration for type conversion
+                        param_config = self._get_arguments_config(
+                            self.current_function_name or "",
+                            self.streaming_request.tools
+                            if self.streaming_request else None)
+
+                        # Convert param value to appropriate type
+                        converted_value = self._convert_param_value(
+                            param_value, self.current_param_name, param_config,
+                            self.current_function_name or "")
+
+                        # Build JSON fragment based on the converted type
+                        # Use json.dumps to properly serialize the value
+                        serialized_value = json.dumps(converted_value,
+                                                      ensure_ascii=False)
+
+                        if self.param_count == 0:
+                            json_fragment = (f'"{self.current_param_name}": '
+                                             f'{serialized_value}')
+                        else:
+                            json_fragment = (f', "{self.current_param_name}": '
+                                             f'{serialized_value}')
+
+                        self.param_count += 1
+
+                        return DeltaMessage(tool_calls=[
+                            DeltaToolCall(
+                                index=self.current_tool_index,
+                                function=DeltaFunctionCall(
+                                    arguments=json_fragment),
+                            )
+                        ])
+
+            # Continue parameter value - Not used in the current implementation
+            # since we process complete parameters above
            if self.in_param:
                if self.parameter_end_token in delta_text:
                    # End of parameter
@@ -608,25 +634,42 @@ class Qwen3CoderToolParser(ToolParser):
                        gt_idx = value_chunk.find(">")
                        value_chunk = value_chunk[gt_idx + 1:]

-                    if (not self.current_param_value
-                            and value_chunk.startswith("\n")):
+                    if not self.current_param_value and value_chunk.startswith(
+                            "\n"):
                        value_chunk = value_chunk[1:]

-                    # Calculate incremental JSON
+                    # Store complete value
                    full_value = self.current_param_value + value_chunk
-                    prev_escaped = (json.dumps(self.current_param_value)[1:-1]
-                                    if self.current_param_value else "")
-                    full_escaped = json.dumps(full_value)[1:-1]
-                    delta_escaped = full_escaped[len(prev_escaped):]
-
+                    self.accumulated_params[
+                        self.current_param_name] = full_value
+
+                    # Get parameter configuration for type conversion
+                    param_config = self._get_arguments_config(
+                        self.current_function_name or "",
+                        self.streaming_request.tools
+                        if self.streaming_request else None)
+
+                    # Convert the parameter value to the appropriate type
+                    converted_value = self._convert_param_value(
+                        full_value, self.current_param_name or "",
+                        param_config, self.current_function_name or "")
+
+                    # Serialize the converted value
+                    serialized_value = json.dumps(converted_value,
+                                                  ensure_ascii=False)
+
+                    # Since we've been streaming the quoted version,
+                    # we need to close it properly
+                    # This is complex - for now just complete the value
                    self.in_param = False
                    self.current_param_value = ""

+                    # Just close the current parameter string
                    return DeltaMessage(tool_calls=[
                        DeltaToolCall(
                            index=self.current_tool_index,
                            function=DeltaFunctionCall(
-                                arguments=delta_escaped + '"'),
+                                arguments='"'),  # Close the string quote
                        )
                    ])
                else:
@@ -638,18 +681,18 @@ class Qwen3CoderToolParser(ToolParser):
                        gt_idx = value_chunk.find(">")
                        value_chunk = value_chunk[gt_idx + 1:]

-                    if (not self.current_param_value
-                            and value_chunk.startswith("\n")):
+                    if not self.current_param_value and value_chunk.startswith(
+                            "\n"):
                        value_chunk = value_chunk[1:]

                    if value_chunk:
                        # Stream the escaped delta
-                        prev_escaped = (json.dumps(
-                            self.current_param_value)[1:-1]
-                                        if self.current_param_value else "")
+                        prev_escaped = json.dumps(
+                            self.current_param_value, ensure_ascii=False
+                        )[1:-1] if self.current_param_value else ""
                        self.current_param_value += value_chunk
-                        full_escaped = json.dumps(
-                            self.current_param_value)[1:-1]
+                        full_escaped = json.dumps(self.current_param_value,
+                                                  ensure_ascii=False)[1:-1]
                        delta_escaped = full_escaped[len(prev_escaped):]

                        if delta_escaped:
@@ -661,4 +704,4 @@ class Qwen3CoderToolParser(ToolParser):
                                )
                            ])

-        return None
+        return None
\ No newline at end of file
--- a/vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Adapted from qwen3coder xml parser, All rights reserved.
+# ruff: noqa: E501
+
+import ast
+import json
+import uuid
+from collections.abc import Sequence
+from typing import Any, Optional, Union
+
+import regex as re
+
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              ChatCompletionToolsParam,
+                                              DeltaFunctionCall, DeltaMessage,
+                                              DeltaToolCall,
+                                              ExtractedToolCallInformation,
+                                              FunctionCall, ToolCall)
+from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
+    ToolParser, ToolParserManager)
+from vllm.logger import init_logger
+from vllm.transformers_utils.tokenizer import AnyTokenizer
+
+logger = init_logger(__name__)
+
+
+@ToolParserManager.register_module("seed_oss")
+class SeedOssToolParser(ToolParser):
+    TOOL_CALL_START = "<seed:tool_call>"
+    TOOL_CALL_END = "</seed:tool_call>"
+
+    def __init__(self, tokenizer: AnyTokenizer):
+        """Set up sentinel strings, token ids, regexes and streaming state.
+
+        Args:
+            tokenizer: Tokenizer forwarded to the ``ToolParser`` base class.
+                Its vocabulary (read through ``self.vocab`` - presumably
+                populated by the base class) must contain both tool-call tags.
+
+        Raises:
+            RuntimeError: If ``<seed:tool_call>`` or ``</seed:tool_call>`` is
+                missing from the vocabulary.
+        """
+        super().__init__(tokenizer)
+
+        self.prev_tool_call_arr: list[dict] = []
+
+        self.tool_call_start_token: str = self.TOOL_CALL_START
+        self.tool_call_end_token: str = self.TOOL_CALL_END
+        # Sentinel tokens for streaming mode
+        self.tool_call_prefix: str = "<function="
+        self.function_end_token: str = "</function>"
+        self.parameter_prefix: str = "<parameter="
+        self.parameter_end_token: str = "</parameter>"
+        self.think_start_token: str = "<seed:think>"
+        self.think_end_token: str = "</seed:think>"
+        self.is_tool_call_started: bool = False
+        self.is_thinking_end: bool = False
+        self.failed_count: int = 0
+
+        # --- streaming state ---
+        # Initialised exactly once; none of the assignments above depend on it.
+        self._reset_streaming_state()
+
+        self.tool_call_start_token_id = self.vocab.get(
+            self.tool_call_start_token)
+        self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token)
+        # May be None when the vocabulary has no dedicated think-end token;
+        # the streaming path also checks for the tag in the decoded text.
+        self.think_end_token_id = self.vocab.get(self.think_end_token)
+
+        if (self.tool_call_start_token_id is None
+                or self.tool_call_end_token_id is None):
+            raise RuntimeError(
+                "Seed_Oss XML parser: tokenizer did not include "
+                "<seed:tool_call> or its closing tag.")
+
+        tool_start_re = re.escape(self.tool_call_start_token)
+        tool_end_re = re.escape(self.tool_call_end_token)
+
+        # Only fully closed <seed:tool_call>...</seed:tool_call> blocks.
+        self.tool_call_complete_regex = re.compile(
+            rf"{tool_start_re}(.*?){tool_end_re}", re.DOTALL)
+        # Closed blocks (group 1) or a trailing unclosed block (group 2).
+        self.tool_call_regex = re.compile(
+            rf"{tool_start_re}(.*?){tool_end_re}|{tool_start_re}(.*?)$",
+            re.DOTALL)
+
+        # Same closed/unclosed pairing for function and parameter sections.
+        self.tool_call_function_regex = re.compile(
+            r"<function=(.*?)</function>|<function=(.*)$", re.DOTALL)
+        self.tool_call_parameter_regex = re.compile(
+            r"<parameter=(.*?)</parameter>|<parameter=(.*?)$", re.DOTALL)
+
+        logger.info("vLLM Seed-Oss XML tool parser loaded (%s).",
+                    self.__class__.__name__)
+
+    def _generate_tool_call_id(self) -> str:
+        """Return a new ``call_``-prefixed id made of 24 random hex digits."""
+        random_hex = uuid.uuid4().hex
+        return "call_" + random_hex[:24]
+
+    def _reset_streaming_state(self):
+        """Reset all per-stream parsing state to its initial values.
+
+        Called from ``__init__`` and again by the streaming extractor when a
+        new stream begins (``previous_text`` is empty). ``prev_tool_call_arr``
+        is intentionally left untouched here.
+        """
+        # Position within the sequence of tool calls.
+        self.current_tool_index = 0
+        self.is_tool_call_started = False
+        # Thinking must be detected afresh for every stream; without this a
+        # reused parser would treat a new stream's reasoning as finished.
+        self.is_thinking_end = False
+        self.header_sent = False
+        self.current_tool_id = -1
+        self.current_function_name = None
+        # State of the parameter currently being read.
+        self.current_param_name = None
+        self.current_param_value = ""
+        self.param_count = 0
+        self.in_param = False
+        self.in_function = False
+        self.accumulated_text = ""
+        # Whether the opening / closing brace of the arguments JSON was sent.
+        self.json_started = False
+        self.json_closed = False
+
+    def _parse_xml_function_call(
+            self, function_call_str: str,
+            tools: Optional[list[ChatCompletionToolsParam]]
+    ) -> Optional[ToolCall]:
+        """Parse the body of one ``<function=...>`` section into a ``ToolCall``.
+
+        Args:
+            function_call_str: Text that followed ``<function=``: the function
+                name, a ``>``, then zero or more ``<parameter=NAME>VALUE``
+                sections (closed by ``</parameter>`` or by the end of text).
+            tools: Tool definitions of the request, used to look up the
+                declared type of each parameter. With ``None`` every value
+                except the literal ``null`` is kept as a string.
+
+        Returns:
+            A ``ToolCall`` whose ``arguments`` is the JSON serialisation of
+            the converted parameters.
+
+        Raises:
+            ValueError: If the function header or a parameter header lacks its
+                closing ``>`` (raised by ``str.index``).
+        """
+
+        def get_arguments_config(func_name: str) -> dict:
+            """Return the parameter schema of ``func_name`` ({} if unknown)."""
+            if tools is None:
+                return {}
+            for config in tools:
+                # Skip entries that do not look like a function tool.
+                if not hasattr(config, "type") or not (
+                        hasattr(config, "function")
+                        and hasattr(config.function, "name")):
+                    continue
+                if (config.type == "function"
+                        and config.function.name == func_name):
+                    if not hasattr(config.function, "parameters"):
+                        return {}
+                    params = config.function.parameters
+                    # Prefer the JSON-schema "properties" mapping when present.
+                    if isinstance(params, dict) and "properties" in params:
+                        return params["properties"]
+                    elif isinstance(params, dict):
+                        return params
+                    else:
+                        return {}
+            logger.warning("Tool '%s' is not defined in the tools list.",
+                           func_name)
+            return {}
+
+        def convert_param_value(param_value: str, param_name: str,
+                                param_config: dict, func_name: str) -> Any:
+            """Convert a raw string value according to its declared type.
+
+            Any conversion failure is logged and degrades to the raw string
+            (or to ``False`` for booleans) instead of raising.
+            """
+            # Handle null value for any type
+            if param_value.lower() == "null":
+                return None
+
+            if param_name not in param_config:
+                if param_config != {}:
+                    logger.warning(
+                        "Parsed parameter '%s' is not defined in "
+                        "the tool parameters for tool '%s', "
+                        "directly returning the string value.", param_name,
+                        func_name)
+                return param_value
+
+            if (isinstance(param_config[param_name], dict)
+                    and "type" in param_config[param_name]):
+                param_type = str(
+                    param_config[param_name]["type"]).strip().lower()
+            else:
+                param_type = "string"
+            if param_type in [
+                    "string", "str", "text", "varchar", "char", "enum"
+            ]:
+                return param_value
+            elif (param_type.startswith("int") or param_type.startswith("uint")
+                  or param_type.startswith("long")
+                  or param_type.startswith("short")
+                  or param_type.startswith("unsigned")):
+                try:
+                    param_value = int(param_value)  # type: ignore
+                except (ValueError, TypeError):
+                    logger.warning(
+                        "Parsed value '%s' of parameter '%s' is not an integer in tool "
+                        "'%s', degenerating to string.", param_value,
+                        param_name, func_name)
+                return param_value
+            elif param_type.startswith("num") or param_type.startswith(
+                    "float"):
+                try:
+                    float_param_value = float(param_value)
+                    # Whole numbers are emitted as int so "3.0" becomes 3.
+                    param_value = float_param_value if float_param_value - int(
+                        float_param_value) != 0 else int(
+                            float_param_value)  # type: ignore
+                except (ValueError, TypeError, OverflowError):
+                    # OverflowError: int() of an infinite float ("inf",
+                    # "1e999"); keep the raw string like any other bad number.
+                    logger.warning(
+                        "Parsed value '%s' of parameter '%s' is not a float in tool "
+                        "'%s', degenerating to string.", param_value,
+                        param_name, func_name)
+                return param_value
+            elif param_type in ["boolean", "bool", "binary"]:
+                param_value = param_value.lower()
+                if param_value not in ["true", "false"]:
+                    logger.warning(
+                        "Parsed value '%s' of parameter '%s' is not a boolean "
+                        "(`true` or `false`) in tool '%s', degenerating to false.",
+                        param_value, param_name, func_name)
+                return param_value == "true"
+            else:
+                if param_type == "object" or param_type.startswith("dict"):
+                    try:
+                        param_value = json.loads(param_value)
+                        return param_value
+                    except (ValueError, TypeError, json.JSONDecodeError):
+                        logger.warning(
+                            "Parsed value '%s' of parameter '%s' is not a valid JSON "
+                            "object in tool '%s', will try other methods to parse it.",
+                            param_value, param_name, func_name)
+                try:
+                    # literal_eval only evaluates Python literals, never code.
+                    param_value = ast.literal_eval(param_value)
+                except (ValueError, SyntaxError, TypeError, MemoryError,
+                        RecursionError):
+                    # All exception types literal_eval is documented to raise
+                    # on malformed input (e.g. TypeError for "{[1]: 2}").
+                    logger.warning(
+                        "Parsed value '%s' of parameter '%s' cannot be converted via "
+                        "Python `ast.literal_eval()` in tool '%s', degenerating to string.",
+                        param_value, param_name, func_name)
+                return param_value
+
+        # Extract function name
+        end_index = function_call_str.index(">")
+        function_name = function_call_str[:end_index]
+        param_config = get_arguments_config(function_name)
+        parameters = function_call_str[end_index + 1:]
+        param_dict = {}
+        for match in self.tool_call_parameter_regex.findall(parameters):
+            # Group 0 is a closed parameter, group 1 a trailing unclosed one.
+            match_text = match[0] if match[0] else match[1]
+            idx = match_text.index(">")
+            param_name = match_text[:idx]
+            param_value = str(match_text[idx + 1:])
+            # Remove prefix and trailing \n
+            if param_value.startswith("\n"):
+                param_value = param_value[1:]
+            if param_value.endswith("\n"):
+                param_value = param_value[:-1]
+
+            param_dict[param_name] = convert_param_value(
+                param_value, param_name, param_config, function_name)
+        return ToolCall(
+            type="function",
+            function=FunctionCall(name=function_name,
+                                  arguments=json.dumps(param_dict,
+                                                       ensure_ascii=False)),
+        )
+
+    def _get_function_calls(self, model_output: str) -> list[str]:
+        """Return the raw body of every ``<function=...>`` section found.
+
+        Each returned string is the text that followed ``<function=``, up to
+        ``</function>`` or, for an unclosed section, the end of the text.
+        """
+        # Every findall hit is a (closed, unclosed) pair of capture groups.
+        tool_call_bodies = [
+            closed or unclosed
+            for closed, unclosed in self.tool_call_regex.findall(model_output)
+        ]
+
+        # Without any tool_call tag, scan the whole output for functions.
+        if not tool_call_bodies:
+            tool_call_bodies = [model_output]
+
+        function_bodies: list[str] = []
+        for body in tool_call_bodies:
+            for closed, unclosed in self.tool_call_function_regex.findall(
+                    body):
+                function_bodies.append(closed or unclosed)
+        return function_bodies
+
+    def extract_tool_calls(
+        self,
+        model_output: str,
+        request: ChatCompletionRequest,
+    ) -> ExtractedToolCallInformation:
+        """Extract every tool call from a complete (non-streamed) output.
+
+        Args:
+            model_output: Full text produced by the model.
+            request: The originating request; ``request.tools`` supplies the
+                parameter schemas used for type conversion.
+
+        Returns:
+            The parsed tool calls plus the text preceding them (including a
+            complete thinking section, if any). When nothing can be parsed,
+            or parsing fails, the whole output is returned as plain content
+            with ``tools_called=False``.
+        """
+
+        def plain_content() -> ExtractedToolCallInformation:
+            # Result used whenever no tool call can be reported.
+            return ExtractedToolCallInformation(tools_called=False,
+                                                tool_calls=[],
+                                                content=model_output)
+
+        # Cheap bail-out: no function prefix means nothing to parse.
+        if self.tool_call_prefix not in model_output:
+            return plain_content()
+
+        # Tool calls are only searched after a complete thinking section.
+        thinking_content = ""
+        result_content = model_output
+        if (self.think_start_token in model_output
+                and self.think_end_token in model_output):
+            before, closing_tag, after = model_output.partition(
+                self.think_end_token)
+            thinking_content = before + closing_tag
+            result_content = after
+
+        try:
+            function_calls = self._get_function_calls(result_content)
+            if not function_calls:
+                return plain_content()
+
+            tool_calls = [
+                self._parse_xml_function_call(body, request.tools)
+                for body in function_calls
+            ]
+
+            # The serving layer reads prev_tool_call_arr to set finish_reason.
+            self.prev_tool_call_arr.clear()
+            self.prev_tool_call_arr.extend({
+                "name": call.function.name,
+                "arguments": call.function.arguments,
+            } for call in tool_calls if call)
+
+            # Content is everything before the first tool call; fall back to
+            # the bare function prefix when the wrapper tag is absent.
+            cut = result_content.find(self.tool_call_start_token)
+            if cut < 0:
+                cut = result_content.find(self.tool_call_prefix)
+            content = thinking_content + result_content[:cut]
+
+            return ExtractedToolCallInformation(
+                tools_called=bool(tool_calls),
+                tool_calls=tool_calls,
+                content=content or None,
+            )
+
+        except Exception:
+            logger.exception("Error in extracting tool call from response.")
+            return plain_content()
+
+    def extract_tool_calls_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+        request: ChatCompletionRequest,
+    ) -> Union[DeltaMessage, None]:
+        """Turn one streamed chunk into a content or tool-call delta.
+
+        The method is stateful: it is called once per chunk and emits at most
+        one delta per call. For each tool call it emits, in order, a header
+        (id, name, empty arguments), the opening ``{``, one JSON fragment per
+        completed parameter (values are always streamed as JSON strings) and
+        the closing ``}``.
+
+        Args:
+            previous_text: Text generated before this chunk; empty on the
+                first chunk, which triggers a state reset.
+            current_text: All text generated so far, including this chunk.
+            delta_text: Text of this chunk.
+            previous_token_ids: Unused.
+            current_token_ids: Unused.
+            delta_token_ids: Token ids of this chunk.
+            request: The originating request; ``request.tools`` is used when
+                the final arguments are re-parsed for ``prev_tool_call_arr``.
+
+        Returns:
+            A ``DeltaMessage`` with content or a tool-call fragment, or
+            ``None`` when this chunk produces nothing to send.
+
+        NOTE(review): the ``</function>`` check runs before pending parameters
+        are emitted, so a chunk that completes a parameter and the function
+        at once appears to drop that parameter - confirm with multi-token
+        chunks.
+        NOTE(review): ``prev_tool_call_arr`` entries are matched by function
+        name, so two calls to the same function share one entry - confirm
+        this is what the serving layer expects.
+        """
+        # If no delta text, return None unless
+        # it's an EOS token after tool calls
+        if not delta_text:
+            # Check if this is an EOS token after all tool calls are complete
+            # We check for tool calls in the text even if is_tool_call_started
+            # is False because it might have been reset after processing all tools
+            if (delta_token_ids
+                    and self.tool_call_end_token_id not in delta_token_ids):
+                # Count complete tool calls
+                complete_calls = len(
+                    self.tool_call_complete_regex.findall(current_text))
+
+                # If we have completed tool calls and populated prev_tool_call_arr
+                if complete_calls > 0 and len(self.prev_tool_call_arr) > 0:
+                    # Check if all tool calls are closed
+                    open_calls = current_text.count(
+                        self.tool_call_start_token) - current_text.count(
+                            self.tool_call_end_token)
+                    if open_calls == 0:
+                        # Return empty delta message to allow finish_reason processing
+                        return DeltaMessage(content="")
+                elif not self.is_tool_call_started and current_text:
+                    # This is a regular content response that's now complete
+                    return DeltaMessage(content="")
+            return None
+
+        # Check if this is the first call (reset state if needed)
+        if not previous_text:
+            self._reset_streaming_state()
+
+        # Update accumulated text
+        self.accumulated_text = current_text
+
+        # Check if we need to advance to next tool
+        if self.json_closed and not self.in_function:
+            # Check if this tool call has ended
+            tool_ends = current_text.count(self.tool_call_end_token)
+            if tool_ends > self.current_tool_index:
+                # This tool has ended, advance to next
+                self.current_tool_index += 1
+                self.header_sent = False
+                self.param_count = 0
+                self.json_started = False
+                self.json_closed = False
+
+                # Check if there are more tool calls
+                if self.current_tool_index >= current_text.count(
+                        self.tool_call_start_token):
+                    # No more tool calls
+                    self.is_tool_call_started = False
+                # Continue processing next tool
+                return None
+
+        # Check if end thinking
+        if (not self.is_thinking_end
+                and (self.think_end_token_id in delta_token_ids
+                     or self.think_end_token in delta_text)):
+            self.is_thinking_end = True
+
+        # If thinking hasn't ended yet, don't process any tool calls
+        if not self.is_thinking_end:
+            return DeltaMessage(content=delta_text)
+
+        # Handle normal content before tool calls
+        if not self.is_tool_call_started:
+            # Check if tool call is starting
+            if (self.tool_call_start_token_id in delta_token_ids
+                    or self.tool_call_start_token in delta_text):
+                self.is_tool_call_started = True
+                # Return any content before the tool call
+                if self.tool_call_start_token in delta_text:
+                    content_before = delta_text[:delta_text.index(
+                        self.tool_call_start_token)]
+                    if content_before:
+                        return DeltaMessage(content=content_before)
+                return None
+            else:
+                # Check if we're between tool calls - skip whitespace
+                if (current_text.rstrip().endswith(self.tool_call_end_token)
+                        and delta_text.strip() == ""):
+                    # We just ended a tool call, skip whitespace
+                    return None
+                # Normal content, no tool call
+                return DeltaMessage(content=delta_text)
+
+        # Check if we're between tool calls (waiting for next one)
+        # Count tool calls we've seen vs processed
+        tool_starts_count = current_text.count(self.tool_call_start_token)
+        if self.current_tool_index >= tool_starts_count:
+            # We're past all tool calls, shouldn't be here
+            return None
+
+        # We're in a tool call, find the current tool call portion
+        # Need to find the correct tool call based on current_tool_index
+        # Only process tool calls after think_end_token
+        think_end_index = current_text.find(self.think_end_token) + len(
+            self.think_end_token
+        ) if self.think_end_token in current_text else 0
+        tool_starts: list[int] = []
+        idx = think_end_index
+        while True:
+            idx = current_text.find(self.tool_call_start_token, idx)
+            if idx == -1:
+                break
+            tool_starts.append(idx)
+            idx += len(self.tool_call_start_token)
+
+        if self.current_tool_index >= len(tool_starts):
+            # No more tool calls to process yet
+            return None
+
+        tool_start_idx = tool_starts[self.current_tool_index]
+        # Find where this tool call ends (or current position if not ended yet)
+        tool_end_idx = current_text.find(self.tool_call_end_token,
+                                         tool_start_idx)
+        if tool_end_idx == -1:
+            tool_text = current_text[tool_start_idx:]
+        else:
+            tool_text = current_text[tool_start_idx:tool_end_idx +
+                                     len(self.tool_call_end_token)]
+
+        # Looking for function header
+        if not self.header_sent:
+            if self.tool_call_prefix in tool_text:
+                func_start = tool_text.find(self.tool_call_prefix) + len(
+                    self.tool_call_prefix)
+                func_end = tool_text.find(">", func_start)
+
+                if func_end != -1:
+                    # Found complete function name
+                    self.current_function_name = tool_text[func_start:func_end]
+                    self.current_tool_id = self._generate_tool_call_id(
+                    )  # type: ignore
+                    self.header_sent = True
+                    self.in_function = True
+
+                    # IMPORTANT: Add to prev_tool_call_arr immediately when we detect a tool call
+                    # This ensures finish_reason="tool_calls" even if parsing isn't complete
+                    already_added = any(
+                        tool.get("name") == self.current_function_name
+                        for tool in self.prev_tool_call_arr)
+                    if not already_added:
+                        self.prev_tool_call_arr.append({
+                            "name": self.current_function_name,
+                            "arguments":
+                            "{}",  # Placeholder, will be updated later
+                        })
+
+                    # Send header with function info
+                    return DeltaMessage(tool_calls=[
+                        DeltaToolCall(
+                            index=self.current_tool_index,
+                            id=self.current_tool_id,
+                            function=DeltaFunctionCall(
+                                name=self.current_function_name, arguments=""),
+                            type="function",
+                        )
+                    ])
+            return None
+
+        # We've sent header, now handle function body
+        if self.in_function:
+
+            def arguments_delta(fragment: str) -> DeltaMessage:
+                # Build an arguments delta, prepending the opening brace if it
+                # has not been streamed yet so the JSON is always well formed.
+                if not self.json_started:
+                    self.json_started = True
+                    fragment = "{" + fragment
+                return DeltaMessage(tool_calls=[
+                    DeltaToolCall(
+                        index=self.current_tool_index,
+                        function=DeltaFunctionCall(arguments=fragment),
+                    )
+                ])
+
+            # Send opening brace on its own if not sent yet. When this chunk
+            # already carries a parameter prefix, the brace is instead
+            # prepended to the next fragment by arguments_delta().
+            if (not self.json_started
+                    and self.parameter_prefix not in delta_text):
+                return arguments_delta("")
+
+            # Check for function end in accumulated text
+            if not self.json_closed and self.function_end_token in tool_text:
+                # Close JSON
+                self.json_closed = True
+
+                # Extract the complete tool call to update prev_tool_call_arr with final arguments
+                # Find the function content
+                func_start = tool_text.find(self.tool_call_prefix) + len(
+                    self.tool_call_prefix)
+                func_content_end = tool_text.find(self.function_end_token,
+                                                  func_start)
+                if func_content_end != -1:
+                    func_content = tool_text[func_start:func_content_end]
+                    # Parse to get the complete arguments
+                    try:
+                        parsed_tool = self._parse_xml_function_call(
+                            func_content, request.tools if request else None)
+                        if parsed_tool:
+                            # Update existing entry in prev_tool_call_arr with complete arguments
+                            for i, tool in enumerate(self.prev_tool_call_arr):
+                                if tool.get(
+                                        "name") == parsed_tool.function.name:
+                                    self.prev_tool_call_arr[i]["arguments"] = (
+                                        parsed_tool.function.arguments)
+                                    break
+                    except Exception:
+                        # Best effort: the placeholder arguments stay in place.
+                        logger.warning(
+                            "Failed to parse tool arguments during streaming.",
+                            exc_info=True)
+
+                result = arguments_delta("}")
+
+                # Reset state for next tool
+                self.in_function = False
+
+                return result
+
+            # Look for parameters
+            # Count how many complete parameters we have processed
+            complete_params = tool_text.count(self.parameter_end_token)
+
+            # Check if we should start a new parameter
+            if not self.in_param and self.param_count < complete_params:
+                # Find the unprocessed parameter
+                # Count parameter starts
+                param_starts = []
+                idx = 0
+                while True:
+                    idx = tool_text.find(self.parameter_prefix, idx)
+                    if idx == -1:
+                        break
+                    param_starts.append(idx)
+                    idx += len(self.parameter_prefix)
+
+                if len(param_starts) > self.param_count:
+                    # Process the next parameter
+                    param_idx = param_starts[self.param_count]
+                    param_start = param_idx + len(self.parameter_prefix)
+                    remaining = tool_text[param_start:]
+
+                    if ">" in remaining:
+                        # We have the complete parameter name
+                        name_end = remaining.find(">")
+                        self.current_param_name = remaining[:name_end]
+
+                        # Find the parameter value
+                        value_start = param_start + name_end + 1
+                        value_text = tool_text[value_start:]
+                        if value_text.startswith("\n"):
+                            value_text = value_text[1:]
+
+                        # Find where this parameter ends
+                        param_end_idx = value_text.find(
+                            self.parameter_end_token)
+                        if param_end_idx != -1:
+                            # Complete parameter found
+                            param_value = value_text[:param_end_idx]
+                            if param_value.endswith("\n"):
+                                param_value = param_value[:-1]
+
+                            # Build complete JSON fragment for this parameter.
+                            # Both key and value go through json.dumps so that
+                            # quotes/backslashes are escaped; ensure_ascii=False
+                            # matches the non-streaming serialisation.
+                            separator = "" if self.param_count == 0 else ", "
+                            json_fragment = (
+                                separator +
+                                json.dumps(self.current_param_name,
+                                           ensure_ascii=False) + ": " +
+                                json.dumps(param_value, ensure_ascii=False))
+
+                            self.param_count += 1
+
+                            return arguments_delta(json_fragment)
+
+            # Continue parameter value
+            # NOTE(review): nothing in this class sets in_param to True, so
+            # this branch looks unreachable - confirm before relying on it.
+            if self.in_param:
+                if self.parameter_end_token in delta_text:
+                    # End of parameter
+                    end_idx = delta_text.find(self.parameter_end_token)
+                    value_chunk = delta_text[:end_idx]
+
+                    # Skip past > if at start
+                    if not self.current_param_value and ">" in value_chunk:
+                        gt_idx = value_chunk.find(">")
+                        value_chunk = value_chunk[gt_idx + 1:]
+
+                    if not self.current_param_value and value_chunk.startswith(
+                            "\n"):
+                        value_chunk = value_chunk[1:]
+
+                    # Calculate incremental JSON
+                    full_value = self.current_param_value + value_chunk
+                    prev_escaped = (json.dumps(
+                        self.current_param_value, ensure_ascii=False)[1:-1]
+                                    if self.current_param_value else "")
+                    full_escaped = json.dumps(full_value,
+                                              ensure_ascii=False)[1:-1]
+                    delta_escaped = full_escaped[len(prev_escaped):]
+
+                    self.in_param = False
+                    self.current_param_value = ""
+
+                    return arguments_delta(delta_escaped + '"')
+                else:
+                    # Continue accumulating value
+                    value_chunk = delta_text
+
+                    # Handle first chunk after param name
+                    if not self.current_param_value and ">" in value_chunk:
+                        gt_idx = value_chunk.find(">")
+                        value_chunk = value_chunk[gt_idx + 1:]
+
+                    if not self.current_param_value and value_chunk.startswith(
+                            "\n"):
+                        value_chunk = value_chunk[1:]
+
+                    if value_chunk:
+                        # Stream the escaped delta
+                        prev_escaped = (json.dumps(
+                            self.current_param_value,
+                            ensure_ascii=False)[1:-1]
+                                        if self.current_param_value else "")
+                        self.current_param_value += value_chunk
+                        full_escaped = json.dumps(self.current_param_value,
+                                                  ensure_ascii=False)[1:-1]
+                        delta_escaped = full_escaped[len(prev_escaped):]
+
+                        if delta_escaped:
+                            return arguments_delta(delta_escaped)
+
+        return None
--- a/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py
@@ -7,7 +7,7 @@ from typing import Any, Optional, Union

 import regex as re

-from vllm.entrypoints.chat_utils import random_tool_call_id
+from vllm.entrypoints.chat_utils import make_tool_call_id
 from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                              DeltaFunctionCall, DeltaMessage,
                                              DeltaToolCall,
@@ -186,11 +186,31 @@ class xLAMToolParser(ToolParser):
        """
        Extract tool calls for streaming mode.
        """
-        # Simplify detection: if it begins with "[" treat it as a function call
-        is_function_call = (current_text.strip().startswith("["))
-
-        # If not a function call, return normal content
-        if not is_function_call:
+        # First, check for a definitive start of a tool call block.
+        # This prevents premature parsing of incomplete output.
+        stripped_text = current_text.strip()
+        preprocessed_content, preprocessed_tool_calls = (
+            self.preprocess_model_output(current_text))
+
+        # For JSON code blocks, we need to detect them earlier, even if incomplete
+        has_potential_json_block = ("```json" in current_text
+                                    or "```\n[" in current_text
+                                    or "[TOOL_CALLS]" in current_text
+                                    or "<tool_call>" in current_text)
+
+        is_tool_call_block = (
+            stripped_text.startswith("[")
+            or stripped_text.startswith("<tool_call>")
+            or stripped_text.startswith("[TOOL_CALLS]") or
+            # Check if we have thinking tags with JSON-like content following
+            ("</think>[" in current_text) or
+            # Check if the text contains a JSON array after preprocessing
+            preprocessed_tool_calls is not None or
+            # For JSON code blocks, detect early if we see enough structure
+            (has_potential_json_block and '"name"' in current_text
+             and '"arguments"' in current_text))
+
+        if not is_tool_call_block:
            return DeltaMessage(content=delta_text)

        try:
@@ -204,7 +224,10 @@ class xLAMToolParser(ToolParser):

            # Try parsing as JSON to check for complete tool calls
            try:
-                parsed_tools = json.loads(current_text)
+                # Use preprocessed tool calls if available
+                tool_calls_text = (preprocessed_tool_calls if
+                                   preprocessed_tool_calls else current_text)
+                parsed_tools = json.loads(tool_calls_text)
                if isinstance(parsed_tools, list):
                    # Update our tool array for next time
                    self.prev_tool_call_arr = parsed_tools
@@ -226,7 +249,7 @@ class xLAMToolParser(ToolParser):
                        function_name = name_match.group(1)

                        # The test expects us to send just the name first
-                        tool_id = random_tool_call_id()
+                        tool_id = make_tool_call_id()
                        delta = DeltaMessage(tool_calls=[
                            DeltaToolCall(
                                index=0,
@@ -257,13 +280,40 @@ class xLAMToolParser(ToolParser):
                        return delta

            # Use regex to identify tool calls in the output
+            # Use preprocessed tool calls text for better parsing, but also try to extract from incomplete JSON blocks
+            search_text = (preprocessed_tool_calls
+                           if preprocessed_tool_calls else current_text)
+
+            # For JSON code blocks that aren't complete yet, try to extract the JSON content
+            if not preprocessed_tool_calls and has_potential_json_block:
+                # Try to extract the JSON array from within the code block
+                json_match = re.search(r"```(?:json)?\s*([\s\S]*?)(?:```|$)",
+                                       current_text)
+                if json_match:
+                    potential_json = json_match.group(1).strip()
+                    # Use this as search text even if it's incomplete
+                    if potential_json.startswith("[") and (
+                            '"name"' in potential_json
+                            and '"arguments"' in potential_json):
+                        search_text = potential_json
+
+            # Try to find complete tool names first
            name_pattern = r'"name"\s*:\s*"([^"]+)"'
-            name_matches = list(re.finditer(name_pattern, current_text))
+            name_matches = list(re.finditer(name_pattern, search_text))
            tool_count = len(name_matches)

-            # If no tools found yet, return
+            # If no complete tool names found, check for partial tool names
            if tool_count == 0:
-                return None
+                # Check if we're in the middle of parsing a tool name
+                partial_name_pattern = r'"name"\s*:\s*"([^"]*)'
+                partial_matches = list(
+                    re.finditer(partial_name_pattern, search_text))
+                if partial_matches:
+                    # We have a partial tool name - not ready to emit yet
+                    return None
+                else:
+                    # No tools found at all
+                    return None

            # Ensure our state arrays are large enough
            while len(self.streaming_state["sent_tools"]) < tool_count:
@@ -332,7 +382,7 @@ class xLAMToolParser(ToolParser):
                # First, check for the empty arguments case: "arguments": {}
                empty_args_pattern = (
                    r'"name"\s*:\s*"[^"]+"\s*,\s*"arguments"\s*:\s*\{\s*\}')
-                empty_args_match = re.search(empty_args_pattern, current_text)
+                empty_args_match = re.search(empty_args_pattern, search_text)

                # Check if this tool has empty arguments
                if empty_args_match and empty_args_match.start() > 0:
@@ -376,7 +426,7 @@ class xLAMToolParser(ToolParser):

                # Extract arguments for current tool using regex for non-empty arguments
                args_pattern = r'"name"\s*:\s*"[^"]+"\s*,\s*"arguments"\s*:\s*(\{(?:[^{}]|(?:\{[^{}]*\}))*\})'
-                args_matches = list(re.finditer(args_pattern, current_text))
+                args_matches = list(re.finditer(args_pattern, search_text))

                if current_idx < len(args_matches):
                    args_text = args_matches[current_idx].group(1)
@@ -384,17 +434,25 @@ class xLAMToolParser(ToolParser):
                    # Handle transition between tools
                    is_last_tool = current_idx == tool_count - 1

-                    # Find where the arguments for our current tool end
-                    if not is_last_tool:
-                        # If we have more tools after this one, try to find the complete argument block
-                        next_tool_pos = current_text.find(
-                            "},{", args_matches[current_idx].start())
-                        if next_tool_pos != -1:
-                            args_end_pos = (next_tool_pos + 1
-                                            )  # +1 to include the '}'
-                            args_text = (current_text[args_matches[current_idx]
-                                                      .start():args_end_pos].
-                                         split('"arguments":')[1].strip())
+                    # For multiple tools, extract only the arguments for the current tool
+                    if tool_count > 1:
+                        # Parse the entire JSON structure to properly extract arguments for each tool
+                        try:
+                            parsed_tools = json.loads(search_text)
+                            if isinstance(
+                                    parsed_tools,
+                                    list) and current_idx < len(parsed_tools):
+                                current_tool = parsed_tools[current_idx]
+                                if isinstance(current_tool.get("arguments"),
+                                              dict):
+                                    args_text = json.dumps(
+                                        current_tool["arguments"])
+                                else:
+                                    args_text = str(
+                                        current_tool.get("arguments", "{}"))
+                        except (json.JSONDecodeError, KeyError, IndexError):
+                            # Fall back to the regex-extracted args_text
+                            pass

                    # If arguments haven't been sent yet
                    sent_args = self.streaming_state["sent_tools"][
@@ -419,7 +477,7 @@ class xLAMToolParser(ToolParser):
                                index=current_idx,
                                function=DeltaFunctionCall(
                                    arguments="{").model_dump(
-                                        exclude_none=True),  # type: ignore  
+                                        exclude_none=True),  # type: ignore
                            )
                        ])
                        return delta

--- a/vllm/entrypoints/utils.py
+++ b/vllm/entrypoints/utils.py
@@ -313,12 +313,14 @@ def log_non_default_args(args: Union[argparse.Namespace, EngineArgs]):

    # Handle EngineArgs instance
    elif isinstance(args, EngineArgs):
-        default_args = EngineArgs()  # Create default instance
+        default_args = EngineArgs(model=args.model)  # Create default instance
        for field in dataclasses.fields(args):
            current_val = getattr(args, field.name)
            default_val = getattr(default_args, field.name)
            if current_val != default_val:
                non_default_args[field.name] = current_val
+        if default_args.model != EngineArgs.model:
+            non_default_args["model"] = default_args.model
    else:
        raise TypeError("Unsupported argument type. " \
        "Must be argparse.Namespace or EngineArgs instance.")