Merge remote-tracking branch 'mirror/releases/v0.9.0' into v0.9.0-ori

4eabe123 · zhuwenwen · 45840cd2 · 58738772 · 4eabe123 · 4eabe123
Commit 4eabe123 authored May 28, 2025 by zhuwenwen
20 changed files
--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@@ -29,7 +29,7 @@ prometheus_client.disable_created_metrics()
 # to extract the metrics definitions.


-# begin-metrics-definitions
+# --8<-- [start:metrics-definitions]
 class Metrics:
    """
    vLLM uses a multiprocessing-based frontend for the OpenAI server.
@@ -293,7 +293,7 @@ class Metrics:
            labelnames=labelnames))


-# end-metrics-definitions
+# --8<-- [end:metrics-definitions]

    def _unregister_vllm_metrics(self) -> None:
        for collector in list(prometheus_client.REGISTRY._collector_to_names):

--- a/vllm/engine/multiprocessing/client.py
+++ b/vllm/engine/multiprocessing/client.py
@@ -492,8 +492,9 @@ class MQLLMEngineClient(EngineClient):
        from the LLMEngine to the caller.

        Args:
-            prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType`
-                for more details about the format of each input.
+            prompt: The prompt to the LLM. See
+                [`PromptType`][vllm.inputs.PromptType] for more details about
+                the format of each input.
            sampling_params: The sampling parameters of the request.
            request_id: The unique id of the request.
            lora_request: LoRA request to use for generation, if any.
@@ -561,8 +562,9 @@ class MQLLMEngineClient(EngineClient):
        from the LLMEngine to the caller.

        Args:
-            prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType`
-                for more details about the format of each input.
+            prompt: The prompt to the LLM. See
+                [`PromptType`][vllm.inputs.PromptType] for more details about
+                the format of each input.
            pooling_params: The pooling parameters of the request.
            request_id: The unique id of the request.
            lora_request: LoRA request to use for generation, if any.

--- a/vllm/engine/multiprocessing/engine.py
+++ b/vllm/engine/multiprocessing/engine.py
@@ -42,19 +42,22 @@ HEALTHY_RESPONSE = (pickle.dumps(VLLM_RPC_SUCCESS_STR), )


 class MQLLMEngine:
-    """A multiprocessing wrapper for {class}`LLMEngine`.
+    """A multiprocessing wrapper for
+    [`LLMEngine`][vllm.engine.llm_engine.LLMEngine].

-    This class is used to wrap the {class}`LLMEngine` class to enable use
+    This class is used to wrap the
+    [`LLMEngine`][vllm.engine.llm_engine.LLMEngine] class to enable use
    in concurrnet manner. It runs a background loop and uses zeromq to
    receive new requests and stream outputs incrementally via ipc.

-    The {class}`LLMEngine` generate or encode process is kicked off when a new
-    RPCProcessRequest is received by the input_socket.
+    The [`LLMEngine`][vllm.engine.llm_engine.LLMEngine] generate or encode
+    process is kicked off when a new RPCProcessRequest is received by the
+    input_socket.

    The self.engine_loop checks the input_socket for new requests,
    adds them to the LLMEngine if there are any, calls the internal
-    {class}`LLMEngine.step()`, and sends the RequestOutputs back over
-    the output_socket.
+    [`LLMEngine.step()`][vllm.engine.llm_engine.LLMEngine.step], and sends
+    the RequestOutputs back over the output_socket.

    If use_async_sockets is set, the logic associated with reading new
    requests from the socket and sending data to the socket is passed
@@ -65,8 +68,8 @@ class MQLLMEngine:
        ipc_path: Base path for zeromq interprocess messaging
        use_async_sockets: Whether to make send/recv async with GPU
        log_requests: Whether to log the requests.
-        *args: Arguments for {class}`LLMEngine`.
-        **kwargs: Arguments for {class}`LLMEngine`.
+        *args: Arguments for [`LLMEngine`][vllm.engine.llm_engine.LLMEngine].
+        **kwargs: Arguments for [`LLMEngine`][vllm.engine.llm_engine.LLMEngine].
    """

    def __init__(self,

--- a/vllm/engine/output_processor/multi_step.py
+++ b/vllm/engine/output_processor/multi_step.py
@@ -56,8 +56,11 @@ class MultiStepOutputProcessor(SequenceGroupOutputProcessor):
        scheduled computation.

        Args:
-          seq_group: the outputs are associated with this {class}`SequenceGroup`
-          outputs: the {class}`SequenceGroupOutput`s for all scheduler steps
+          seq_group: the outputs are associated with this
+              [`SequenceGroup`][vllm.sequence.SequenceGroup]
+          outputs: the
+              [`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput]s
+              for all scheduler steps
        """
        for output in outputs:
            # Concatenate single-step prompt logprob processing results.
@@ -67,7 +70,7 @@ class MultiStepOutputProcessor(SequenceGroupOutputProcessor):
    @staticmethod
    @functools.lru_cache
    def _log_prompt_logprob_unsupported_warning_once():
-        # Reminder: Please update docs/source/features/compatibility_matrix.md
+        # Reminder: Please update docs/features/compatibility_matrix.md
        # If the feature combo become valid
        logger.warning(
            "Prompt logprob is not supported by multi step workers. "

--- a/vllm/engine/output_processor/single_step.py
+++ b/vllm/engine/output_processor/single_step.py
@@ -19,17 +19,21 @@ logger = init_logger(__name__)
 def single_step_process_prompt_logprob(
        sg_output_proc: SequenceGroupOutputProcessor, seq_group: SequenceGroup,
        output: CompletionSequenceGroupOutput) -> None:
-    """Process prompt logprobs associated with the {class}`SequenceGroupOutput`
-    for a given step.
+    """Process prompt logprobs associated with the
+    [`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput] for a given step.

    Do nothing if the output has no prompt logprobs.

    Account for the fact that transformers do not compute first-token logprobs.
    
    Args:
-      sg_output_proc: {class}`SequenceGroupOutputProcessor` instance
-      seq_group: the output is associated with this {class}`SequenceGroup`
-      output: the {class}`SequenceGroupOutput` for a single scheduler step
+      sg_output_proc:
+          [`SequenceGroupOutputProcessor`][vllm.engine.output_processor.interfaces.SequenceGroupOutputProcessor]
+          instance
+      seq_group: the output is associated with this
+          [`SequenceGroup`][vllm.sequence.SequenceGroup]
+      output: the [`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput]
+          for a single scheduler step
    """
    prompt_logprobs = output.prompt_logprobs

@@ -103,8 +107,11 @@ class SingleStepOutputProcessor(SequenceGroupOutputProcessor):
        scheduled computation.
        
        Args:
-          seq_group: the output is associated with this {class}`SequenceGroup`
-          outputs: the {class}`SequenceGroupOutput` for a single scheduler step
+          seq_group: the output is associated with this
+              [`SequenceGroup`][vllm.sequence.SequenceGroup]
+          outputs: the
+              [`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput]
+              for a single scheduler step
        """
        assert len(outputs) == 1, "Single step should only have 1 output."
        output = outputs[0]

--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -556,6 +556,8 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
                return "(<audio>./</audio>)"
            raise TypeError(f"Unknown model type: {model_type}")
        elif modality == "video":
+            if model_type == "internvl_chat":
+                return "<video>"
            if model_type in ("qwen2_vl", "qwen2_5_vl"):
                return "<|vision_start|><|video_pad|><|vision_end|>"
            if model_type == "qwen2_5_omni":

--- a/vllm/entrypoints/cli/main.py
+++ b/vllm/entrypoints/cli/main.py
@@ -9,7 +9,7 @@ import vllm.entrypoints.cli.collect_env
 import vllm.entrypoints.cli.openai
 import vllm.entrypoints.cli.serve
 import vllm.version
-from vllm.entrypoints.utils import cli_env_setup
+from vllm.entrypoints.utils import VLLM_SERVE_PARSER_EPILOG, cli_env_setup
 from vllm.utils import FlexibleArgumentParser

 CMD_MODULES = [
@@ -32,7 +32,10 @@ def register_signal_handlers():
 def main():
    cli_env_setup()

-    parser = FlexibleArgumentParser(description="vLLM CLI")
+    parser = FlexibleArgumentParser(
+        description="vLLM CLI",
+        epilog=VLLM_SERVE_PARSER_EPILOG,
+    )
    parser.add_argument('-v',
                        '--version',
                        action='version',

--- a/vllm/entrypoints/cli/serve.py
+++ b/vllm/entrypoints/cli/serve.py
@@ -11,6 +11,8 @@ from vllm.entrypoints.cli.types import CLISubcommand
 from vllm.entrypoints.openai.api_server import run_server
 from vllm.entrypoints.openai.cli_args import (make_arg_parser,
                                              validate_parsed_serve_args)
+from vllm.entrypoints.utils import (VLLM_SERVE_PARSER_EPILOG,
+                                    show_filtered_argument_or_group_from_help)
 from vllm.logger import init_logger
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import FlexibleArgumentParser, get_tcp_uri
@@ -77,7 +79,10 @@ class ServeSubcommand(CLISubcommand):
            "https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#cli-reference"
        )

-        return make_arg_parser(serve_parser)
+        serve_parser = make_arg_parser(serve_parser)
+        show_filtered_argument_or_group_from_help(serve_parser)
+        serve_parser.epilog = VLLM_SERVE_PARSER_EPILOG
+        return serve_parser


 def cmd_init() -> list[CLISubcommand]:

--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -4,7 +4,8 @@ import itertools
 import warnings
 from collections.abc import Sequence
 from contextlib import contextmanager
-from typing import Any, Callable, ClassVar, Optional, Union, cast, overload
+from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Optional, Union,
+                    cast, overload)

 import cloudpickle
 import torch.nn as nn
@@ -47,6 +48,9 @@ from vllm.usage.usage_lib import UsageContext
 from vllm.utils import (Counter, Device, deprecate_args, deprecate_kwargs,
                        is_list_of)

+if TYPE_CHECKING:
+    from vllm.v1.metrics.reader import Metric
+
 logger = init_logger(__name__)

 _R = TypeVar("_R", default=Any)
@@ -116,7 +120,8 @@ class LLM:
            to eager mode. Additionally for encoder-decoder models, if the
            sequence length of the encoder input is larger than this, we fall
            back to the eager mode.
-        disable_custom_all_reduce: See {class}`~vllm.config.ParallelConfig`
+        disable_custom_all_reduce: See
+            [ParallelConfig][vllm.config.ParallelConfig].
        disable_async_output_proc: Disable async output processing.
            This may result in lower performance.
        hf_token: The token to use as HTTP bearer authorization for remote files
@@ -128,13 +133,11 @@ class LLM:
        compilation_config: Either an integer or a dictionary. If it is an
            integer, it is used as the level of compilation optimization. If it
            is a dictionary, it can specify the full compilation configuration.
-        **kwargs: Arguments for {class}`~vllm.EngineArgs`. (See
-            {ref}`engine-args`)
+        **kwargs: Arguments for [`EngineArgs`][vllm.EngineArgs].

-    :::{note}
-    This class is intended to be used for offline inference. For online
-    serving, use the {class}`~vllm.AsyncLLMEngine` class instead.
-    :::
+    Note:
+        This class is intended to be used for offline inference. For online
+        serving, use the [AsyncLLMEngine][vllm.AsyncLLMEngine] class instead.
    """

    DEPRECATE_LEGACY: ClassVar[bool] = True
@@ -143,7 +146,7 @@ class LLM:
    DEPRECATE_INIT_POSARGS: ClassVar[bool] = True
    """
    A flag to toggle whether to deprecate positional arguments in
-    {meth}`LLM.__init__`.
+    [LLM.__init__][].
    """

    @classmethod
@@ -404,7 +407,7 @@ class LLM:

        Args:
            prompts: The prompts to the LLM. You may pass a sequence of prompts
-                for batch inference. See {class}`~vllm.inputs.PromptType`
+                for batch inference. See [PromptType][vllm.inputs.PromptType]
                for more details about the format of each prompts.
            sampling_params: The sampling parameters for text generation. If
                None, we use the default sampling parameters.
@@ -422,11 +425,10 @@ class LLM:
            A list of `RequestOutput` objects containing the
            generated completions in the same order as the input prompts.

-        :::{note}
-        Using `prompts` and `prompt_token_ids` as keyword parameters is
-        considered legacy and may be deprecated in the future. You should
-        instead pass them via the `inputs` parameter.
-        :::
+        Note:
+            Using `prompts` and `prompt_token_ids` as keyword parameters is
+            considered legacy and may be deprecated in the future. You should
+            instead pass them via the `inputs` parameter.
        """
        runner_type = self.llm_engine.model_config.runner_type
        if runner_type not in ["generate", "transcription"]:
@@ -495,17 +497,16 @@ class LLM:
                `self` argument, in addition to the arguments passed in `args`
                and `kwargs`. The `self` argument will be the worker object.
            timeout: Maximum time in seconds to wait for execution. Raises a
-                {exc}`TimeoutError` on timeout. `None` means wait indefinitely.
+                [`TimeoutError`][] on timeout. `None` means wait indefinitely.
            args: Positional arguments to pass to the worker method.
            kwargs: Keyword arguments to pass to the worker method.

        Returns:
            A list containing the results from each worker.

-        :::{note}
-        It is recommended to use this API to only pass control messages,
-        and set up data-plane communication to pass data.
-        :::
+        Note:
+            It is recommended to use this API to only pass control messages,
+            and set up data-plane communication to pass data.
        """

        return self.llm_engine.collective_rpc(method, timeout, args, kwargs)
@@ -672,7 +673,7 @@ class LLM:
        Generate responses for a chat conversation.

        The chat conversation is converted into a text prompt using the
-        tokenizer and calls the {meth}`generate` method to generate the
+        tokenizer and calls the [generate][] method to generate the
        responses.

        Multi-modal inputs can be passed in the same way you would pass them
@@ -681,8 +682,8 @@ class LLM:
        Args:
            messages: A list of conversations or a single conversation.

-              - Each conversation is represented as a list of messages.
-              - Each message is a dictionary with 'role' and 'content' keys.
+                - Each conversation is represented as a list of messages.
+                - Each message is a dictionary with 'role' and 'content' keys.

            sampling_params: The sampling parameters for text generation.
                If None, we use the default sampling parameters. When it
@@ -692,27 +693,27 @@ class LLM:
            use_tqdm: Whether to use tqdm to display the progress bar.
            lora_request: LoRA request to use for generation, if any.
            chat_template: The template to use for structuring the chat.
-              If not provided, the model's default chat template will be used.
+                If not provided, the model's default chat template will be used.
            chat_template_content_format: The format to render message content.

-              - "string" will render the content as a string.
-                Example: ``"Who are you?"``
-              - "openai" will render the content as a list of dictionaries,
-                similar to OpenAI schema.
-                Example: ``[{"type": "text", "text": "Who are you?"}]``
+                - "string" will render the content as a string.
+                  Example: `"Who are you?"`
+                - "openai" will render the content as a list of dictionaries,
+                  similar to OpenAI schema.
+                  Example: `[{"type": "text", "text": "Who are you?"}]`

            add_generation_prompt: If True, adds a generation template
                to each message.
            continue_final_message: If True, continues the final message in
                the conversation instead of starting a new one. Cannot be
-                ``True`` if ``add_generation_prompt`` is also ``True``.
+                `True` if `add_generation_prompt` is also `True`.
            chat_template_kwargs: Additional kwargs to pass to the chat
                template.
            mm_processor_kwargs: Multimodal processor kwarg overrides for this
                chat request. Only used for offline requests.

        Returns:
-            A list of ``RequestOutput`` objects containing the generated
+            A list of `RequestOutput` objects containing the generated
            responses in the same order as the input messages.
        """
        list_of_messages: list[list[ChatCompletionMessageParam]]
@@ -911,7 +912,7 @@ class LLM:

        Args:
            prompts: The prompts to the LLM. You may pass a sequence of prompts
-                for batch inference. See {class}`~vllm.inputs.PromptType`
+                for batch inference. See [PromptType][vllm.inputs.PromptType]
                for more details about the format of each prompts.
            pooling_params: The pooling parameters for pooling. If None, we
                use the default pooling parameters.
@@ -924,11 +925,10 @@ class LLM:
            A list of `PoolingRequestOutput` objects containing the
            pooled hidden states in the same order as the input prompts.

-        :::{note}
-        Using `prompts` and `prompt_token_ids` as keyword parameters is
-        considered legacy and may be deprecated in the future. You should
-        instead pass them via the `inputs` parameter.
-        :::
+        Note:
+            Using `prompts` and `prompt_token_ids` as keyword parameters is
+            considered legacy and may be deprecated in the future. You should
+            instead pass them via the `inputs` parameter.
        """
        runner_type = self.llm_engine.model_config.runner_type
        if runner_type != "pooling":
@@ -1001,7 +1001,7 @@ class LLM:

        Args:
            prompts: The prompts to the LLM. You may pass a sequence of prompts
-                for batch inference. See {class}`~vllm.inputs.PromptType`
+                for batch inference. See [PromptType][vllm.inputs.PromptType]
                for more details about the format of each prompts.
            pooling_params: The pooling parameters for pooling. If None, we
                use the default pooling parameters.
@@ -1011,7 +1011,7 @@ class LLM:
                generation, if any.

        Returns:
-            A list of ``EmbeddingRequestOutput`` objects containing the
+            A list of `EmbeddingRequestOutput` objects containing the
            embedding vectors in the same order as the input prompts.
        """
        if self.llm_engine.model_config.task != "embed":
@@ -1045,7 +1045,7 @@ class LLM:

        Args:
            prompts: The prompts to the LLM. You may pass a sequence of prompts
-                for batch inference. See {class}`~vllm.inputs.PromptType`
+                for batch inference. See [PromptType][vllm.inputs.PromptType]
                for more details about the format of each prompts.
            use_tqdm: Whether to use tqdm to display the progress bar.
            lora_request: LoRA request to use for generation, if any.
@@ -1053,7 +1053,7 @@ class LLM:
                generation, if any.

        Returns:
-            A list of ``ClassificationRequestOutput`` objects containing the
+            A list of `ClassificationRequestOutput` objects containing the
            embedding vectors in the same order as the input prompts.
        """
        if self.llm_engine.model_config.task != "classify":
@@ -1163,11 +1163,11 @@ class LLM:
        lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
    ) -> list[ScoringRequestOutput]:
-        """Generate similarity scores for all pairs ``<text,text_pair>``.
+        """Generate similarity scores for all pairs `<text,text_pair>`.

-        The inputs can be ``1 -> 1``, ``1 -> N`` or ``N -> N``.
-        In the ``1 - N`` case the ``text_1`` sentence will be replicated ``N``
-        times to pair with the ``text_2`` sentences.
+        The inputs can be `1 -> 1`, `1 -> N` or `N -> N`.
+        In the `1 - N` case the `text_1` sentence will be replicated `N`
+        times to pair with the `text_2` sentences.
        The input pairs are used to build a list of prompts for the
        cross encoder model. This class automatically batches the prompts,
        considering the memory constraint. For the best performance, put all
@@ -1175,9 +1175,9 @@ class LLM:

        Args:
            text_1: can be a single prompt or a list of prompts, in which
-                case it has to have the same length as the ``text_2`` list
+                case it has to have the same length as the `text_2` list
            text_2: The texts to pair with the query to form the input
-                to the LLM. See {class}`~vllm.inputs.PromptType` for
+                to the LLM. See [PromptType][vllm.inputs.PromptType] for
                more details about the format of each prompts.
            use_tqdm: Whether to use tqdm to display the progress bar.
            lora_request: LoRA request to use for generation, if any.
@@ -1185,7 +1185,7 @@ class LLM:
                generation, if any.

        Returns:
-            A list of ``ScoringRequestOutput`` objects containing the
+            A list of `ScoringRequestOutput` objects containing the
            generated scores in the same order as the input prompts.
        """
        runner_type = self.llm_engine.model_config.runner_type
@@ -1286,18 +1286,32 @@ class LLM:

    def wake_up(self, tags: Optional[list[str]] = None):
        """
-        Wake up the engine from sleep mode. See the {meth}`sleep` method
+        Wake up the engine from sleep mode. See the [sleep][] method
        for more details.
        
        Args:
            tags: An optional list of tags to reallocate the engine memory 
                for specific memory allocations. Values must be in 
-                ("weights", "kv_cache",). If None, all memory is reallocated.
+                `("weights", "kv_cache")`. If None, all memory is reallocated.
                wake_up should be called with all tags (or None) before the 
                engine is used again.
        """
        self.llm_engine.wake_up(tags)

+    def get_metrics(self) -> list["Metric"]:
+        """Return a snapshot of aggregated metrics from Prometheus.
+
+        Returns:
+            A ``MetricSnapshot`` instance capturing the current state
+            of all aggregated metrics from Prometheus.
+
+        Note:
+            This method is only available with the V1 LLM engine.
+        """
+        from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
+        assert isinstance(self.llm_engine, V1LLMEngine)
+        return self.llm_engine.get_metrics()
+
    # LEGACY
    def _convert_v1_inputs(
        self,
@@ -1306,27 +1320,25 @@ class LLM:
    ):
        # skip_tokenizer_init is now checked in engine

+        if prompts is None and prompt_token_ids is None:
+            raise ValueError(
+                "Either prompts or prompt_token_ids must be provided.")
+        if prompts is not None and prompt_token_ids is not None \
+                and len(prompts) != len(prompt_token_ids):
+            raise ValueError(
+                "The lengths of prompts and prompt_token_ids must be the same."
+            )
+
        if prompts is not None:
            prompts = [p["content"] for p in parse_and_batch_prompt(prompts)]
        if prompt_token_ids is not None:
            prompt_token_ids = [
                p["content"] for p in parse_and_batch_prompt(prompt_token_ids)
            ]
-
-        num_requests = None
        if prompts is not None:
            num_requests = len(prompts)
-        if prompt_token_ids is not None:
-            if (num_requests is not None
-                    and num_requests != len(prompt_token_ids)):
-                raise ValueError("The lengths of prompts and prompt_token_ids "
-                                 "must be the same.")
-
+        elif prompt_token_ids is not None:
            num_requests = len(prompt_token_ids)
-        if num_requests is None:
-            raise ValueError("Either prompts or prompt_token_ids must be "
-                             "provided.")
-
        parsed_prompts: list[PromptType] = []
        for i in range(num_requests):
            item: PromptType

--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -7,7 +7,6 @@ import importlib
 import inspect
 import multiprocessing
 import os
-import re
 import signal
 import socket
 import tempfile
@@ -21,6 +20,7 @@ from json import JSONDecodeError
 from typing import Annotated, Optional, Union

 import prometheus_client
+import regex as re
 import uvloop
 from fastapi import APIRouter, Depends, FastAPI, Form, HTTPException, Request
 from fastapi.exceptions import RequestValidationError

--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -3,11 +3,11 @@
 # Adapted from
 # https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py
 import json
-import re
 import time
 from http import HTTPStatus
 from typing import Annotated, Any, ClassVar, Literal, Optional, Union

+import regex as re
 import torch
 from fastapi import HTTPException, UploadFile
 from pydantic import (BaseModel, ConfigDict, Field, TypeAdapter,
@@ -251,7 +251,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
    parallel_tool_calls: Optional[bool] = False
    user: Optional[str] = None

-    # doc: begin-chat-completion-sampling-params
+    # --8<-- [start:chat-completion-sampling-params]
    best_of: Optional[int] = None
    use_beam_search: bool = False
    top_k: Optional[int] = None
@@ -266,9 +266,9 @@ class ChatCompletionRequest(OpenAIBaseModel):
    spaces_between_special_tokens: bool = True
    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
    prompt_logprobs: Optional[int] = None
-    # doc: end-chat-completion-sampling-params
+    # --8<-- [end:chat-completion-sampling-params]

-    # doc: begin-chat-completion-extra-params
+    # --8<-- [start:chat-completion-extra-params]
    echo: bool = Field(
        default=False,
        description=(
@@ -407,7 +407,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
        default=None,
        description="KVTransfer parameters used for disaggregated serving.")

-    # doc: end-chat-completion-extra-params
+    # --8<-- [end:chat-completion-extra-params]

    # Default sampling parameters for chat completion requests
    _DEFAULT_SAMPLING_PARAMS: dict = {
@@ -764,7 +764,7 @@ class CompletionRequest(OpenAIBaseModel):
    top_p: Optional[float] = None
    user: Optional[str] = None

-    # doc: begin-completion-sampling-params
+    # --8<-- [start:completion-sampling-params]
    use_beam_search: bool = False
    top_k: Optional[int] = None
    min_p: Optional[float] = None
@@ -779,9 +779,9 @@ class CompletionRequest(OpenAIBaseModel):
    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
    allowed_token_ids: Optional[list[int]] = None
    prompt_logprobs: Optional[int] = None
-    # doc: end-completion-sampling-params
+    # --8<-- [end:completion-sampling-params]

-    # doc: begin-completion-extra-params
+    # --8<-- [start:completion-extra-params]
    add_special_tokens: bool = Field(
        default=True,
        description=(
@@ -858,7 +858,7 @@ class CompletionRequest(OpenAIBaseModel):
        default=None,
        description="KVTransfer parameters used for disaggregated serving.")

-    # doc: end-completion-extra-params
+    # --8<-- [end:completion-extra-params]

    # Default sampling parameters for completion requests
    _DEFAULT_SAMPLING_PARAMS: dict = {
@@ -1045,11 +1045,11 @@ class EmbeddingCompletionRequest(OpenAIBaseModel):
    user: Optional[str] = None
    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None

-    # doc: begin-embedding-pooling-params
+    # --8<-- [start:embedding-pooling-params]
    additional_data: Optional[Any] = None
-    # doc: end-embedding-pooling-params
+    # --8<-- [end:embedding-pooling-params]

-    # doc: begin-embedding-extra-params
+    # --8<-- [start:embedding-extra-params]
    add_special_tokens: bool = Field(
        default=True,
        description=(
@@ -1064,7 +1064,7 @@ class EmbeddingCompletionRequest(OpenAIBaseModel):
            "if the served model does not use priority scheduling."),
    )

-    # doc: end-embedding-extra-params
+    # --8<-- [end:embedding-extra-params]

    def to_pooling_params(self):
        return PoolingParams(dimensions=self.dimensions,
@@ -1080,11 +1080,11 @@ class EmbeddingChatRequest(OpenAIBaseModel):
    user: Optional[str] = None
    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None

-    # doc: begin-chat-embedding-pooling-params
+    # --8<-- [start:chat-embedding-pooling-params]
    additional_data: Optional[Any] = None
-    # doc: end-chat-embedding-pooling-params
+    # --8<-- [end:chat-embedding-pooling-params]

-    # doc: begin-chat-embedding-extra-params
+    # --8<-- [start:chat-embedding-extra-params]
    add_special_tokens: bool = Field(
        default=False,
        description=(
@@ -1118,7 +1118,7 @@ class EmbeddingChatRequest(OpenAIBaseModel):
            "default: 0). Any priority other than 0 will raise an error "
            "if the served model does not use priority scheduling."),
    )
-    # doc: end-chat-embedding-extra-params
+    # --8<-- [end:chat-embedding-extra-params]

    @model_validator(mode="before")
    @classmethod
@@ -1147,11 +1147,11 @@ class ScoreRequest(OpenAIBaseModel):
    text_2: Union[list[str], str]
    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None

-    # doc: begin-score-pooling-params
+    # --8<-- [start:score-pooling-params]
    additional_data: Optional[Any] = None
-    # doc: end-score-pooling-params
+    # --8<-- [end:score-pooling-params]

-    # doc: begin-score-extra-params
+    # --8<-- [start:score-extra-params]
    priority: int = Field(
        default=0,
        description=(
@@ -1160,7 +1160,7 @@ class ScoreRequest(OpenAIBaseModel):
            "if the served model does not use priority scheduling."),
    )

-    # doc: end-score-extra-params
+    # --8<-- [end:score-extra-params]

    def to_pooling_params(self):
        return PoolingParams(additional_data=self.additional_data)
@@ -1173,11 +1173,11 @@ class RerankRequest(OpenAIBaseModel):
    top_n: int = Field(default_factory=lambda: 0)
    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None

-    # doc: begin-rerank-pooling-params
+    # --8<-- [start:rerank-pooling-params]
    additional_data: Optional[Any] = None
-    # doc: end-rerank-pooling-params
+    # --8<-- [end:rerank-pooling-params]

-    # doc: begin-rerank-extra-params
+    # --8<-- [start:rerank-extra-params]
    priority: int = Field(
        default=0,
        description=(
@@ -1186,7 +1186,7 @@ class RerankRequest(OpenAIBaseModel):
            "if the served model does not use priority scheduling."),
    )

-    # doc: end-rerank-extra-params
+    # --8<-- [end:rerank-extra-params]

    def to_pooling_params(self):
        return PoolingParams(additional_data=self.additional_data)
@@ -1321,11 +1321,11 @@ class ClassificationRequest(OpenAIBaseModel):
    truncate_prompt_tokens: Optional[int] = None
    user: Optional[str] = None

-    # doc: begin-classification-pooling-params
+    # --8<-- [start:classification-pooling-params]
    additional_data: Optional[Any] = None
-    # doc: end-classification-pooling-params
+    # --8<-- [end:classification-pooling-params]

-    # doc: begin-classification-extra-params
+    # --8<-- [start:classification-extra-params]
    priority: int = Field(
        default=0,
        description=(
@@ -1334,7 +1334,7 @@ class ClassificationRequest(OpenAIBaseModel):
            "if the served model does not use priority scheduling."),
    )

-    # doc: end-classification-extra-params
+    # --8<-- [end:classification-extra-params]

    def to_pooling_params(self):
        return PoolingParams(additional_data=self.additional_data)
@@ -1698,7 +1698,7 @@ class TranscriptionRequest(OpenAIBaseModel):
    timestamps incurs additional latency.
    """

-    # doc: begin-transcription-extra-params
+    # --8<-- [start:transcription-extra-params]
    stream: Optional[bool] = False
    """Custom field not present in the original OpenAI definition. When set,
    it will enable output to be streamed in a similar fashion as the Chat
@@ -1707,9 +1707,9 @@ class TranscriptionRequest(OpenAIBaseModel):
    # Flattened stream option to simplify form data.
    stream_include_usage: Optional[bool] = False
    stream_continuous_usage_stats: Optional[bool] = False
-    # doc: end-transcription-extra-params
+    # --8<-- [end:transcription-extra-params]

-    # doc: begin-transcription-sampling-params
+    # --8<-- [start:transcription-sampling-params]
    temperature: float = Field(default=0.0)
    """The sampling temperature, between 0 and 1.

@@ -1743,7 +1743,7 @@ class TranscriptionRequest(OpenAIBaseModel):

    presence_penalty: Optional[float] = 0.0
    """The presence penalty to use for sampling."""
-    # doc: end-transcription-sampling-params
+    # --8<-- [end:transcription-sampling-params]

    # Default sampling parameters for transcription requests.
    _DEFAULT_SAMPLING_PARAMS: dict = {

--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@@ -365,8 +365,8 @@ async def main(args):

        # Determine the type of request and run it.
        if request.url == "/v1/chat/completions":
-            chat_handler_fn = (None if openai_serving_chat is None else
-                               openai_serving_chat.create_chat_completion)
+            chat_handler_fn = openai_serving_chat.create_chat_completion if \
+                openai_serving_chat is not None else None
            if chat_handler_fn is None:
                response_futures.append(
                    make_async_error_request_output(
@@ -380,8 +380,8 @@ async def main(args):
                run_request(chat_handler_fn, request, tracker))
            tracker.submitted()
        elif request.url == "/v1/embeddings":
-            embed_handler_fn = (None if openai_serving_embedding is None else
-                                openai_serving_embedding.create_embedding)
+            embed_handler_fn = openai_serving_embedding.create_embedding if \
+                openai_serving_embedding is not None else None
            if embed_handler_fn is None:
                response_futures.append(
                    make_async_error_request_output(
@@ -394,8 +394,8 @@ async def main(args):
                run_request(embed_handler_fn, request, tracker))
            tracker.submitted()
        elif request.url == "/v1/score":
-            score_handler_fn = (None if openai_serving_scores is None else
-                                openai_serving_scores.create_score)
+            score_handler_fn = openai_serving_scores.create_score if \
+                openai_serving_scores is not None else None
            if score_handler_fn is None:
                response_futures.append(
                    make_async_error_request_output(

--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -2,7 +2,6 @@

 import asyncio
 import json
-import re
 import time
 from collections.abc import AsyncGenerator, AsyncIterator
 from collections.abc import Sequence as GenericSequence
@@ -10,6 +9,7 @@ from typing import Callable, Final, Optional, Union

 import jinja2
 import partial_json_parser
+import regex as re
 from fastapi import Request
 from pydantic import TypeAdapter


--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -582,7 +582,8 @@ class OpenAIServing:
        add_special_tokens: bool = True,
    ) -> TextTokensPrompt:
        """
-        A simpler implementation of {meth}`_tokenize_prompt_input_or_inputs`
+        A simpler implementation of
+        [`_tokenize_prompt_input_or_inputs`][vllm.entrypoints.openai.serving_engine.OpenAIServing._tokenize_prompt_input_or_inputs]
        that assumes single input.
        """
        return next(
@@ -603,7 +604,8 @@ class OpenAIServing:
        add_special_tokens: bool = True,
    ) -> Iterator[TextTokensPrompt]:
        """
-        A simpler implementation of {meth}`_tokenize_prompt_input_or_inputs`
+        A simpler implementation of
+        [`_tokenize_prompt_input_or_inputs`][vllm.entrypoints.openai.serving_engine.OpenAIServing._tokenize_prompt_input_or_inputs]
        that assumes multiple inputs.
        """
        for text in prompt_inputs:

--- a/vllm/entrypoints/openai/tool_parsers/__init__.py
+++ b/vllm/entrypoints/openai/tool_parsers/__init__.py
@@ -7,6 +7,7 @@ from .granite_tool_parser import GraniteToolParser
 from .hermes_tool_parser import Hermes2ProToolParser
 from .internlm2_tool_parser import Internlm2ToolParser
 from .jamba_tool_parser import JambaToolParser
+from .llama4_pythonic_tool_parser import Llama4PythonicToolParser
 from .llama_tool_parser import Llama3JsonToolParser
 from .mistral_tool_parser import MistralToolParser
 from .phi4mini_tool_parser import Phi4MiniJsonToolParser
@@ -16,5 +17,6 @@ __all__ = [
    "ToolParser", "ToolParserManager", "Granite20bFCToolParser",
    "GraniteToolParser", "Hermes2ProToolParser", "MistralToolParser",
    "Internlm2ToolParser", "Llama3JsonToolParser", "JambaToolParser",
-    "PythonicToolParser", "Phi4MiniJsonToolParser", "DeepSeekV3ToolParser"
+    "Llama4PythonicToolParser", "PythonicToolParser", "Phi4MiniJsonToolParser",
+    "DeepSeekV3ToolParser"
 ]
--- a/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py
 # SPDX-License-Identifier: Apache-2.0

-import re
 from collections.abc import Sequence
 from typing import Union

+import regex as re
+
 from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                              DeltaFunctionCall, DeltaMessage,
                                              DeltaToolCall,

--- a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
 # SPDX-License-Identifier: Apache-2.0

 import json
-import re
 from collections.abc import Sequence
 from json import JSONDecoder
 from typing import Union

 import partial_json_parser
+import regex as re
 from partial_json_parser.core.options import Allow

 from vllm.entrypoints.chat_utils import random_tool_call_id
@@ -80,7 +80,8 @@ class Granite20bFCToolParser(ToolParser):
                    function=FunctionCall(
                        name=function_call["name"],
                        # function call args are JSON but as a string
-                        arguments=json.dumps(function_call["arguments"]),
+                        arguments=json.dumps(function_call["arguments"],
+                                             ensure_ascii=False),
                    ),
                ) for function_call in raw_function_calls
            ]
@@ -166,7 +167,8 @@ class Granite20bFCToolParser(ToolParser):
                if self.current_tool_id >= 0:
                    cur_arguments = current_tool_call.get("arguments")
                    if cur_arguments:
-                        cur_args_json = json.dumps(cur_arguments)
+                        cur_args_json = json.dumps(cur_arguments,
+                                                   ensure_ascii=False)
                        sent = len(
                            self.streamed_args_for_tool[self.current_tool_id])
                        argument_diff = cur_args_json[sent:]
@@ -218,7 +220,8 @@ class Granite20bFCToolParser(ToolParser):
                if cur_arguments:
                    sent = len(
                        self.streamed_args_for_tool[self.current_tool_id])
-                    cur_args_json = json.dumps(cur_arguments)
+                    cur_args_json = json.dumps(cur_arguments,
+                                               ensure_ascii=False)
                    prev_arguments = self.prev_tool_call_arr[
                        self.current_tool_id].get("arguments")

@@ -226,7 +229,8 @@ class Granite20bFCToolParser(ToolParser):
                    if is_complete[self.current_tool_id]:
                        argument_diff = cur_args_json[sent:]
                    elif prev_arguments:
-                        prev_args_json = json.dumps(prev_arguments)
+                        prev_args_json = json.dumps(prev_arguments,
+                                                    ensure_ascii=False)
                        if cur_args_json != prev_args_json:

                            prefix = find_common_prefix(

--- a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
@@ -67,7 +67,8 @@ class GraniteToolParser(ToolParser):
                    function=FunctionCall(
                        name=function_call["name"],
                        # function call args are JSON but as a string
-                        arguments=json.dumps(function_call["arguments"]),
+                        arguments=json.dumps(function_call["arguments"],
+                                             ensure_ascii=False),
                    ),
                ) for function_call in raw_function_calls
            ]
@@ -151,7 +152,8 @@ class GraniteToolParser(ToolParser):
                if self.current_tool_id >= 0:
                    cur_arguments = current_tool_call.get("arguments")
                    if cur_arguments:
-                        cur_args_json = json.dumps(cur_arguments)
+                        cur_args_json = json.dumps(cur_arguments,
+                                                   ensure_ascii=False)
                        sent = len(
                            self.streamed_args_for_tool[self.current_tool_id])
                        argument_diff = cur_args_json[sent:]
@@ -197,7 +199,8 @@ class GraniteToolParser(ToolParser):
                if cur_arguments:
                    sent = len(
                        self.streamed_args_for_tool[self.current_tool_id])
-                    cur_args_json = json.dumps(cur_arguments)
+                    cur_args_json = json.dumps(cur_arguments,
+                                               ensure_ascii=False)
                    prev_arguments = self.prev_tool_call_arr[
                        self.current_tool_id].get("arguments")

@@ -205,7 +208,8 @@ class GraniteToolParser(ToolParser):
                    if is_complete[self.current_tool_id]:
                        argument_diff = cur_args_json[sent:]
                    elif prev_arguments:
-                        prev_args_json = json.dumps(prev_arguments)
+                        prev_args_json = json.dumps(prev_arguments,
+                                                    ensure_ascii=False)
                        if cur_args_json != prev_args_json:
                            prefix = find_common_prefix(
                                prev_args_json, cur_args_json)

--- a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
 # SPDX-License-Identifier: Apache-2.0

 import json
-import re
 from collections.abc import Sequence
 from typing import Union

 import partial_json_parser
+import regex as re
 from partial_json_parser.core.options import Allow

 from vllm.entrypoints.chat_utils import random_tool_call_id

--- a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
@@ -133,7 +133,8 @@ class Internlm2ToolParser(ToolParser):
                    delta = None
                # first time to get parameters
                elif cur_arguments and not prev_arguments:
-                    cur_arguments_json = json.dumps(cur_arguments)
+                    cur_arguments_json = json.dumps(cur_arguments,
+                                                    ensure_ascii=False)

                    arguments_delta = cur_arguments_json[:cur_arguments_json.
                                                         index(delta_text) +
@@ -148,8 +149,10 @@ class Internlm2ToolParser(ToolParser):
                        self.current_tool_id] += arguments_delta
                # both prev and cur parameters, send the increase parameters
                elif cur_arguments and prev_arguments:
-                    cur_args_json = json.dumps(cur_arguments)
-                    prev_args_json = json.dumps(prev_arguments)
+                    cur_args_json = json.dumps(cur_arguments,
+                                               ensure_ascii=False)
+                    prev_args_json = json.dumps(prev_arguments,
+                                                ensure_ascii=False)

                    argument_diff = extract_intermediate_diff(
                        cur_args_json, prev_args_json)
@@ -190,7 +193,8 @@ class Internlm2ToolParser(ToolParser):
            action_dict = json.loads(action)
            name, parameters = action_dict['name'], json.dumps(
                action_dict.get('parameters', action_dict.get('arguments',
-                                                              {})))
+                                                              {})),
+                ensure_ascii=False)

            if not tools or name not in [t.function.name for t in tools]:
                ExtractedToolCallInformation(tools_called=False,