Merge tag 'v0.14.0' into v0.14.0-dev

7e63ef82 · zhuwenwen · 8cbcac5d · b17039bc · 7e63ef82 · 7e63ef82
Commit 7e63ef82 authored Jan 21, 2026 by zhuwenwen
20 changed files
--- a/examples/pooling/score/template/nemotron-rerank.jinja
+++ b/examples/pooling/score/template/nemotron-rerank.jinja
+{#- Score prompt used by the examples for nvidia/llama-nemotron-rerank-1b-v2.
+    Renders the content of the first "query"-role message and of the first
+    "document"-role message. The spaces and blank line between the two parts
+    are literal template text and therefore part of the rendered prompt. -#}
+question:{{ (messages | selectattr("role", "eq", "query") | first).content }} 
+ 
+ passage:{{ (messages | selectattr("role", "eq", "document") | first).content }}
\ No newline at end of file
--- a/examples/pooling/score/template/qwen3_reranker.jinja
+++ b/examples/pooling/score/template/qwen3_reranker.jinja
+{#- Score prompt used by the examples for Qwen3-Reranker models.
+    Layout: a fixed system turn, a user turn holding <Instruct>/<Query>/<Document>,
+    then an assistant turn that already contains an empty <think> block.
+    <Instruct> is the content of the first "system"-role message, or the default
+    text below when no such message exists; <Query> and <Document> come from the
+    first "query"- and "document"-role messages. -#}
+<|im_start|>system
+Judge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be "yes" or "no".<|im_end|>
+<|im_start|>user
+<Instruct>: {{ messages | selectattr("role", "eq", "system") | map(attribute="content") | first | default("Given a web search query, retrieve relevant passages that answer the query") }}
+<Query>: {{ messages | selectattr("role", "eq", "query") | map(attribute="content") | first }}
+<Document>: {{ messages | selectattr("role", "eq", "document") | map(attribute="content") | first }}<|im_end|>
+<|im_start|>assistant
+<think>
+
+</think>
+
--- a/examples/pooling/score/template/qwen3_vl_reranker.jinja
+++ b/examples/pooling/score/template/qwen3_vl_reranker.jinja
+{#- Score prompt used by the examples for Qwen3-VL-Reranker.
+    Layout: a fixed system turn, a user turn holding <Instruct>/<Query>/<Document>,
+    then an opened assistant turn.
+    <Instruct> is the content of the first "system"-role message, or the default
+    text below when none exists; <Query> and <Document> come from the first
+    "query"- and "document"-role messages.
+    NOTE(review): "<Query>:" follows the instruction with no newline in between,
+    while "<Document>:" starts on its own line - presumably this mirrors the
+    model's reference prompt; confirm before changing any whitespace. -#}
+<|im_start|>system
+Judge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be "yes" or "no".<|im_end|>
+<|im_start|>user
+<Instruct>: {{
+    messages
+    | selectattr("role", "eq", "system")
+    | map(attribute="content")
+    | first
+    | default("Given a search query, retrieve relevant candidates that answer the query.")
+}}<Query>:{{
+    messages
+    | selectattr("role", "eq", "query")
+    | map(attribute="content")
+    | first
+}}
+<Document>:{{
+    messages
+    | selectattr("role", "eq", "document")
+    | map(attribute="content")
+    | first
+}}<|im_end|>
+<|im_start|>assistant
+
--- a/examples/pooling/score/using_template_offline.py
+++ b/examples/pooling/score/using_template_offline.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# ruff: noqa: E501
+from argparse import Namespace
+from pathlib import Path
+from typing import Any
+
+from vllm import LLM, EngineArgs
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+
+
+def parse_args():
+    """Build the command line for this reranking demo and parse it.
+
+    Every ``EngineArgs`` option is exposed as a CLI flag; only the defaults
+    for ``model``, ``runner`` and ``trust_remote_code`` are replaced so the
+    script can be run without passing any flags.
+    """
+    # Demo-specific defaults applied on top of the regular engine defaults
+    rerank_defaults = {
+        "model": "nvidia/llama-nemotron-rerank-1b-v2",
+        "runner": "pooling",
+        "trust_remote_code": True,
+    }
+
+    # Register all engine options first, then override the three defaults
+    cli = EngineArgs.add_cli_args(FlexibleArgumentParser())
+    cli.set_defaults(**rerank_defaults)
+    return cli.parse_args()
+
+
+def get_chat_template(model: str) -> str:
+    """Return the chat template text this demo uses for ``model``.
+
+    Reranking models require specific prompt templates to format
+    query-document pairs correctly. Template files are read from the
+    ``template`` directory that sits next to this script.
+
+    Args:
+        model: Model name; must be one of the keys of the mapping below.
+
+    Returns:
+        The contents of the matching ``.jinja`` template file.
+
+    Raises:
+        ValueError: If ``model`` has no template registered in this demo.
+    """
+    # Directory containing all chat template files
+    template_home = Path(__file__).parent / "template"
+
+    # Mapping from model names to their corresponding template files
+    # Each reranking model has its own specific prompt format
+    model_name_to_template_path_map = {
+        "BAAI/bge-reranker-v2-gemma": "bge-reranker-v2-gemma.jinja",
+        "Qwen/Qwen3-Reranker-0.6B": "qwen3_reranker.jinja",
+        "Qwen/Qwen3-Reranker-4B": "qwen3_reranker.jinja",
+        "Qwen/Qwen3-Reranker-8B": "qwen3_reranker.jinja",
+        "tomaarsen/Qwen3-Reranker-0.6B-seq-cls": "qwen3_reranker.jinja",
+        "tomaarsen/Qwen3-Reranker-4B-seq-cls": "qwen3_reranker.jinja",
+        "tomaarsen/Qwen3-Reranker-8B-seq-cls": "qwen3_reranker.jinja",
+        "mixedbread-ai/mxbai-rerank-base-v2": "mxbai_rerank_v2.jinja",
+        "mixedbread-ai/mxbai-rerank-large-v2": "mxbai_rerank_v2.jinja",
+        "nvidia/llama-nemotron-rerank-1b-v2": "nemotron-rerank.jinja",
+    }
+
+    # Get the template filename for the specified model
+    template_path = model_name_to_template_path_map.get(model)
+
+    if template_path is None:
+        raise ValueError(f"This demo does not support model name: {model}.")
+
+    # Decode explicitly as UTF-8 so the template text does not depend on the
+    # platform's default locale encoding.
+    return (template_home / template_path).read_text(encoding="utf-8")
+
+
+def get_hf_overrides(model: str) -> dict[str, Any]:
+    """Look up the ``hf_overrides`` this demo passes to vLLM for ``model``.
+
+    note:
+        Some reranking models require special configuration overrides to work
+        correctly with vLLM's score API; models that need none map to an
+        empty dict.
+        Reference: https://github.com/vllm-project/vllm/blob/main/examples/pooling/score/qwen3_reranker_offline.py
+        Reference: https://github.com/vllm-project/vllm/blob/main/examples/pooling/score/convert_model_to_seq_cls.py
+
+    Raises:
+        ValueError: If ``model`` is not one of the models known to this demo.
+    """
+    # Override sets shared by several checkpoints of the same family
+    qwen3_original = {
+        "architectures": ["Qwen3ForSequenceClassification"],
+        "classifier_from_token": ["no", "yes"],
+        "is_original_qwen3_reranker": True,
+    }
+    mxbai_v2 = {
+        "architectures": ["Qwen2ForSequenceClassification"],
+        "classifier_from_token": ["0", "1"],
+        "method": "from_2_way_softmax",
+    }
+    qwen3_sizes = ("0.6B", "4B", "8B")
+
+    overrides_by_model: dict[str, dict[str, Any]] = {
+        "BAAI/bge-reranker-v2-gemma": {
+            "architectures": ["GemmaForSequenceClassification"],
+            "classifier_from_token": ["Yes"],
+            "method": "no_post_processing",
+        },
+        "nvidia/llama-nemotron-rerank-1b-v2": {},
+    }
+    for size in qwen3_sizes:
+        overrides_by_model[f"Qwen/Qwen3-Reranker-{size}"] = dict(qwen3_original)
+        # The seq-cls conversions need no overrides
+        overrides_by_model[f"tomaarsen/Qwen3-Reranker-{size}-seq-cls"] = {}
+    for variant in ("base", "large"):
+        overrides_by_model[f"mixedbread-ai/mxbai-rerank-{variant}-v2"] = dict(mxbai_v2)
+
+    try:
+        return overrides_by_model[model]
+    except KeyError:
+        raise ValueError(f"This demo does not support model name: {model}.") from None
+
+
+def main(args: Namespace):
+    """Score three sample documents against one query and print the scores.
+
+    ``args`` is the parsed command line; its ``hf_overrides`` attribute is
+    overwritten with the overrides registered for ``args.model`` before all
+    attributes are forwarded to ``LLM``.
+    """
+    # Sample query and the candidate documents to be scored against it
+    query = "how much protein should a female eat?"
+    documents = [
+        "As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
+        "Definition of summit for English Language Learners. : 1  the highest point of a mountain : the top of a mountain. : 2  the highest level. : 3  a meeting or series of meetings between the leaders of two or more governments.",
+        "Calorie intake should not fall below 1,200 a day in women or 1,500 a day in men, except under the supervision of a health professional.",
+    ]
+
+    # Model-specific overrides must be in place before the engine is built
+    args.hf_overrides = get_hf_overrides(args.model)
+    engine = LLM(**vars(args))
+
+    # The template formats each query-document pair for the selected model
+    template = get_chat_template(args.model)
+    results = engine.score(query, documents, chat_template=template)
+    scores = [result.outputs.score for result in results]
+
+    # One score per document, in the order the documents were given
+    separator = "-" * 30
+    print(separator)
+    print(scores)
+    print(separator)
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
--- a/examples/pooling/score/using_template_online.py
+++ b/examples/pooling/score/using_template_online.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# ruff: noqa: E501
+"""
+Example of using the rerank API with template.
+
+This script demonstrates how to interact with a vLLM server running
+a reranking model via the REST API.
+Before running this script, start the vLLM server with one of the
+supported reranking models using the commands below.
+
+note:
+    Some reranking models require special configuration overrides to work correctly
+    with vLLM's score API.
+    Reference: https://github.com/vllm-project/vllm/blob/main/examples/pooling/score/qwen3_reranker_online.py
+    Reference: https://github.com/vllm-project/vllm/blob/main/examples/pooling/score/convert_model_to_seq_cls.py
+
+run:
+    vllm serve BAAI/bge-reranker-v2-gemma --hf_overrides '{"architectures": ["GemmaForSequenceClassification"],"classifier_from_token": ["Yes"],"method": "no_post_processing"}' --chat-template examples/pooling/score/template/bge-reranker-v2-gemma.jinja
+    vllm serve tomaarsen/Qwen3-Reranker-0.6B-seq-cls --chat-template examples/pooling/score/template/qwen3_reranker.jinja
+    vllm serve mixedbread-ai/mxbai-rerank-base-v2 --hf_overrides '{"architectures": ["Qwen2ForSequenceClassification"],"classifier_from_token": ["0", "1"], "method": "from_2_way_softmax"}' --chat-template examples/pooling/score/template/mxbai_rerank_v2.jinja
+    vllm serve nvidia/llama-nemotron-rerank-1b-v2 --runner pooling --trust-remote-code --chat-template examples/pooling/score/template/nemotron-rerank.jinja
+    vllm serve Qwen/Qwen3-Reranker-0.6B --runner pooling --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}' --chat-template examples/pooling/score/template/qwen3_reranker.jinja
+"""
+
+import json
+
+import requests
+
+# URL of the vLLM server's rerank endpoint
+# Default vLLM server runs on localhost port 8000
+url = "http://127.0.0.1:8000/rerank"
+
+# HTTP headers for the request
+headers = {"accept": "application/json", "Content-Type": "application/json"}
+
+# Example query & documents
+query = "how much protein should a female eat?"
+documents = [
+    "As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
+    "Definition of summit for English Language Learners. : 1  the highest point of a mountain : the top of a mountain. : 2  the highest level. : 3  a meeting or series of meetings between the leaders of two or more governments.",
+    "Calorie intake should not fall below 1,200 a day in women or 1,500 a day in men, except under the supervision of a health professional.",
+]
+
+# Request payload for the rerank API
+# No chat template is sent here: the `vllm serve` commands in the module
+# docstring pass it to the server with --chat-template.
+# NOTE(review): the model name is hard-coded; presumably it has to match the
+# model the server was started with - adjust when serving another model.
+data = {
+    "model": "nvidia/llama-nemotron-rerank-1b-v2",  # Model to use for reranking
+    "query": query,  # The query to score documents against
+    "documents": documents,  # List of documents to be scored
+}
+
+
+def main():
+    """Send one rerank request to the vLLM server and print the outcome.
+
+    POSTs the module-level ``data`` payload to ``url``. On HTTP 200 the JSON
+    body is pretty-printed; for any other status the status code and the raw
+    response text are printed.
+
+    Raises:
+        requests.exceptions.RequestException: If the server cannot be reached
+            or does not answer within the timeout.
+    """
+    # Send POST request to the vLLM server's rerank endpoint.
+    # The timeout bounds the wait: without one, requests can block
+    # indefinitely on an unresponsive server.
+    response = requests.post(url, headers=headers, json=data, timeout=60)
+
+    # Check if the request was successful
+    if response.status_code == 200:
+        print("Request successful!")
+        # Pretty print the JSON response returned by the rerank endpoint
+        print(json.dumps(response.json(), indent=2))
+    else:
+        # Handle request failure
+        print(f"Request failed with status code: {response.status_code}")
+        print(response.text)
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/pooling/score/vision_rerank_api_online.py
+++ b/examples/pooling/score/vision_rerank_api_online.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# ruff: noqa: E501
+
+"""
+Example Python client for multimodal rerank API which is compatible with
+Jina and Cohere https://jina.ai/reranker
+
+Run `vllm serve <model> --runner pooling` to start up the server in vLLM.
+e.g.
+    vllm serve jinaai/jina-reranker-m0 --runner pooling
+
+    vllm serve Qwen/Qwen3-VL-Reranker-2B \
+        --runner pooling \
+        --max-model-len 4096 \
+        --hf_overrides '{"architectures": ["Qwen3VLForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}' \
+        --chat-template examples/pooling/score/template/qwen3_vl_reranker.jinja
+"""
+
+import argparse
+import json
+
+import requests
+
+# HTTP headers sent with every request made by this client
+headers = {"accept": "application/json", "Content-Type": "application/json"}
+
+# Text query the multimodal document below is ranked against
+query = "A woman playing with her dog on a beach at sunset."
+# A single multimodal document: its "content" list combines a text part and
+# an image part referenced by URL
+documents = {
+    "content": [
+        {
+            "type": "text",
+            "text": (
+                "A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, "  # noqa: E501
+                "as the dog offers its paw in a heartwarming display of companionship and trust."  # noqa: E501
+            ),
+        },
+        {
+            "type": "image_url",
+            "image_url": {
+                "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
+            },
+        },
+    ]
+}
+
+
+def parse_args():
+    """Parse the server location (``--host`` and ``--port``) from the command line."""
+    cli = argparse.ArgumentParser()
+    # (flag, value type, default) for each optional argument
+    options = (
+        ("--host", str, "localhost"),
+        ("--port", int, 8000),
+    )
+    for flag, value_type, fallback in options:
+        cli.add_argument(flag, type=value_type, default=fallback)
+    return cli.parse_args()
+
+
+def main(args):
+    """Rerank the sample multimodal document with the model the server serves.
+
+    The model name is taken from the first entry returned by ``/v1/models``;
+    the module-level ``query`` and ``documents`` are then posted to
+    ``/rerank`` and the outcome is printed.
+
+    Args:
+        args: Parsed command line providing ``host`` and ``port``.
+
+    Raises:
+        requests.exceptions.RequestException: If a request fails at the
+            transport level, times out, or the model lookup returns an HTTP
+            error status.
+        RuntimeError: If the server reports no models.
+    """
+    base_url = f"http://{args.host}:{args.port}"
+    models_url = base_url + "/v1/models"
+    rerank_url = base_url + "/rerank"
+
+    # Timeouts bound the wait: without one, requests can block indefinitely
+    # on an unresponsive server.
+    response = requests.get(models_url, headers=headers, timeout=60)
+    # Surface an HTTP error here instead of an opaque KeyError/JSON error
+    # when the body is indexed below.
+    response.raise_for_status()
+    served_models = response.json()["data"]
+    if not served_models:
+        raise RuntimeError(f"No models are being served at {models_url}")
+    model = served_models[0]["id"]
+
+    data = {
+        "model": model,
+        "query": query,
+        "documents": documents,
+    }
+    response = requests.post(rerank_url, headers=headers, json=data, timeout=60)
+
+    # Check the response
+    if response.status_code == 200:
+        print("Request successful!")
+        print(json.dumps(response.json(), indent=2))
+    else:
+        print(f"Request failed with status code: {response.status_code}")
+        print(response.text)
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
--- a/examples/pooling/score/vision_reranker_offline.py
+++ b/examples/pooling/score/vision_reranker_offline.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+This example shows how to use vLLM for running offline inference with
+vision language reranker models for multimodal scoring tasks.
+
+Vision language rerankers score the relevance between a text query and
+multimodal documents (text + images/videos).
+"""
+
+from argparse import Namespace
+from collections.abc import Callable
+from dataclasses import asdict
+from pathlib import Path
+from typing import NamedTuple
+
+from vllm import LLM, EngineArgs
+from vllm.entrypoints.score_utils import ScoreMultiModalParam
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+
+# Directory next to this script that holds the chat-template files
+TEMPLATE_HOME = Path(__file__).parent / "template"
+
+
+class RerankModelData(NamedTuple):
+    """Engine configuration plus optional chat template for one reranker example."""
+
+    # Arguments used to construct the ``LLM`` instance in ``main``
+    engine_args: EngineArgs
+    # Chat template text forwarded to ``LLM.score``; ``None`` when the example
+    # supplies no template
+    chat_template: str | None = None
+
+
+def run_jinavl_reranker(modality: str) -> RerankModelData:
+    """Build the engine arguments for the jinaai/jina-reranker-m0 example.
+
+    Args:
+        modality: Multimodal input kind; this example accepts only ``"image"``.
+
+    Returns:
+        A ``RerankModelData`` carrying the engine arguments and no chat template.
+
+    Raises:
+        ValueError: If ``modality`` is anything other than ``"image"``.
+    """
+    # Validate with an exception rather than ``assert``: assertions are
+    # stripped under ``python -O`` and carry no message for the user.
+    if modality != "image":
+        raise ValueError(
+            f"jinavl_reranker example only supports modality 'image', got: {modality}"
+        )
+
+    engine_args = EngineArgs(
+        model="jinaai/jina-reranker-m0",
+        runner="pooling",
+        max_model_len=32768,
+        trust_remote_code=True,
+        mm_processor_kwargs={
+            "min_pixels": 3136,
+            "max_pixels": 602112,
+        },
+        # One multimodal item of the requested kind per prompt
+        limit_mm_per_prompt={modality: 1},
+    )
+    return RerankModelData(
+        engine_args=engine_args,
+    )
+
+
+def run_qwen3_vl_reranker(modality: str) -> RerankModelData:
+    """Build engine arguments and load the chat template for Qwen3-VL-Reranker-2B.
+
+    Args:
+        modality: Multimodal input kind, used as the key of
+            ``limit_mm_per_prompt`` (one item per prompt).
+
+    Returns:
+        A ``RerankModelData`` carrying the engine arguments and the text of
+        ``qwen3_vl_reranker.jinja`` read from ``TEMPLATE_HOME``.
+    """
+    engine_args = EngineArgs(
+        model="Qwen/Qwen3-VL-Reranker-2B",
+        runner="pooling",
+        max_model_len=16384,
+        limit_mm_per_prompt={modality: 1},
+        # HuggingFace model configuration overrides required for compatibility
+        hf_overrides={
+            # Manually route to sequence classification architecture
+            # This tells vLLM to use Qwen3VLForSequenceClassification instead of
+            # the default Qwen3VLForConditionalGeneration
+            "architectures": ["Qwen3VLForSequenceClassification"],
+            # Specify which token logits to extract from the language model head
+            # The original reranker uses "no" and "yes" token logits for scoring
+            "classifier_from_token": ["no", "yes"],
+            # Enable special handling for original Qwen3-Reranker models
+            # This flag triggers conversion logic that transforms the two token
+            # vectors into a single classification vector
+            "is_original_qwen3_reranker": True,
+        },
+    )
+    chat_template_path = "qwen3_vl_reranker.jinja"
+    # Decode explicitly as UTF-8 so the template text does not depend on the
+    # platform's default locale encoding.
+    chat_template = (TEMPLATE_HOME / chat_template_path).read_text(encoding="utf-8")
+    return RerankModelData(
+        engine_args=engine_args,
+        chat_template=chat_template,
+    )
+
+
+# Maps each ``--model-name`` choice to the function that builds its
+# ``RerankModelData``; every function takes the modality string.
+model_example_map: dict[str, Callable[[str], RerankModelData]] = {
+    "jinavl_reranker": run_jinavl_reranker,
+    "qwen3_vl_reranker": run_qwen3_vl_reranker,
+}
+
+
+def parse_args():
+    """Parse ``--model-name``/``-m`` and ``--modality`` from the command line.
+
+    The model name must be a key of ``model_example_map`` and defaults to
+    ``jinavl_reranker``; the modality is ``image`` (default) or ``video``.
+    """
+    cli = FlexibleArgumentParser(
+        description=(
+            "Demo on using vLLM for offline inference with "
+            "vision language reranker models for multimodal scoring tasks."
+        )
+    )
+
+    # Keyword settings for each option, kept apart from the flag names
+    model_option = {
+        "type": str,
+        "default": "jinavl_reranker",
+        "choices": model_example_map.keys(),
+        "help": "The name of the reranker model.",
+    }
+    modality_option = {
+        "type": str,
+        "default": "image",
+        "choices": ["image", "video"],
+        "help": "Modality of the multimodal input (image or video).",
+    }
+
+    cli.add_argument("--model-name", "-m", **model_option)
+    cli.add_argument("--modality", **modality_option)
+    return cli.parse_args()
+
+
+def get_multi_modal_input(modality: str) -> tuple[str, ScoreMultiModalParam]:
+    """Return the sample query and multimodal document for ``modality``.
+
+    The document's ``content`` list always holds a text part followed by one
+    media part (an image URL or a video URL).
+
+    Raises:
+        ValueError: If ``modality`` is neither ``"image"`` nor ``"video"``.
+    """
+    # Reject unknown modalities up front
+    if modality not in ("image", "video"):
+        raise ValueError(f"Unsupported modality: {modality}")
+
+    if modality == "image":
+        query = "A woman playing with her dog on a beach at sunset."
+        text_part = {
+            "type": "text",
+            "text": (
+                "A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, "  # noqa: E501
+                "as the dog offers its paw in a heartwarming display of companionship and trust."  # noqa: E501
+            ),
+        }
+        media_part = {
+            "type": "image_url",
+            "image_url": {
+                "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
+            },
+        }
+    else:
+        query = "A girl is drawing pictures on an ipad."
+        text_part = {
+            "type": "text",
+            "text": "A girl is drawing a guitar on her ipad with Apple Pencil.",
+        }
+        media_part = {
+            "type": "video_url",
+            "video_url": {
+                "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4"
+            },
+        }
+
+    # Text first, media second - the same order for both modalities
+    documents: ScoreMultiModalParam = {"content": [text_part, media_part]}
+    return query, documents
+
+
+def main(args: Namespace):
+    """Run the selected reranker on the sample input and print its scores.
+
+    ``args`` provides ``model_name`` (a key of ``model_example_map``) and
+    ``modality``.
+    """
+    modality = args.modality
+
+    # Resolve the example builder, then create the engine from its arguments
+    build_example = model_example_map[args.model_name]
+    example = build_example(modality)
+    engine_args = example.engine_args
+    llm = LLM(**asdict(engine_args))
+
+    query, documents = get_multi_modal_input(modality)
+    results = llm.score(query, documents, chat_template=example.chat_template)
+    scores = [result.outputs.score for result in results]
+
+    divider = "-" * 50
+    print(divider)
+    print(f"Model: {engine_args.model}")
+    print(f"Modality: {modality}")
+    print(f"Query: {query}")
+    print("Relevance scores:", scores)
+    print(divider)
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
--- a/examples/pooling/score/openai_cross_encoder_score_for_multimodal.py
+++ b/examples/pooling/score/openai_cross_encoder_score_for_multimodal.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# ruff: noqa: E501
+
 """
 Example online usage of Score API.

 Run `vllm serve <model> --runner pooling` to start up the server in vLLM.
+e.g.
+    vllm serve jinaai/jina-reranker-m0 --runner pooling
+
+    vllm serve Qwen/Qwen3-VL-Reranker-2B \
+        --runner pooling \
+        --max-model-len 4096 \
+        --hf_overrides '{"architectures": ["Qwen3VLForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}' \
+        --chat-template examples/pooling/score/template/qwen3_vl_reranker.jinja
 """

 import argparse
+import json
 import pprint

 import requests

+headers = {"accept": "application/json", "Content-Type": "application/json"}

-def post_http_request(prompt: dict, api_url: str) -> requests.Response:
-    headers = {"User-Agent": "Test Client"}
-    response = requests.post(api_url, headers=headers, json=prompt)
-    return response
+text_1 = "slm markdown"
+text_2 = {
+    "content": [
+        {
+            "type": "image_url",
+            "image_url": {
+                "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
+            },
+        },
+        {
+            "type": "image_url",
+            "image_url": {
+                "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
+            },
+        },
+    ]
+}


 def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=8000)
-    parser.add_argument("--model", type=str, default="jinaai/jina-reranker-m0")
    return parser.parse_args()


 def main(args):
-    api_url = f"http://{args.host}:{args.port}/score"
-    model_name = args.model
-
-    text_1 = "slm markdown"
-    text_2 = {
-        "content": [
-            {
-                "type": "image_url",
-                "image_url": {
-                    "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
-                },
-            },
-            {
-                "type": "image_url",
-                "image_url": {
-                    "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
-                },
-            },
-        ]
-    }
-    prompt = {"model": model_name, "text_1": text_1, "text_2": text_2}
-    score_response = post_http_request(prompt=prompt, api_url=api_url)
+    base_url = f"http://{args.host}:{args.port}"
+    models_url = base_url + "/v1/models"
+    score_url = base_url + "/score"
+
+    response = requests.get(models_url, headers=headers)
+    model = response.json()["data"][0]["id"]
+
+    prompt = {"model": model, "text_1": text_1, "text_2": text_2}
+    response = requests.post(score_url, headers=headers, json=prompt)
    print("\nPrompt when text_1 is string and text_2 is a image list:")
    pprint.pprint(prompt)
    print("\nScore Response:")
-    pprint.pprint(score_response.json())
+    print(json.dumps(response.json(), indent=2))


 if __name__ == "__main__":

--- a/examples/tool_chat_template_functiongemma.jinja
+++ b/examples/tool_chat_template_functiongemma.jinja
+{#- Tool-calling chat template (FunctionGemma example).
+    Rendering rules, per message role:
+      developer / system : opens a "user" turn with the message content and, when
+                           a non-empty `tools` list is supplied, appends an
+                           "Available functions:" listing (name, description,
+                           JSON parameters) for every tool of type "function".
+      user               : a "user" turn with the message content.
+      assistant          : a "model" turn; tool calls are written as
+                           <start_function_call>call:NAME{key:<escape>value<escape>,...}<end_function_call>,
+                           parsing `arguments` from JSON first when it is a string.
+      tool               : a "user" turn prefixed "Function result for NAME: ".
+    A trailing "<start_of_turn>model" is emitted when add_generation_prompt is set.
+    NOTE(review): the tool listing is only rendered inside a developer/system
+    message, so tools appear to be omitted when no such message is present - confirm.
+    NOTE(review): ns.developer_content is initialised but not read in this template. -#}
+{%- set ns = namespace(developer_content='', has_tools=false) -%}
+
+{%- if tools is defined and tools | length > 0 -%}
+    {%- set ns.has_tools = true -%}
+{%- endif -%}
+
+{%- for message in messages -%}
+    {%- if message.role == 'developer' or message.role == 'system' -%}
+<start_of_turn>user
+{{ message.content }}
+{%- if ns.has_tools %}
+
+Available functions:
+{%- for tool in tools %}
+{%- if tool.type == 'function' %}
+
+Function: {{ tool.function.name }}
+Description: {{ tool.function.description | default('No description provided') }}
+Parameters: {{ tool.function.parameters | tojson }}
+{%- endif %}
+{%- endfor %}
+{%- endif %}
+<end_of_turn>
+    {%- elif message.role == 'user' -%}
+<start_of_turn>user
+{{ message.content }}<end_of_turn>
+    {%- elif message.role == 'assistant' -%}
+        {%- if message.tool_calls is defined and message.tool_calls | length > 0 -%}
+<start_of_turn>model
+{%- for tool_call in message.tool_calls %}
+<start_function_call>call:{{ tool_call.function.name }}{
+{%- set args = tool_call.function.arguments -%}
+{%- if args is string -%}
+{%- set args = args | fromjson -%}
+{%- endif -%}
+{%- for key, value in args.items() -%}
+{{ key }}:<escape>{{ value }}<escape>{% if not loop.last %},{% endif %}
+{%- endfor -%}
+}<end_function_call>
+{%- endfor %}
+<end_of_turn>
+        {%- else -%}
+<start_of_turn>model
+{{ message.content }}<end_of_turn>
+        {%- endif -%}
+    {%- elif message.role == 'tool' -%}
+<start_of_turn>user
+Function result for {{ message.name | default('function') }}: {{ message.content }}<end_of_turn>
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt -%}
+<start_of_turn>model
+{%- endif -%}
--- a/examples/tool_chat_template_glm4.jinja
+++ b/examples/tool_chat_template_glm4.jinja
+{#- Tool-calling chat template (GLM-4 example).
+    Flow: a leading system message, if any, is split off from `messages`; when
+    `tools` is supplied, the tool-call format instructions and each tool as
+    indented JSON are emitted; user messages are then rendered as
+    "[Round N]" question lines and assistant messages as answer lines, with an
+    answer prefix appended when add_generation_prompt is set and the last
+    remaining message is not from the assistant.
+    NOTE(review): system_message is assigned below but not referenced again in
+    this template, so the system text does not appear to reach the prompt - confirm.
+    NOTE(review): only "user" and "assistant" roles are rendered in the loop;
+    messages with other roles (e.g. "tool") produce no output - confirm intent.
+    NOTE(review): if the conversation holds only a system message, `messages` is
+    empty after the slice and messages[-1]['role'] is evaluated on an empty
+    list when add_generation_prompt is set - verify this case. -#}
+{%- set counter = namespace(index=0) -%}
+{%- if not tools is defined %}
+    {%- set tools = none %}
+{%- endif %}
+
+{%- if messages and messages[0]['role'] == 'system' %}
+    {%- set system_message = messages[0]['content']|trim %}
+    {%- set messages = messages[1:] %}
+{%- else %}
+    {%- set system_message = "You are a helpful assistant." %}
+{%- endif %}
+
+{%- if tools is not none %}
+    {%- set tool_instruction %}
+You have access to the following tools. When you need to call a tool, you MUST use the following format:
+
+<tool_call>function_name
+<arg_key>parameter_name</arg_key>
+<arg_value>parameter_value</arg_value>
+</tool_call>
+
+Important rules:
+- Always wrap tool calls with <tool_call>...</tool_call> tags
+- Put the function name on the first line after <tool_call>
+- Use <arg_key> and <arg_value> tags for each parameter
+- If a parameter value is a string, keep it as-is. If it's a number or boolean, convert it appropriately
+- You can make multiple tool calls if needed
+- If no tool is suitable, respond with regular text
+
+Available tools:
+{% endset %}
+    {{- tool_instruction + "\n\n" }}
+    {%- for t in tools %}
+        {{- t | tojson(indent=4) }}
+        {{- "\n\n" }}
+    {%- endfor %}
+{%- endif %}
+
+{%- for message in messages -%}
+    {%- if message['role'] == 'user' -%}
+        {{- '[Round ' + counter.index|string + ']\n问：' + message['content'] -}}
+        {%- set counter.index = counter.index + 1 -%}
+    {%- endif -%}
+    {%- if message['role'] == 'assistant' -%}
+        {{- '\n答：' + message['content'] -}}
+        {%- if (loop.last and add_generation_prompt) or not loop.last -%}
+            {{- '\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%}
+    {{- '\n答：' -}}
+{%- endif -%}
--- a/mkdocs.yaml
+++ b/mkdocs.yaml
@@ -80,6 +80,7 @@ plugins:
        - "re:vllm\\._.*"  # Internal modules
        - "vllm.third_party"
        - "vllm.vllm_flash_attn"
+        - "re:vllm\\.grpc\\..*_pb2.*"  # Auto-generated protobuf files
        - !ENV [API_AUTONAV_EXCLUDE, "re:^$"]  # Match nothing by default
  - mkdocstrings:
      handlers:
@@ -87,7 +88,8 @@ plugins:
          options:
            show_symbol_type_heading: true
            show_symbol_type_toc: true
-            filters: []
+            filters:
+              - "!.*_pb2_grpc"  # Exclude auto-generated gRPC stubs
            summary:
              modules: true
            show_if_no_docstring: true

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,9 +6,10 @@ requires = [
    "packaging>=24.2",
    "setuptools>=77.0.3,<81.0.0",
    "setuptools-scm>=8.0",
-    "torch >= 2.7.1",
+    "torch == 2.9.0",
    "wheel",
    "jinja2",
+    "grpcio-tools>=1.76.0",
 ]
 build-backend = "setuptools.build_meta"

@@ -55,6 +56,10 @@ include = ["vllm*"]
 "vllm/third_party/**" = ["ALL"]
 "vllm/version.py" = ["F401"]
 "vllm/_version.py" = ["ALL"]
+# Exclude generated protobuf files
+"vllm/grpc/*_pb2.py" = ["ALL"]
+"vllm/grpc/*_pb2_grpc.py" = ["ALL"]
+"vllm/grpc/*_pb2.pyi" = ["ALL"]

 [tool.ruff.lint]
 select = [
@@ -120,7 +125,7 @@ python = "./.venv"
 # these files may be written in non english words
 extend-exclude = ["tests/models/fixtures/*", "tests/prompts/*",
    "benchmarks/sonnet.txt", "tests/lora/data/*", "build/*",
-    "vllm/third_party/*"]
+    "vllm/third_party/*", "vllm/entrypoints/serve/instrumentator/static/*"]
 ignore-hidden = true
 ignore-files = true
 ignore-dot = true
@@ -162,6 +167,7 @@ depthwise_seperable_CNN = "depthwise_seperable_CNN"
 [tool.typos.default.extend-words]
 iy = "iy"
 tendencias = "tendencias"
+indx = "indx"
 # intel cpu features
 tme = "tme"
 dout = "dout"
@@ -302,4 +308,4 @@ windo = "windo"
 [tool.typos.type.vimscript.extend-words]

 [tool.uv]
-no-build-isolation-package = ["torch"]
+no-build-isolation-package = ["torch"]
\ No newline at end of file
--- a/requirements/build.txt
+++ b/requirements/build.txt
@@ -4,8 +4,10 @@ ninja
 packaging>=24.2
 setuptools>=77.0.3,<81.0.0
 setuptools-scm>=8
-torch==2.9.0
+torch==2.9.1
 wheel
 jinja2>=3.1.6
 regex
 build
+protobuf>=6.33.2
+grpcio-tools>=1.76.0
--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -9,7 +9,7 @@ blake3
 py-cpuinfo
 transformers >= 4.56.0, < 5
 tokenizers >= 0.21.1  # Required for fast incremental detokenization.
-protobuf # Required by LlamaTokenizer.
+protobuf >= 6.30.0 # Required by LlamaTokenizer, gRPC.
 fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
 aiohttp
 openai >= 1.99.1  # For Responses API with reasoning content
@@ -24,25 +24,24 @@ outlines_core == 0.2.11
 # required for outlines backend disk cache
 diskcache == 5.6.3
 lark == 1.2.2
-xgrammar == 0.1.27; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" or platform_machine == "s390x" or platform_machine == "ppc64le"
+xgrammar == 0.1.29; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" or platform_machine == "s390x" or platform_machine == "ppc64le"
 typing_extensions >= 4.10
 filelock >= 3.16.1 # needs to include https://github.com/tox-dev/filelock/pull/317
 partial-json-parser # used for parsing partial JSON outputs
 pyzmq >= 25.0.0
 msgspec
 gguf >= 0.17.0
-mistral_common[image] >= 1.8.5
+mistral_common[image] >= 1.8.8
 opencv-python-headless >= 4.11.0    # required for video IO
 pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
 setuptools>=77.0.3,<81.0.0; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
 einops # Required for Qwen2-VL.
-compressed-tensors == 0.12.2 # required for compressed-tensors
+compressed-tensors == 0.13.0 # required for compressed-tensors
 depyf==0.20.0 # required for profiling and debugging with compilation config
 cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
 watchfiles # required for http server to monitor the updates of TLS files
 python-json-logger # Used by logging as per examples/others/logging_configuration.md
-scipy # Required for phi-4-multimodal-instruct
 ninja # Required for xgrammar, rocm, tpu, xpu
 pybase64 # fast base64 implementation
 cbor2 # Required for cross-language serialization of hashable objects
@@ -50,5 +49,7 @@ ijson # Required for mistral streaming tool parser
 setproctitle # Used to set process names for better debugging and monitoring
 openai-harmony >= 0.0.3  # Required for gpt-oss
 anthropic == 0.71.0
-model-hosting-container-standards >= 0.1.9, < 1.0.0
-mcp
\ No newline at end of file
+model-hosting-container-standards >= 0.1.10, < 1.0.0
+mcp
+grpcio>=1.76.0
+grpcio-reflection>=1.76.0
\ No newline at end of file
--- a/requirements/cpu-build.txt
+++ b/requirements/cpu-build.txt
 cmake>=3.26.1
 ninja
 packaging>=24.2
-setuptools>=77.0.3,<81.0.0
+setuptools==77.0.3 # this version can reuse CMake build dir
 setuptools-scm>=8
 torch==2.9.1+cpu; platform_machine == "x86_64" or platform_machine == "s390x"
 torch==2.9.1; platform_system == "Darwin" or platform_machine == "ppc64le" or platform_machine == "aarch64"

--- a/requirements/cpu.txt
+++ b/requirements/cpu.txt
 # Common dependencies
 -r common.txt

+setuptools==77.0.3 # this version can reuse CMake build dir
+
 numba == 0.61.2; platform_machine != "s390x" # Required for N-gram speculative decoding

 # Dependencies for CPUs

--- a/requirements/cuda.txt
+++ b/requirements/cuda.txt
@@ -5,9 +5,9 @@ numba == 0.61.2 # Required for N-gram speculative decoding

 # Dependencies for NVIDIA GPUs
 ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
-torch==2.9.0
-torchaudio==2.9.0
+torch==2.9.1
+torchaudio==2.9.1
 # These must be updated alongside torch
-torchvision==0.24.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
+torchvision==0.24.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
 # FlashInfer should be updated together with the Dockerfile
 flashinfer-python==0.5.3
--- a/requirements/kv_connectors_rocm.txt
+++ b/requirements/kv_connectors_rocm.txt
+tblib
+lm_eval[api]
\ No newline at end of file
--- a/requirements/nightly_torch_test.txt
+++ b/requirements/nightly_torch_test.txt
@@ -17,17 +17,17 @@ vocos # required for minicpmo_26 test
 peft
 pqdm
 ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline parallelism tests
-sentence-transformers # required for embedding tests
+sentence-transformers>=5.2.0 # required for embedding tests
 soundfile # required for audio tests
 jiwer # required for audio tests
 timm # required for internvl test
 transformers_stream_generator # required for qwen-vl test
 matplotlib # required for qwen-vl test
-mistral_common[image,audio] >= 1.8.5 # required for voxtral test
+mistral_common[image,audio] >= 1.8.8 # required for voxtral test
 num2words # required for smolvlm test
 opencv-python-headless >= 4.11.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
-lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
+lm-eval[api]>=0.4.9.2 # required for model evaluation test
 mteb>=1.38.11, <2 # required for mteb test
 transformers==4.57.3
 tokenizers==0.22.0

--- a/requirements/rocm-build.txt
+++ b/requirements/rocm-build.txt
@@ -2,11 +2,11 @@
 -r common.txt

 --extra-index-url https://download.pytorch.org/whl/rocm6.4
-torch==2.9.0
-torchvision==0.24.0
-torchaudio==2.9.0
+torch==2.9.1
+torchvision==0.24.1
+torchaudio==2.9.1

-triton==3.5.0
+triton==3.5.1
 cmake>=3.26.1,<4
 packaging>=24.2
 setuptools>=77.0.3,<80.0.0