Merge tag 'v0.11.2' into v0.11.2-ori

006693ed · zhuwenwen · 4b51e6f1 · 275de341 · 006693ed · 006693ed
Commit 006693ed authored Dec 01, 2025 by zhuwenwen
20 changed files
--- a/examples/online_serving/pooling/embedding_requests_base64_client.py
+++ b/examples/online_serving/pooling/embedding_requests_base64_client.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Example Python client for embedding API using vLLM API server
+NOTE:
+    start a supported embeddings model server with `vllm serve`, e.g.
+    vllm serve intfloat/e5-small
+"""
+import argparse
+import base64
+import requests
+import torch
+from vllm.utils.serial_utils import (
+    EMBED_DTYPE_TO_TORCH_DTYPE,
+    ENDIANNESS,
+    binary2tensor,
+)
+def post_http_request(prompt: dict, api_url: str) -> requests.Response:
+    headers = {"User-Agent": "Test Client"}
+    response = requests.post(api_url, headers=headers, json=prompt)
+    return response
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", type=str, default="localhost")
+    parser.add_argument("--port", type=int, default=8000)
+    parser.add_argument("--model", type=str, default="intfloat/e5-small")
+    return parser.parse_args()
+def main(args):
+    api_url = f"http://{args.host}:{args.port}/v1/embeddings"
+    model_name = args.model
+    # The OpenAI client does not support the embed_dtype and endianness parameters.
+    for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
+        for endianness in ENDIANNESS:
+            prompt = {
+                "model": model_name,
+                "input": "vLLM is great!",
+                "encoding_format": "base64",
+                "embed_dtype": embed_dtype,
+                "endianness": endianness,
+            }
+            response = post_http_request(prompt=prompt, api_url=api_url)
+            embedding = []
+            for data in response.json()["data"]:
+                binary = base64.b64decode(data["embedding"])
+                tensor = binary2tensor(binary, (-1,), embed_dtype, endianness)
+                embedding.append(tensor.to(torch.float32))
+            embedding = torch.cat(embedding)
+            print(embed_dtype, endianness, embedding.shape)
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
--- a/examples/online_serving/pooling/embedding_requests_bytes_client.py
+++ b/examples/online_serving/pooling/embedding_requests_bytes_client.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Example Python client for embedding API using vLLM API server
+NOTE:
+    start a supported embeddings model server with `vllm serve`, e.g.
+    vllm serve intfloat/e5-small
+"""
+import argparse
+import json
+import requests
+import torch
+from vllm.utils.serial_utils import (
+    EMBED_DTYPE_TO_TORCH_DTYPE,
+    ENDIANNESS,
+    MetadataItem,
+    decode_pooling_output,
+)
+def post_http_request(prompt: dict, api_url: str) -> requests.Response:
+    headers = {"User-Agent": "Test Client"}
+    response = requests.post(api_url, headers=headers, json=prompt)
+    return response
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", type=str, default="localhost")
+    parser.add_argument("--port", type=int, default=8000)
+    parser.add_argument("--model", type=str, default="intfloat/e5-small")
+    return parser.parse_args()
+def main(args):
+    api_url = f"http://{args.host}:{args.port}/v1/embeddings"
+    model_name = args.model
+    # The OpenAI client does not support the bytes encoding_format.
+    # The OpenAI client does not support the embed_dtype and endianness parameters.
+    for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
+        for endianness in ENDIANNESS:
+            prompt = {
+                "model": model_name,
+                "input": "vLLM is great!",
+                "encoding_format": "bytes",
+                "embed_dtype": embed_dtype,
+                "endianness": endianness,
+            }
+            response = post_http_request(prompt=prompt, api_url=api_url)
+            metadata = json.loads(response.headers["metadata"])
+            body = response.content
+            items = [MetadataItem(**x) for x in metadata["data"]]
+            embedding = decode_pooling_output(items=items, body=body)
+            embedding = [x.to(torch.float32) for x in embedding]
+            embedding = torch.cat(embedding)
+            print(embed_dtype, endianness, embedding.shape)
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
--- a/examples/online_serving/pooling/multi_vector_retrieval_client.py
+++ b/examples/online_serving/pooling/multi_vector_retrieval_client.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Example online usage of Pooling API for multi vector retrieval.
+Run `vllm serve <model> --runner pooling`
+to start up the server in vLLM. e.g.
+vllm serve BAAI/bge-m3
+"""
+import argparse
+import requests
+import torch
+def post_http_request(prompt: dict, api_url: str) -> requests.Response:
+    headers = {"User-Agent": "Test Client"}
+    response = requests.post(api_url, headers=headers, json=prompt)
+    return response
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", type=str, default="localhost")
+    parser.add_argument("--port", type=int, default=8000)
+    parser.add_argument("--model", type=str, default="BAAI/bge-m3")
+    return parser.parse_args()
+def main(args):
+    api_url = f"http://{args.host}:{args.port}/pooling"
+    model_name = args.model
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    prompt = {"model": model_name, "input": prompts}
+    pooling_response = post_http_request(prompt=prompt, api_url=api_url)
+    for output in pooling_response.json()["data"]:
+        multi_vector = torch.tensor(output["data"])
+        print(multi_vector.shape)
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
--- a/examples/online_serving/pooling/ner.py
+++ b/examples/online_serving/pooling/ner.py
--- a/examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py
+++ b/examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # ruff: noqa: E501
-"""Example Python client for multimodal embedding API using vLLM API server
+"""Example Python client for multimodal embedding API using vLLM API server.
-NOTE:
-    start a supported multimodal embeddings model server with `vllm serve`, e.g.
+Refer to each `run_*` function for the command to run the server for that model.
-    vllm serve TIGER-Lab/VLM2Vec-Full --runner pooling --trust_remote_code --max_model_len=1024
 """
 import argparse
 import base64
 import io
+from typing import Literal
-import requests
+from openai import OpenAI
+from openai._types import NOT_GIVEN, NotGiven
+from openai.types.chat import ChatCompletionMessageParam
+from openai.types.create_embedding_response import CreateEmbeddingResponse
 from PIL import Image
+# Modify OpenAI's API key and API base to use vLLM's API server.
+openai_api_key = "EMPTY"
+openai_api_base = "http://localhost:8000/v1"
 image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
-def vlm2vec():
+def create_chat_embeddings(
-    response = requests.post(
+    client: OpenAI,
-        "http://localhost:8000/v1/embeddings",
+    *,
-        json={
+    messages: list[ChatCompletionMessageParam],
-            "model": "TIGER-Lab/VLM2Vec-Full",
+    model: str,
-            "messages": [
+    encoding_format: Literal["base64", "float"] | NotGiven = NOT_GIVEN,
-                {
+) -> CreateEmbeddingResponse:
-                    "role": "user",
+    """
-                    "content": [
+    Convenience function for accessing vLLM's Chat Embeddings API,
-                        {"type": "image_url", "image_url": {"url": image_url}},
+    which is an extension of OpenAI's existing Embeddings API.
-                        {"type": "text", "text": "Represent the given image."},
+    """
-                    ],
+    return client.post(
-                }
+        "/embeddings",
-            ],
+        cast_to=CreateEmbeddingResponse,
-            "encoding_format": "float",
+        body={"messages": messages, "model": model, "encoding_format": encoding_format},
-        },
    )
-    response.raise_for_status()
-    response_json = response.json()
-    print("Embedding output:", response_json["data"][0]["embedding"])
+def run_clip(client: OpenAI, model: str):
+    """
+    Start the server using:
+    vllm serve openai/clip-vit-base-patch32 \
+        --runner pooling
+    """
+    response = create_chat_embeddings(
+        client,
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": {"url": image_url}},
+                ],
+            }
+        ],
+        model=model,
+        encoding_format="float",
+    )
+    print("Image embedding output:", response.data[0].embedding)
+    response = create_chat_embeddings(
+        client,
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "a photo of a cat"},
+                ],
+            }
+        ],
+        model=model,
+        encoding_format="float",
+    )
+    print("Text embedding output:", response.data[0].embedding)
+def run_dse_qwen2_vl(client: OpenAI, model: str):
+    """
+    Start the server using:
-def dse_qwen2_vl(inp: dict):
+    vllm serve MrLight/dse-qwen2-2b-mrl-v1 \
-    # Embedding an Image
+        --runner pooling \
-    if inp["type"] == "image":
+        --trust-remote-code \
-        messages = [
+        --max-model-len 8192 \
+        --chat-template examples/template_dse_qwen2_vl.jinja
+    """
+    response = create_chat_embeddings(
+        client,
+        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
-                            "url": inp["image_url"],
+                            "url": image_url,
                        },
                    },
                    {"type": "text", "text": "What is shown in this image?"},
                ],
            }
-        ]
+        ],
-    # Embedding a Text Query
+        model=model,
-    else:
+        encoding_format="float",
-        # MrLight/dse-qwen2-2b-mrl-v1 requires a placeholder image
+    )
-        # of the minimum input size
-        buffer = io.BytesIO()
+    print("Image embedding output:", response.data[0].embedding)
-        image_placeholder = Image.new("RGB", (56, 56))
-        image_placeholder.save(buffer, "png")
+    # MrLight/dse-qwen2-2b-mrl-v1 requires a placeholder image
-        buffer.seek(0)
+    # of the minimum input size
-        image_placeholder = base64.b64encode(buffer.read()).decode("utf-8")
+    buffer = io.BytesIO()
-        messages = [
+    image_placeholder = Image.new("RGB", (56, 56))
+    image_placeholder.save(buffer, "png")
+    buffer.seek(0)
+    image_placeholder = base64.b64encode(buffer.read()).decode("utf-8")
+    response = create_chat_embeddings(
+        client,
+        messages=[
            {
                "role": "user",
                "content": [
@@ -76,23 +134,129 @@ def dse_qwen2_vl(inp: dict):
                            "url": f"data:image/jpeg;base64,{image_placeholder}",
                        },
                    },
-                    {"type": "text", "text": f"Query: {inp['content']}"},
+                    {"type": "text", "text": "Query: What is the weather like today?"},
+                ],
+            }
+        ],
+        model=model,
+        encoding_format="float",
+    )
+    print("Text embedding output:", response.data[0].embedding)
+def run_siglip(client: OpenAI, model: str):
+    """
+    Start the server using:
+    vllm serve google/siglip-base-patch16-224 \
+        --runner pooling
+    """
+    response = create_chat_embeddings(
+        client,
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": {"url": image_url}},
                ],
            }
-        ]
+        ],
+        model=model,
-    response = requests.post(
+        encoding_format="float",
-        "http://localhost:8000/v1/embeddings",
-        json={
-            "model": "MrLight/dse-qwen2-2b-mrl-v1",
-            "messages": messages,
-            "encoding_format": "float",
-        },
    )
-    response.raise_for_status()
-    response_json = response.json()
-    print("Embedding output:", response_json["data"][0]["embedding"])
+    print("Image embedding output:", response.data[0].embedding)
+    response = create_chat_embeddings(
+        client,
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "a photo of a cat"},
+                ],
+            }
+        ],
+        model=model,
+        encoding_format="float",
+    )
+    print("Text embedding output:", response.data[0].embedding)
+def run_vlm2vec(client: OpenAI, model: str):
+    """
+    Start the server using:
+    vllm serve TIGER-Lab/VLM2Vec-Full \
+        --runner pooling \
+        --trust-remote-code \
+        --max-model-len 4096 \
+        --chat-template examples/template_vlm2vec_phi3v.jinja
+    """
+    response = create_chat_embeddings(
+        client,
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": {"url": image_url}},
+                    {"type": "text", "text": "Represent the given image."},
+                ],
+            }
+        ],
+        model=model,
+        encoding_format="float",
+    )
+    print("Image embedding output:", response.data[0].embedding)
+    response = create_chat_embeddings(
+        client,
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": {"url": image_url}},
+                    {
+                        "type": "text",
+                        "text": "Represent the given image with the following question: What is in the image.",
+                    },
+                ],
+            }
+        ],
+        model=model,
+        encoding_format="float",
+    )
+    print("Image+Text embedding output:", response.data[0].embedding)
+    response = create_chat_embeddings(
+        client,
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "A cat and a dog"},
+                ],
+            }
+        ],
+        model=model,
+        encoding_format="float",
+    )
+    print("Text embedding output:", response.data[0].embedding)
+model_example_map = {
+    "clip": run_clip,
+    "dse_qwen2_vl": run_dse_qwen2_vl,
+    "siglip": run_siglip,
+    "vlm2vec": run_vlm2vec,
+}
 def parse_args():
@@ -103,29 +267,24 @@ def parse_args():
    parser.add_argument(
        "--model",
        type=str,
-        choices=["vlm2vec", "dse_qwen2_vl"],
+        choices=model_example_map.keys(),
        required=True,
-        help="Which model to call.",
+        help="The name of the embedding model.",
    )
    return parser.parse_args()
 def main(args):
-    if args.model == "vlm2vec":
+    client = OpenAI(
-        vlm2vec()
+        # If api_key is omitted, the client falls back to os.environ.get("OPENAI_API_KEY")
-    elif args.model == "dse_qwen2_vl":
+        api_key=openai_api_key,
-        dse_qwen2_vl(
+        base_url=openai_api_base,
-            {
+    )
-                "type": "image",
-                "image_url": image_url,
+    models = client.models.list()
-            }
+    model_id = models.data[0].id
-        )
-        dse_qwen2_vl(
+    model_example_map[args.model](client, model_id)
-            {
-                "type": "text",
-                "content": "What is the weather like today?",
-            }
-        )
 if __name__ == "__main__":

--- a/examples/online_serving/openai_cross_encoder_score.py
+++ b/examples/online_serving/openai_cross_encoder_score.py
--- a/examples/online_serving/openai_cross_encoder_score_for_multimodal.py
+++ b/examples/online_serving/openai_cross_encoder_score_for_multimodal.py
--- a/examples/online_serving/prithvi_geospatial_mae.py
+++ b/examples/online_serving/prithvi_geospatial_mae.py
@@ -11,14 +11,15 @@ import requests
 # image as input, process it using the multimodal data processor, and
 # perform inference.
 # Requirements :
-# - install plugin at:
+# - install TerraTorch v1.1 (or later):
-#   https://github.com/christian-pinto/prithvi_io_processor_plugin
+#   pip install "terratorch>=1.1"
 # - start vllm in serving mode with the below args
 #   --model='christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM'
 #   --model-impl terratorch
 #   --task embed --trust-remote-code
 #   --skip-tokenizer-init --enforce-eager
-#   --io-processor-plugin prithvi_to_tiff
+#   --io-processor-plugin terratorch_segmentation
+#   --enable-mm-embeds
 def main():
@@ -34,7 +35,6 @@ def main():
        },
        "priority": 0,
        "model": "christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM",
-        "softmax": False,
    }
    ret = requests.post(server_endpoint, json=request_payload_url)

--- a/examples/online_serving/prometheus_grafana/grafana.json
+++ b/examples/online_serving/prometheus_grafana/grafana.json
@@ -852,7 +852,7 @@
            "uid": "${DS_PROMETHEUS}"
          },
          "editorMode": "code",
-          "expr": "vllm:gpu_cache_usage_perc{model_name=\"$model_name\"}",
+          "expr": "vllm:kv_cache_usage_perc{model_name=\"$model_name\"}",
          "instant": false,
          "legendFormat": "GPU Cache Usage",
          "range": true,

--- a/examples/online_serving/ray_serve_deepseek.py
+++ b/examples/online_serving/ray_serve_deepseek.py
@@ -36,7 +36,6 @@ llm_config = LLMConfig(
    },
    # Set to the node's accelerator type.
    accelerator_type="H100",
-    runtime_env={"env_vars": {"VLLM_USE_V1": "1"}},
    # Customize engine arguments as required (for example, vLLM engine kwargs).
    engine_kwargs={
        "tensor_parallel_size": 8,

--- a/examples/online_serving/run_cluster.sh
+++ b/examples/online_serving/run_cluster.sh
@@ -83,6 +83,29 @@ else
    RAY_START_CMD+=" --address=${HEAD_NODE_ADDRESS}:6379"
 fi
+# Parse VLLM_HOST_IP from additional args if present.
+# This is needed for multi-NIC configurations where Ray needs explicit IP bindings.
+VLLM_HOST_IP=""
+for arg in "${ADDITIONAL_ARGS[@]}"; do
+    if [[ $arg == "-e" ]]; then
+        continue
+    fi
+    if [[ $arg == VLLM_HOST_IP=* ]]; then
+        VLLM_HOST_IP="${arg#VLLM_HOST_IP=}"
+        break
+    fi
+done
+# Build Ray IP environment variables if VLLM_HOST_IP is set.
+# These variables ensure Ray binds to the correct network interface on multi-NIC systems.
+RAY_IP_VARS=()
+if [ -n "${VLLM_HOST_IP}" ]; then
+    RAY_IP_VARS=(
+        -e "RAY_NODE_IP_ADDRESS=${VLLM_HOST_IP}"
+        -e "RAY_OVERRIDE_NODE_IP_ADDRESS=${VLLM_HOST_IP}"
+    )
+fi
 # Launch the container with the assembled parameters.
 # --network host: Allows Ray nodes to communicate directly via host networking
 # --shm-size 10.24g: Increases shared memory
@@ -95,5 +118,6 @@ docker run \
    --shm-size 10.24g \
    --gpus all \
    -v "${PATH_TO_HF_HOME}:/root/.cache/huggingface" \
+    "${RAY_IP_VARS[@]}" \
    "${ADDITIONAL_ARGS[@]}" \
    "${DOCKER_IMAGE}" -c "${RAY_START_CMD}"
--- a/examples/online_serving/sagemaker-entrypoint.sh
+++ b/examples/online_serving/sagemaker-entrypoint.sh
@@ -21,4 +21,4 @@ while IFS='=' read -r key value; do
 done < <(env | grep "^${PREFIX}")
 # Pass the collected arguments to the main entrypoint
-exec python3 -m vllm.entrypoints.openai.api_server "${ARGS[@]}"
+exec vllm serve "${ARGS[@]}"
\ No newline at end of file
--- a/examples/online_serving/streamlit_openai_chatbot_webserver.py
+++ b/examples/online_serving/streamlit_openai_chatbot_webserver.py
@@ -159,8 +159,8 @@ def get_llm_response(messages, model, reason, content_ph=None, reasoning_ph=None
        for chunk in response:
            delta = chunk.choices[0].delta
            # Stream reasoning first
-            if reason and hasattr(delta, "reasoning_content") and live_think:
+            if reason and hasattr(delta, "reasoning") and live_think:
-                rc = delta.reasoning_content
+                rc = delta.reasoning
                if rc:
                    think_text += rc
                    live_think.markdown(think_text + "▌")
@@ -262,8 +262,8 @@ def server_supports_reasoning():
        messages=[{"role": "user", "content": "Hi"}],
        stream=False,
    )
-    return hasattr(resp.choices[0].message, "reasoning_content") and bool(
+    return hasattr(resp.choices[0].message, "reasoning") and bool(
-        resp.choices[0].message.reasoning_content
+        resp.choices[0].message.reasoning
    )

--- a/examples/online_serving/structured_outputs/README.md
+++ b/examples/online_serving/structured_outputs/README.md
@@ -21,7 +21,7 @@ If you want to run this script standalone with `uv`, you can use the following:
 ```bash
 uvx --from git+https://github.com/vllm-project/vllm#subdirectory=examples/online_serving/structured_outputs \
-    structured-output
+    structured-outputs
 ```
 See [feature docs](https://docs.vllm.ai/en/latest/features/structured_outputs.html) for more information.

--- a/examples/online_serving/structured_outputs/pyproject.toml
+++ b/examples/online_serving/structured_outputs/pyproject.toml
 [project]
 name = "examples-online-structured-outputs"
-requires-python = ">=3.9, <3.13"
+requires-python = ">=3.10, <3.14"
 dependencies = ["openai==1.78.1", "pydantic==2.11.4"]
 version = "0.0.0"

--- a/examples/online_serving/structured_outputs/structured_outputs.py
+++ b/examples/online_serving/structured_outputs/structured_outputs.py
 # ruff: noqa: E501
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
 import argparse
 import asyncio
 import enum
 import os
-from typing import TYPE_CHECKING, Any, Literal
+from typing import Any, Literal
 import openai
 import pydantic
+from openai.types.chat import ChatCompletionChunk
-if TYPE_CHECKING:
-    from openai.types.chat import ChatCompletionChunk
 ConstraintsFormat = Literal[
    "choice",
@@ -39,7 +33,7 @@ async def print_stream_response(
    async for chunk in stream_response:
        delta = chunk.choices[0].delta
-        reasoning_chunk_text: str | None = getattr(delta, "reasoning_content", None)
+        reasoning_chunk_text: str | None = getattr(delta, "reasoning", None)
        content_chunk_text = delta.content
        if args.reasoning:
@@ -261,8 +255,8 @@ async def cli():
        for constraint, response in zip(constraints, results):
            print(f"\n\n{constraint}:")
            message = response.choices[0].message
-            if args.reasoning and hasattr(message, "reasoning_content"):
+            if args.reasoning and hasattr(message, "reasoning"):
-                print(f"  Reasoning: {message.reasoning_content or ''}")
+                print(f"  Reasoning: {message.reasoning or ''}")
            print(f"  Content: {message.content!r}")

--- a/examples/online_serving/token_generation_client.py
+++ b/examples/online_serving/token_generation_client.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import httpx
+from transformers import AutoTokenizer
+GEN_ENDPOINT = "http://localhost:8000/inference/v1/generate"
+DUMMY_API_KEY = "empty"
+MODEL_NAME = "Qwen/Qwen3-0.6B"
+transport = httpx.HTTPTransport()
+headers = {"Authorization": f"Bearer {DUMMY_API_KEY}"}
+client = httpx.Client(
+    transport=transport,
+    base_url=GEN_ENDPOINT,
+    timeout=600,
+    headers=headers,
+)
+messages = [
+    {"role": "system", "content": "You are a helpful assistant."},
+    {"role": "user", "content": "How many countries are in the EU?"},
+]
+def main(client):
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+    token_ids = tokenizer.apply_chat_template(
+        messages,
+        add_generation_prompt=True,
+        enable_thinking=False,
+    )
+    payload = {
+        "model": MODEL_NAME,
+        "token_ids": token_ids,
+        "sampling_params": {"max_tokens": 24, "temperature": 0.2, "detokenize": False},
+        "stream": False,
+    }
+    resp = client.post(GEN_ENDPOINT, json=payload)
+    resp.raise_for_status()
+    data = resp.json()
+    print(data)
+    print("-" * 50)
+    print("Token generation results:")
+    res = tokenizer.decode(data["choices"][0]["token_ids"])
+    print(res)
+    print("-" * 50)
+if __name__ == "__main__":
+    main(client)
--- a/examples/others/lmcache/cpu_offload_lmcache.py
+++ b/examples/others/lmcache/cpu_offload_lmcache.py
@@ -37,7 +37,7 @@ from vllm.config import KVTransferConfig
 from vllm.engine.arg_utils import EngineArgs
-def setup_environment_variables(vllm_version: str):
+def setup_environment_variables():
    # LMCache-related environment variables
    # Use experimental features in LMCache
    os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
@@ -47,12 +47,10 @@ def setup_environment_variables(vllm_version: str):
    os.environ["LMCACHE_LOCAL_CPU"] = "True"
    # Set local CPU memory limit to 5.0 GB
    os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0"
-    if vllm_version == "v0":
-        os.environ["VLLM_USE_V1"] = "0"
 @contextlib.contextmanager
-def build_llm_with_lmcache(lmcache_connector: str, model: str, vllm_version: str):
+def build_llm_with_lmcache(lmcache_connector: str, model: str):
    ktc = KVTransferConfig(
        kv_connector=lmcache_connector,
        kv_role="kv_both",
@@ -60,21 +58,12 @@ def build_llm_with_lmcache(lmcache_connector: str, model: str, vllm_version: str
    # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
    # memory. Reduce the value if your GPU has less memory.
    # Note: LMCache supports chunked prefill (see vLLM#14505, LMCache#392).
-    if vllm_version == "v0":
+    llm_args = EngineArgs(
-        llm_args = EngineArgs(
+        model=model,
-            model=model,
+        kv_transfer_config=ktc,
-            kv_transfer_config=ktc,
+        max_model_len=8000,
-            max_model_len=8000,
+        gpu_memory_utilization=0.8,
-            gpu_memory_utilization=0.8,
+    )
-            enable_chunked_prefill=True,  # Only in v0
-        )
-    else:
-        llm_args = EngineArgs(
-            model=model,
-            kv_transfer_config=ktc,
-            max_model_len=8000,
-            gpu_memory_utilization=0.8,
-        )
    llm = LLM(**asdict(llm_args))
    try:
@@ -116,18 +105,10 @@ def parse_args():
 def main():
-    args = parse_args()
+    lmcache_connector = "LMCacheConnectorV1"
+    model = "meta-llama/Meta-Llama-3.1-8B-Instruct"
-    if args.version == "v0":
+    setup_environment_variables()
-        lmcache_connector = "LMCacheConnector"
+    with build_llm_with_lmcache(lmcache_connector, model) as llm:
-        model = "mistralai/Mistral-7B-Instruct-v0.2"
-    else:
-        lmcache_connector = "LMCacheConnectorV1"
-        model = "meta-llama/Meta-Llama-3.1-8B-Instruct"
-    setup_environment_variables(args.version)
-    with build_llm_with_lmcache(lmcache_connector, model, args.version) as llm:
        # This example script runs two requests with a shared prefix.
        # Define the shared prompt and specific prompts
        shared_prompt = "Hello, how are you?" * 1000

--- a/examples/others/tensorize_vllm_model.py
+++ b/examples/others/tensorize_vllm_model.py
@@ -16,13 +16,11 @@ from vllm.model_executor.model_loader.tensorizer import (
    tensorize_vllm_model,
    tensorizer_kwargs_arg,
 )
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 logger = logging.getLogger()
-# yapf conflicts with isort for this docstring
-# yapf: disable
 """
 tensorize_vllm_model.py is a script that can be used to serialize and 
 deserialize vLLM models. These models can be loaded using tensorizer 
@@ -86,7 +84,7 @@ directly to load models:
 from vllm import LLM
 llm = LLM(
    "s3://my-bucket/vllm/facebook/opt-125m/v1", 
-    load_format="tensorizer"
+    load_format="tensorizer",
 )
 ```
@@ -132,7 +130,8 @@ def get_parser():
        "can be loaded using tensorizer directly to the GPU "
        "extremely quickly. Tensor encryption and decryption is "
        "also supported, although libsodium must be installed to "
-        "use it.")
+        "use it."
+    )
    parser = EngineArgs.add_cli_args(parser)
    parser.add_argument(
@@ -144,13 +143,14 @@ def get_parser():
        "along with the model by instantiating a TensorizerConfig object, "
        "creating a dict from it with TensorizerConfig.to_serializable(), "
        "and passing it to LoRARequest's initializer with the kwarg "
-        "tensorizer_config_dict."
+        "tensorizer_config_dict.",
    )
-    subparsers = parser.add_subparsers(dest='command', required=True)
+    subparsers = parser.add_subparsers(dest="command", required=True)
    serialize_parser = subparsers.add_parser(
-        'serialize', help="Serialize a model to `--serialized-directory`")
+        "serialize", help="Serialize a model to `--serialized-directory`"
+    )
    serialize_parser.add_argument(
        "--suffix",
@@ -163,7 +163,9 @@ def get_parser():
            "`--suffix` is `v1`, the serialized model tensors will be "
            "saved to "
            "`s3://my-bucket/vllm/EleutherAI/gpt-j-6B/v1/model.tensors`. "
-            "If none is provided, a random UUID will be used."))
+            "If none is provided, a random UUID will be used."
+        ),
+    )
    serialize_parser.add_argument(
        "--serialized-directory",
        type=str,
@@ -175,108 +177,127 @@ def get_parser():
        "and the model HuggingFace ID is `EleutherAI/gpt-j-6B`, tensors will "
        "be saved to `dir/vllm/EleutherAI/gpt-j-6B/suffix/model.tensors`, "
        "where `suffix` is given by `--suffix` or a random UUID if not "
-        "provided.")
+        "provided.",
+    )
    serialize_parser.add_argument(
        "--serialization-kwargs",
        type=tensorizer_kwargs_arg,
        required=False,
-        help=("A JSON string containing additional keyword arguments to "
+        help=(
-              "pass to Tensorizer's TensorSerializer during "
+            "A JSON string containing additional keyword arguments to "
-              "serialization."))
+            "pass to Tensorizer's TensorSerializer during "
+            "serialization."
+        ),
+    )
    serialize_parser.add_argument(
        "--keyfile",
        type=str,
        required=False,
-        help=("Encrypt the model weights with a randomly-generated binary key,"
+        help=(
-              " and save the key at this path"))
+            "Encrypt the model weights with a randomly-generated binary key,"
+            " and save the key at this path"
+        ),
+    )
    deserialize_parser = subparsers.add_parser(
-        'deserialize',
+        "deserialize",
-        help=("Deserialize a model from `--path-to-tensors`"
+        help=(
-              " to verify it can be loaded and used."))
+            "Deserialize a model from `--path-to-tensors`"
+            " to verify it can be loaded and used."
+        ),
+    )
    deserialize_parser.add_argument(
        "--path-to-tensors",
        type=str,
        required=False,
-        help="The local path or S3 URI to the model tensors to deserialize. ")
+        help="The local path or S3 URI to the model tensors to deserialize. ",
+    )
    deserialize_parser.add_argument(
        "--serialized-directory",
        type=str,
        required=False,
        help="Directory with model artifacts for loading. Assumes a "
-             "model.tensors file exists therein. Can supersede "
+        "model.tensors file exists therein. Can supersede "
-             "--path-to-tensors.")
+        "--path-to-tensors.",
+    )
    deserialize_parser.add_argument(
        "--keyfile",
        type=str,
        required=False,
-        help=("Path to a binary key to use to decrypt the model weights,"
+        help=(
-              " if the model was serialized with encryption"))
+            "Path to a binary key to use to decrypt the model weights,"
+            " if the model was serialized with encryption"
+        ),
+    )
    deserialize_parser.add_argument(
        "--deserialization-kwargs",
        type=tensorizer_kwargs_arg,
        required=False,
-        help=("A JSON string containing additional keyword arguments to "
+        help=(
-              "pass to Tensorizer's `TensorDeserializer` during "
+            "A JSON string containing additional keyword arguments to "
-              "deserialization."))
+            "pass to Tensorizer's `TensorDeserializer` during "
+            "deserialization."
+        ),
+    )
    TensorizerArgs.add_cli_args(deserialize_parser)
    return parser
-def merge_extra_config_with_tensorizer_config(extra_cfg: dict,
-                                              cfg: TensorizerConfig):
+def merge_extra_config_with_tensorizer_config(extra_cfg: dict, cfg: TensorizerConfig):
    for k, v in extra_cfg.items():
        if hasattr(cfg, k):
            setattr(cfg, k, v)
            logger.info(
                "Updating TensorizerConfig with %s from "
-                "--model-loader-extra-config provided", k
+                "--model-loader-extra-config provided",
+                k,
            )
 def deserialize(args, tensorizer_config):
    if args.lora_path:
        tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
-        llm = LLM(model=args.model,
+        llm = LLM(
-                  load_format="tensorizer",
+            model=args.model,
-                  tensor_parallel_size=args.tensor_parallel_size,
+            load_format="tensorizer",
-                  model_loader_extra_config=tensorizer_config,
+            tensor_parallel_size=args.tensor_parallel_size,
-                  enable_lora=True,
+            model_loader_extra_config=tensorizer_config,
+            enable_lora=True,
        )
        sampling_params = SamplingParams(
-            temperature=0,
+            temperature=0, max_tokens=256, stop=["[/assistant]"]
-            max_tokens=256,
-            stop=["[/assistant]"]
        )
        # Truncating this as the extra text isn't necessary
-        prompts = [
+        prompts = ["[user] Write a SQL query to answer the question based on ..."]
-            "[user] Write a SQL query to answer the question based on ..."
-        ]
        # Test LoRA load
        print(
            llm.generate(
-            prompts,
+                prompts,
-            sampling_params,
+                sampling_params,
-            lora_request=LoRARequest("sql-lora",
+                lora_request=LoRARequest(
-                                     1,
+                    "sql-lora",
-                                     args.lora_path,
+                    1,
-                                     tensorizer_config_dict = tensorizer_config
+                    args.lora_path,
-                                     .to_serializable())
+                    tensorizer_config_dict=tensorizer_config.to_serializable(),
+                ),
            )
        )
    else:
-        llm = LLM(model=args.model,
+        llm = LLM(
-                  load_format="tensorizer",
+            model=args.model,
-                  tensor_parallel_size=args.tensor_parallel_size,
+            load_format="tensorizer",
-                  model_loader_extra_config=tensorizer_config
+            tensor_parallel_size=args.tensor_parallel_size,
+            model_loader_extra_config=tensorizer_config,
        )
    return llm
@@ -285,17 +306,20 @@ def main():
    parser = get_parser()
    args = parser.parse_args()
-    s3_access_key_id = (getattr(args, 's3_access_key_id', None)
+    s3_access_key_id = getattr(args, "s3_access_key_id", None) or os.environ.get(
-                        or os.environ.get("S3_ACCESS_KEY_ID", None))
+        "S3_ACCESS_KEY_ID", None
-    s3_secret_access_key = (getattr(args, 's3_secret_access_key', None)
+    )
-                            or os.environ.get("S3_SECRET_ACCESS_KEY", None))
+    s3_secret_access_key = getattr(
-    s3_endpoint = (getattr(args, 's3_endpoint', None)
+        args, "s3_secret_access_key", None
-                or os.environ.get("S3_ENDPOINT_URL", None))
+    ) or os.environ.get("S3_SECRET_ACCESS_KEY", None)
+    s3_endpoint = getattr(args, "s3_endpoint", None) or os.environ.get(
+        "S3_ENDPOINT_URL", None
+    )
    credentials = {
        "s3_access_key_id": s3_access_key_id,
        "s3_secret_access_key": s3_secret_access_key,
-        "s3_endpoint": s3_endpoint
+        "s3_endpoint": s3_endpoint,
    }
    model_ref = args.model
@@ -309,25 +333,25 @@ def main():
    if args.model_loader_extra_config:
        extra_config = json.loads(args.model_loader_extra_config)
+    tensorizer_dir = args.serialized_directory or extra_config.get("tensorizer_dir")
-    tensorizer_dir = (args.serialized_directory or
+    tensorizer_uri = getattr(args, "path_to_tensors", None) or extra_config.get(
-                      extra_config.get("tensorizer_dir"))
+        "tensorizer_uri"
-    tensorizer_uri = (getattr(args, "path_to_tensors", None)
+    )
-                      or extra_config.get("tensorizer_uri"))
    if tensorizer_dir and tensorizer_uri:
-        parser.error("--serialized-directory and --path-to-tensors "
+        parser.error(
-                     "cannot both be provided")
+            "--serialized-directory and --path-to-tensors cannot both be provided"
+        )
    if not tensorizer_dir and not tensorizer_uri:
-        parser.error("Either --serialized-directory or --path-to-tensors "
+        parser.error(
-                     "must be provided")
+            "Either --serialized-directory or --path-to-tensors must be provided"
+        )
    if args.command == "serialize":
        engine_args = EngineArgs.from_cli_args(args)
-        input_dir = tensorizer_dir.rstrip('/')
+        input_dir = tensorizer_dir.rstrip("/")
        suffix = args.suffix if args.suffix else uuid.uuid4().hex
        base_path = f"{input_dir}/vllm/{model_ref}/{suffix}"
        if engine_args.tensor_parallel_size > 1:
@@ -339,15 +363,14 @@ def main():
            tensorizer_uri=model_path,
            encryption_keyfile=keyfile,
            serialization_kwargs=args.serialization_kwargs or {},
-            **credentials
+            **credentials,
        )
        if args.lora_path:
            tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
            tensorize_lora_adapter(args.lora_path, tensorizer_config)
-        merge_extra_config_with_tensorizer_config(extra_config,
+        merge_extra_config_with_tensorizer_config(extra_config, tensorizer_config)
-                                                  tensorizer_config)
        tensorize_vllm_model(engine_args, tensorizer_config)
    elif args.command == "deserialize":
@@ -356,11 +379,10 @@ def main():
            tensorizer_dir=args.serialized_directory,
            encryption_keyfile=keyfile,
            deserialization_kwargs=args.deserialization_kwargs or {},
-            **credentials
+            **credentials,
        )
-        merge_extra_config_with_tensorizer_config(extra_config,
+        merge_extra_config_with_tensorizer_config(extra_config, tensorizer_config)
-                                                  tensorizer_config)
        deserialize(args, tensorizer_config)
    else:
        raise ValueError("Either serialize or deserialize must be specified.")

--- a/examples/pyproject.toml
+++ b/examples/pyproject.toml
-# This local pyproject file is part of the migration from yapf to ruff format.
-# It uses the same core rules as the main pyproject.toml file, but with the
-# following differences:
-# - ruff line length is overridden to 88
-# - deprecated typing ignores (UP006, UP035) have been removed
-[tool.ruff]
-line-length = 88
-exclude = [
-    # External file, leaving license intact
-    "examples/other/fp8/quantizer/quantize.py",
-    "vllm/vllm_flash_attn/flash_attn_interface.pyi"
-]
-[tool.ruff.lint.per-file-ignores]
-"vllm/third_party/**" = ["ALL"]
-"vllm/version.py" = ["F401"]
-"vllm/_version.py" = ["ALL"]
-[tool.ruff.lint]
-select = [
-    # pycodestyle
-    "E",
-    # Pyflakes
-    "F",
-    # pyupgrade
-    "UP",
-    # flake8-bugbear
-    "B",
-    # flake8-simplify
-    "SIM",
-    # isort
-    "I",
-    # flake8-logging-format
-    "G",
-]
-ignore = [
-    # star imports
-    "F405", "F403",
-    # lambda expression assignment
-    "E731",
-    # Loop control variable not used within loop body
-    "B007",
-    # f-string format
-    "UP032",
-    # Can remove once 3.10+ is the minimum Python version
-    "UP007",
-]
-[tool.ruff.lint.isort]
-known-first-party = ["vllm"]
-[tool.ruff.format]
-docstring-code-format = true
\ No newline at end of file