"src/include/threadwise_tensor_slice_op.hip.hpp" did not exist on "8c923db423ab4ca0a7ac10310cff3528b38bb520"
Commit 25f39502 authored by laibao's avatar laibao
Browse files

Update README.md, change the Docker image version and deep learning library dependencies, and delete several example files to simplify the codebase.

parent 951558c2
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import socket
import threading
import uuid
import aiohttp
import msgpack
import zmq
from quart import Quart, make_response, request
count = 0
prefill_instances: dict[str, str] = {} # http_address: zmq_address
decode_instances: dict[str, str] = {} # http_address: zmq_address
prefill_cv = threading.Condition()
decode_cv = threading.Condition()
def _listen_for_register(poller, router_socket):
while True:
socks = dict(poller.poll())
if router_socket in socks:
remote_address, message = router_socket.recv_multipart()
# data: {"type": "P", "http_address": "ip:port",
# "zmq_address": "ip:port"}
data = msgpack.loads(message)
if data["type"] == "P":
global prefill_instances
global prefill_cv
with prefill_cv:
prefill_instances[data["http_address"]] = data["zmq_address"]
elif data["type"] == "D":
global decode_instances
global decode_cv
with decode_cv:
decode_instances[data["http_address"]] = data["zmq_address"]
else:
print(
f"Unexpected message received from {remote_address}, data: {data}"
)
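# Illustrative sketch (not part of this proxy): a prefill or decode instance
# registers itself by sending a msgpack-encoded dict of the shape shown above
# over a ZMQ socket (e.g. a DEALER) connected to this router:
#   sock = zmq.Context().socket(zmq.DEALER)
#   sock.connect("tcp://<proxy-host>:30001")
#   sock.send(msgpack.dumps({
#       "type": "P",  # "P" = prefill, "D" = decode
#       "http_address": "10.0.0.1:8100",  # hypothetical addresses
#       "zmq_address": "10.0.0.1:5555",
#   }))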
def start_service_discovery(hostname, port):
if not hostname:
hostname = socket.gethostname()
if port == 0:
raise ValueError("Port cannot be 0")
context = zmq.Context()
router_socket = context.socket(zmq.ROUTER)
router_socket.bind(f"tcp://{hostname}:{port}")
poller = zmq.Poller()
poller.register(router_socket, zmq.POLLIN)
_listener_thread = threading.Thread(
target=_listen_for_register, args=[poller, router_socket], daemon=True
)
_listener_thread.start()
return _listener_thread
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
app = Quart(__name__)
def random_uuid() -> str:
return str(uuid.uuid4().hex)
async def forward_request(url, data, request_id):
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
headers = {
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
"X-Request-Id": request_id,
}
async with session.post(url=url, json=data, headers=headers) as response:
if response.status == 200:
# Stream the upstream response back to the caller chunk by chunk.
async for chunk_bytes in response.content.iter_chunked(1024):
yield chunk_bytes
@app.route("/v1/completions", methods=["POST"])
async def handle_request():
try:
original_request_data = await request.get_json()
prefill_request = original_request_data.copy()
# Set max_tokens = 1 so the prefill instance only performs the prefill step
prefill_request["max_tokens"] = 1
global count
global prefill_instances
global prefill_cv
with prefill_cv:
prefill_list = list(prefill_instances.items())
prefill_addr, prefill_zmq_addr = prefill_list[count % len(prefill_list)]
global decode_instances
global decode_cv
with decode_cv:
decode_list = list(decode_instances.items())
decode_addr, decode_zmq_addr = decode_list[count % len(decode_list)]
print(
f"handle_request count: {count}, [HTTP:{prefill_addr}, "
f"ZMQ:{prefill_zmq_addr}] 👉 [HTTP:{decode_addr}, "
f"ZMQ:{decode_zmq_addr}]"
)
count += 1
request_id = (
f"___prefill_addr_{prefill_zmq_addr}___decode_addr_"
f"{decode_zmq_addr}_{random_uuid()}"
)
# finish prefill
async for _ in forward_request(
f"http://{prefill_addr}/v1/completions", prefill_request, request_id
):
continue
# return decode
generator = forward_request(
f"http://{decode_addr}/v1/completions", original_request_data, request_id
)
response = await make_response(generator)
response.timeout = None
return response
except Exception as e:
import sys
import traceback
exc_info = sys.exc_info()
print("Error occurred in disagg prefill proxy server")
print(e)
print("".join(traceback.format_exception(*exc_info)))
if __name__ == "__main__":
t = start_service_discovery("0.0.0.0", 30001)
app.run(host="0.0.0.0", port=10001)
t.join()
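# Illustrative usage (assumes prefill/decode vLLM instances have registered
# with this proxy and serve the model named below; adjust to your deployment):
#   curl http://localhost:10001/v1/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "<served-model-name>", "prompt": "Hello", "max_tokens": 16}'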
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Example for starting a Gradio OpenAI Chatbot Webserver
Start vLLM API server:
vllm serve meta-llama/Llama-2-7b-chat-hf
Start Gradio OpenAI Chatbot Webserver:
python examples/online_serving/gradio_openai_chatbot_webserver.py \
-m meta-llama/Llama-2-7b-chat-hf
Note that `pip install --upgrade gradio` is needed to run this example.
More details: https://github.com/gradio-app/gradio
If your antivirus software blocks the download of frpc for gradio,
you can install it manually by following these steps:
1. Download this file: https://cdn-media.huggingface.co/frpc-gradio-0.3/frpc_linux_amd64
2. Rename the downloaded file to: frpc_linux_amd64_v0.3
3. Move the file to this location: /home/user/.cache/huggingface/gradio/frpc
"""
import argparse
import gradio as gr
from openai import OpenAI
def format_history_to_openai(history):
history_openai_format = [
{"role": "system", "content": "You are a great AI assistant."}
]
for human, assistant in history:
history_openai_format.append({"role": "user", "content": human})
history_openai_format.append({"role": "assistant", "content": assistant})
return history_openai_format
def predict(message, history, client, model_name, temp, stop_token_ids):
# Format history to OpenAI chat format
history_openai_format = format_history_to_openai(history)
history_openai_format.append({"role": "user", "content": message})
# Send request to OpenAI API (vLLM server)
stream = client.chat.completions.create(
model=model_name,
messages=history_openai_format,
temperature=temp,
stream=True,
extra_body={
"repetition_penalty": 1,
"stop_token_ids": [int(id.strip()) for id in stop_token_ids.split(",")]
if stop_token_ids
else [],
},
)
# Collect all chunks and concatenate them into a full message
full_message = ""
for chunk in stream:
full_message += chunk.choices[0].delta.content or ""
# Return the full message as a single response
return full_message
def parse_args():
parser = argparse.ArgumentParser(
description="Chatbot Interface with Customizable Parameters"
)
parser.add_argument(
"--model-url", type=str, default="http://localhost:8000/v1", help="Model URL"
)
parser.add_argument(
"-m", "--model", type=str, required=True, help="Model name for the chatbot"
)
parser.add_argument(
"--temp", type=float, default=0.8, help="Temperature for text generation"
)
parser.add_argument(
"--stop-token-ids", type=str, default="", help="Comma-separated stop token IDs"
)
parser.add_argument("--host", type=str, default=None)
parser.add_argument("--port", type=int, default=8001)
return parser.parse_args()
def build_gradio_interface(client, model_name, temp, stop_token_ids):
def chat_predict(message, history):
return predict(message, history, client, model_name, temp, stop_token_ids)
return gr.ChatInterface(
fn=chat_predict,
title="Chatbot Interface",
description="A simple chatbot powered by vLLM",
)
def main():
# Parse the arguments
args = parse_args()
# Set OpenAI's API key and API base to use vLLM's API server
openai_api_key = "EMPTY"
openai_api_base = args.model_url
# Create an OpenAI client
client = OpenAI(api_key=openai_api_key, base_url=openai_api_base)
# Define the Gradio chatbot interface using the predict function
gradio_interface = build_gradio_interface(
client, args.model, args.temp, args.stop_token_ids
)
gradio_interface.queue().launch(
server_name=args.host, server_port=args.port, share=True
)
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Example for starting a Gradio Webserver
Start vLLM API server:
python -m vllm.entrypoints.api_server \
--model meta-llama/Llama-2-7b-chat-hf
Start Webserver:
python examples/online_serving/gradio_webserver.py
Note that `pip install --upgrade gradio` is needed to run this example.
More details: https://github.com/gradio-app/gradio
If your antivirus software blocks the download of frpc for gradio,
you can install it manually by following these steps:
1. Download this file: https://cdn-media.huggingface.co/frpc-gradio-0.3/frpc_linux_amd64
2. Rename the downloaded file to: frpc_linux_amd64_v0.3
3. Move the file to this location: /home/user/.cache/huggingface/gradio/frpc
"""
import argparse
import json
import gradio as gr
import requests
def http_bot(prompt):
headers = {"User-Agent": "vLLM Client"}
pload = {
"prompt": prompt,
"stream": True,
"max_tokens": 128,
}
response = requests.post(args.model_url, headers=headers, json=pload, stream=True)
for chunk in response.iter_lines(
chunk_size=8192, decode_unicode=False, delimiter=b"\n"
):
if chunk:
data = json.loads(chunk.decode("utf-8"))
output = data["text"][0]
yield output
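# Each streamed chunk from the demo api_server is a JSON object whose "text"
# field holds the generated strings, e.g. roughly:
#   {"text": ["<prompt plus generated text so far>"]}
# This shape is inferred from the parsing above; the exact fields depend on
# the server you run.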
def build_demo():
with gr.Blocks() as demo:
gr.Markdown("# vLLM text completion demo\n")
inputbox = gr.Textbox(label="Input", placeholder="Enter text and press ENTER")
outputbox = gr.Textbox(
label="Output", placeholder="Generated result from the model"
)
inputbox.submit(http_bot, [inputbox], [outputbox])
return demo
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--host", type=str, default=None)
parser.add_argument("--port", type=int, default=8001)
parser.add_argument(
"--model-url", type=str, default="http://localhost:8000/generate"
)
return parser.parse_args()
def main(args):
demo = build_demo()
demo.queue().launch(server_name=args.host, server_port=args.port, share=True)
if __name__ == "__main__":
args = parse_args()
main(args)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Example of using the OpenAI entrypoint's rerank API, which is compatible with
the Jina and Cohere rerank APIs (https://jina.ai/reranker).
run: vllm serve BAAI/bge-reranker-base
"""
import json
import requests
url = "http://127.0.0.1:8000/rerank"
headers = {"accept": "application/json", "Content-Type": "application/json"}
data = {
"model": "BAAI/bge-reranker-base",
"query": "What is the capital of France?",
"documents": [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
"Horses and cows are both animals",
],
}
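# For reference, a Jina-/Cohere-style rerank response looks roughly like the
# following (illustrative values, not actual output):
#   {"model": "BAAI/bge-reranker-base",
#    "results": [{"index": 1, "relevance_score": 0.99},
#                {"index": 0, "relevance_score": 0.02},
#                {"index": 2, "relevance_score": 0.01}],
#    "usage": {...}}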
def main():
response = requests.post(url, headers=headers, json=data)
# Check the response
if response.status_code == 200:
print("Request successful!")
print(json.dumps(response.json(), indent=2))
else:
print(f"Request failed with status code: {response.status_code}")
print(response.text)
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any, Optional, Union
import msgspec
import zmq
from msgspec.msgpack import Decoder
#
# Types copied from vllm.distributed.kv_events
#
class EventBatch(msgspec.Struct, array_like=True, omit_defaults=True, gc=False):
ts: float
events: list[Any]
class KVCacheEvent(
msgspec.Struct, array_like=True, omit_defaults=True, gc=False, tag=True
):
"""Base class for all KV cache-related events"""
class BlockStored(KVCacheEvent):
block_hashes: list[int]
parent_block_hash: Optional[int]
token_ids: list[int]
block_size: int
lora_id: Optional[int]
class BlockRemoved(KVCacheEvent):
block_hashes: list[int]
class AllBlocksCleared(KVCacheEvent):
pass
class KVEventBatch(EventBatch):
events: list[Union[BlockStored, BlockRemoved, AllBlocksCleared]]
def process_event(event_batch):
print(f"Received event batch at {event_batch.ts}:")
for event in event_batch.events:
print(f" - {event}")
def main():
decoder = Decoder(type=KVEventBatch)
last_seq = -1
context = zmq.Context()
# Set up the main subscription socket
sub = context.socket(zmq.SUB)
sub.connect("tcp://localhost:5557")
topic = "kv-events"
sub.setsockopt_string(zmq.SUBSCRIBE, topic)
# Initialize replay socket
replay = context.socket(zmq.REQ)
replay.connect("tcp://localhost:5558")
poller = zmq.Poller()
poller.register(replay, zmq.POLLIN)
print("Listening for KV cache events on topic:", topic)
while True:
try:
if sub.poll(50):
_, seq_bytes, payload = sub.recv_multipart()
seq = int.from_bytes(seq_bytes, "big")
if last_seq >= 0 and seq > last_seq + 1:
missed = seq - last_seq - 1
print(
f"Missed {missed} messages (last: {last_seq}, current: {seq})"
)
replay.send((last_seq + 1).to_bytes(8, "big"))
while poller.poll(timeout=200):
seq_bytes, replay_payload = replay.recv_multipart()
if not replay_payload:
# End of replay marker is sent as an empty frame
# for the payload
break
replay_seq = int.from_bytes(seq_bytes, "big")
if replay_seq > last_seq:
event_batch = decoder.decode(replay_payload)
process_event(event_batch)
last_seq = replay_seq
if replay_seq >= seq - 1:
break
event_batch = decoder.decode(payload)
process_event(event_batch)
# ... do other periodic work or check for shutdown ...
except KeyboardInterrupt:
print("Interrupted")
break
except Exception as e:
print("Error decoding message:", e)
if __name__ == "__main__":
main()
#!/bin/bash
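# Illustrative usage (the script name is hypothetical; the flags match the
# parsing below; adjust addresses and cluster size for your setup):
#   ./ray_init.sh leader --ray_port=6379 --ray_cluster_size=2 [extra ray start args]
#   ./ray_init.sh worker --ray_address=<head-node-ip> --ray_port=6379 [extra ray start args]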
subcommand=$1
shift
ray_port=6379
ray_init_timeout=300
declare -a start_params
case "$subcommand" in
worker)
ray_address=""
while [ $# -gt 0 ]; do
case "$1" in
--ray_address=*)
ray_address="${1#*=}"
;;
--ray_port=*)
ray_port="${1#*=}"
;;
--ray_init_timeout=*)
ray_init_timeout="${1#*=}"
;;
*)
start_params+=("$1")
esac
shift
done
if [ -z "$ray_address" ]; then
echo "Error: Missing argument --ray_address"
exit 1
fi
for (( i=0; i < $ray_init_timeout; i+=5 )); do
ray start --address=$ray_address:$ray_port --block "${start_params[@]}"
if [ $? -eq 0 ]; then
echo "Worker: Ray runtime started with head address $ray_address:$ray_port"
exit 0
fi
echo "Waiting until the ray worker is active..."
sleep 5s;
done
echo "Ray worker starts timeout, head address: $ray_address:$ray_port"
exit 1
;;
leader)
ray_cluster_size=""
while [ $# -gt 0 ]; do
case "$1" in
--ray_port=*)
ray_port="${1#*=}"
;;
--ray_cluster_size=*)
ray_cluster_size="${1#*=}"
;;
--ray_init_timeout=*)
ray_init_timeout="${1#*=}"
;;
*)
start_params+=("$1")
esac
shift
done
if [ -z "$ray_cluster_size" ]; then
echo "Error: Missing argument --ray_cluster_size"
exit 1
fi
# start the ray daemon
ray start --head --port=$ray_port "${start_params[@]}"
# wait until all workers are active
for (( i=0; i < $ray_init_timeout; i+=5 )); do
active_nodes=$(python3 -c 'import ray; ray.init(); print(sum(node["Alive"] for node in ray.nodes()))')
if [ $active_nodes -eq $ray_cluster_size ]; then
echo "All ray workers are active and the ray cluster is initialized successfully."
exit 0
fi
echo "Wait for all ray workers to be active. $active_nodes/$ray_cluster_size is active"
sleep 5s;
done
echo "Waiting for all ray workers to be active timed out."
exit 1
;;
*)
echo "unknown subcommand: $subcommand"
exit 1
;;
esac
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
from typing import Optional
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.outputs import RequestOutput
from vllm.sampling_params import SamplingParams
"""
To run this example, run the following commands simultaneously with
different CUDA_VISIBLE_DEVICES:
python examples/online_serving/multi_instance_data_parallel.py
vllm serve ibm-research/PowerMoE-3b -dp 2 -dpr 1 \
--data-parallel-address 127.0.0.1 --data-parallel-rpc-port 62300 \
--data-parallel-size-local 1 --enforce-eager --headless
Once both instances have completed the handshake, this example will
send a request to the instance with DP rank 1.
"""
async def main():
engine_args = AsyncEngineArgs(
model="ibm-research/PowerMoE-3b",
data_parallel_size=2,
dtype="auto",
max_model_len=2048,
data_parallel_address="127.0.0.1",
data_parallel_rpc_port=62300,
data_parallel_size_local=1,
enforce_eager=True,
)
engine_client = AsyncLLMEngine.from_engine_args(engine_args)
sampling_params = SamplingParams(
temperature=0.7,
top_p=0.9,
max_tokens=100,
)
prompt = "Who won the 2004 World Series?"
final_output: Optional[RequestOutput] = None
async for output in engine_client.generate(
prompt=prompt,
sampling_params=sampling_params,
request_id="abcdef",
data_parallel_rank=1,
):
final_output = output
if final_output:
print(final_output.outputs[0].text)
if __name__ == "__main__":
asyncio.run(main())
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Example Python client for OpenAI Chat Completion using vLLM API server
NOTE: start a supported chat completion model server with `vllm serve`, e.g.
vllm serve meta-llama/Llama-2-7b-chat-hf
"""
import argparse
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Who won the world series in 2020?"},
{
"role": "assistant",
"content": "The Los Angeles Dodgers won the World Series in 2020.",
},
{"role": "user", "content": "Where was it played?"},
]
def parse_args():
parser = argparse.ArgumentParser(description="Client for vLLM API server")
parser.add_argument(
"--stream", action="store_true", help="Enable streaming response"
)
return parser.parse_args()
def main(args):
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
# Chat Completion API
chat_completion = client.chat.completions.create(
messages=messages,
model=model,
stream=args.stream,
)
print("-" * 50)
print("Chat completion results:")
if args.stream:
for c in chat_completion:
print(c)
else:
print(chat_completion)
print("-" * 50)
if __name__ == "__main__":
args = parse_args()
main(args)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""An example showing how to use vLLM to serve multimodal models
and run online serving with OpenAI client.
Launch the vLLM server with the following command:
(single image inference with Llava)
vllm serve llava-hf/llava-1.5-7b-hf
(multi-image inference with Phi-3.5-vision-instruct)
vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
--trust-remote-code --max-model-len 4096 --limit-mm-per-prompt '{"image":2}'
(audio inference with Ultravox)
vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b \
--max-model-len 4096 --trust-remote-code
run the script with
python openai_chat_completion_client_for_multimodal.py --chat-type audio
"""
import base64
import requests
from openai import OpenAI
from utils import get_first_model
from vllm.utils import FlexibleArgumentParser
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
def encode_base64_content_from_url(content_url: str) -> str:
"""Encode a content retrieved from a remote url to base64 format."""
with requests.get(content_url) as response:
response.raise_for_status()
result = base64.b64encode(response.content).decode("utf-8")
return result
# Text-only inference
def run_text_only(model: str) -> None:
chat_completion = client.chat.completions.create(
messages=[{"role": "user", "content": "What's the capital of France?"}],
model=model,
max_completion_tokens=64,
)
result = chat_completion.choices[0].message.content
print("Chat completion output:", result)
# Single-image input inference
def run_single_image(model: str) -> None:
## Use image url in the payload
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
chat_completion_from_url = client.chat.completions.create(
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "What's in this image?"},
{
"type": "image_url",
"image_url": {"url": image_url},
},
],
}
],
model=model,
max_completion_tokens=64,
)
result = chat_completion_from_url.choices[0].message.content
print("Chat completion output from image url:", result)
## Use base64 encoded image in the payload
image_base64 = encode_base64_content_from_url(image_url)
chat_completion_from_base64 = client.chat.completions.create(
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "What's in this image?"},
{
"type": "image_url",
"image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
},
],
}
],
model=model,
max_completion_tokens=64,
)
result = chat_completion_from_base64.choices[0].message.content
print("Chat completion output from base64 encoded image:", result)
# Multi-image input inference
def run_multi_image(model: str) -> None:
image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
chat_completion_from_url = client.chat.completions.create(
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "What are the animals in these images?"},
{
"type": "image_url",
"image_url": {"url": image_url_duck},
},
{
"type": "image_url",
"image_url": {"url": image_url_lion},
},
],
}
],
model=model,
max_completion_tokens=64,
)
result = chat_completion_from_url.choices[0].message.content
print("Chat completion output:", result)
# Video input inference
def run_video(model: str) -> None:
video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"
video_base64 = encode_base64_content_from_url(video_url)
## Use video url in the payload
chat_completion_from_url = client.chat.completions.create(
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "What's in this video?"},
{
"type": "video_url",
"video_url": {"url": video_url},
},
],
}
],
model=model,
max_completion_tokens=64,
)
result = chat_completion_from_url.choices[0].message.content
print("Chat completion output from image url:", result)
## Use base64 encoded video in the payload
chat_completion_from_base64 = client.chat.completions.create(
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "What's in this video?"},
{
"type": "video_url",
"video_url": {"url": f"data:video/mp4;base64,{video_base64}"},
},
],
}
],
model=model,
max_completion_tokens=64,
)
result = chat_completion_from_base64.choices[0].message.content
print("Chat completion output from base64 encoded image:", result)
# Audio input inference
def run_audio(model: str) -> None:
from vllm.assets.audio import AudioAsset
audio_url = AudioAsset("winning_call").url
audio_base64 = encode_base64_content_from_url(audio_url)
# OpenAI-compatible schema (`input_audio`)
chat_completion_from_base64 = client.chat.completions.create(
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "What's in this audio?"},
{
"type": "input_audio",
"input_audio": {
# Any format supported by librosa is supported
"data": audio_base64,
"format": "wav",
},
},
],
}
],
model=model,
max_completion_tokens=64,
)
result = chat_completion_from_base64.choices[0].message.content
print("Chat completion output from input audio:", result)
# HTTP URL
chat_completion_from_url = client.chat.completions.create(
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "What's in this audio?"},
{
"type": "audio_url",
"audio_url": {
# Any format supported by librosa is supported
"url": audio_url
},
},
],
}
],
model=model,
max_completion_tokens=64,
)
result = chat_completion_from_url.choices[0].message.content
print("Chat completion output from audio url:", result)
# base64 URL
chat_completion_from_base64 = client.chat.completions.create(
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "What's in this audio?"},
{
"type": "audio_url",
"audio_url": {
# Any format supported by librosa is supported
"url": f"data:audio/ogg;base64,{audio_base64}"
},
},
],
}
],
model=model,
max_completion_tokens=64,
)
result = chat_completion_from_base64.choices[0].message.content
print("Chat completion output from base64 encoded audio:", result)
example_function_map = {
"text-only": run_text_only,
"single-image": run_single_image,
"multi-image": run_multi_image,
"video": run_video,
"audio": run_audio,
}
def parse_args():
parser = FlexibleArgumentParser(
description="Demo on using OpenAI client for online serving with "
"multimodal language models served with vLLM."
)
parser.add_argument(
"--chat-type",
"-c",
type=str,
default="single-image",
choices=list(example_function_map.keys()),
help="Conversation type with multimodal data.",
)
return parser.parse_args()
def main(args) -> None:
chat_type = args.chat_type
model = get_first_model(client)
example_function_map[chat_type](model)
if __name__ == "__main__":
args = parse_args()
main(args)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Set up this example by starting a vLLM OpenAI-compatible server with tool call
options enabled. For example:
IMPORTANT: for Mistral, you must use one of the provided Mistral tool call
templates, or your own; the model default doesn't work for tool calls with vLLM.
See the vLLM docs on OpenAI server & tool calling for more details.
vllm serve mistralai/Mistral-7B-Instruct-v0.3 \
--chat-template examples/tool_chat_template_mistral.jinja \
--enable-auto-tool-choice --tool-call-parser mistral
OR
vllm serve NousResearch/Hermes-2-Pro-Llama-3-8B \
--chat-template examples/tool_chat_template_hermes.jinja \
--enable-auto-tool-choice --tool-call-parser hermes
"""
import json
from typing import Any
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
properties = {
"city": {
"type": "string",
"description": "The city to find the weather for, e.g. 'San Francisco'",
},
"state": {
"type": "string",
"description": "the two-letter abbreviation for the state that the city is"
" in, e.g. 'CA' which would mean 'California'",
},
"unit": {
"type": "string",
"description": "The unit to fetch the temperature in",
"enum": ["celsius", "fahrenheit"],
},
}
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": properties,
"required": ["city", "state", "unit"],
},
},
}
]
messages = [
{"role": "user", "content": "Hi! How are you doing today?"},
{"role": "assistant", "content": "I'm doing well! How can I help you?"},
{
"role": "user",
"content": (
"Can you tell me what the temperate will be in Dallas, in fahrenheit?"
),
},
]
def get_current_weather(city: str, state: str, unit: str):
return (
"The weather in Dallas, Texas is 85 degrees fahrenheit. It is "
"partly cloudy, with highs in the 90s."
)
def handle_tool_calls_stream(
client: OpenAI,
messages: list[dict[str, str]],
model: str,
tools: list[dict[str, Any]],
) -> list[Any]:
tool_calls_stream = client.chat.completions.create(
messages=messages, model=model, tools=tools, stream=True
)
chunks = []
print("chunks: ")
for chunk in tool_calls_stream:
chunks.append(chunk)
if chunk.choices[0].delta.tool_calls:
print(chunk.choices[0].delta.tool_calls[0])
else:
print(chunk.choices[0].delta)
return chunks
def handle_tool_calls_arguments(chunks: list[Any]) -> list[str]:
arguments = []
tool_call_idx = -1
print("arguments: ")
for chunk in chunks:
if chunk.choices[0].delta.tool_calls:
tool_call = chunk.choices[0].delta.tool_calls[0]
if tool_call.index != tool_call_idx:
if tool_call_idx >= 0:
print(f"streamed tool call arguments: {arguments[tool_call_idx]}")
tool_call_idx = chunk.choices[0].delta.tool_calls[0].index
arguments.append("")
if tool_call.id:
print(f"streamed tool call id: {tool_call.id} ")
if tool_call.function:
if tool_call.function.name:
print(f"streamed tool call name: {tool_call.function.name}")
if tool_call.function.arguments:
arguments[tool_call_idx] += tool_call.function.arguments
return arguments
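# With the get_current_weather tool defined above, a fully assembled
# arguments string typically looks like (illustrative):
#   '{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}'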
def main():
# Initialize OpenAI client
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
# Get available models and select one
models = client.models.list()
model = models.data[0].id
chat_completion = client.chat.completions.create(
messages=messages, model=model, tools=tools
)
print("-" * 70)
print("Chat completion results:")
print(chat_completion)
print("-" * 70)
# Stream tool calls
chunks = handle_tool_calls_stream(client, messages, model, tools)
print("-" * 70)
# Handle arguments from streamed tool calls
arguments = handle_tool_calls_arguments(chunks)
if len(arguments):
print(f"streamed tool call arguments: {arguments[-1]}\n")
print("-" * 70)
# Add tool call results to the conversation
messages.append(
{
"role": "assistant",
"tool_calls": chat_completion.choices[0].message.tool_calls,
}
)
# Now, simulate a tool call
available_tools = {"get_current_weather": get_current_weather}
completion_tool_calls = chat_completion.choices[0].message.tool_calls
for call in completion_tool_calls:
tool_to_call = available_tools[call.function.name]
args = json.loads(call.function.arguments)
result = tool_to_call(**args)
print("tool_to_call result: ", result)
messages.append(
{
"role": "tool",
"content": result,
"tool_call_id": call.id,
"name": call.function.name,
}
)
chat_completion_2 = client.chat.completions.create(
messages=messages, model=model, tools=tools, stream=False
)
print("Chat completion2 results:")
print(chat_completion_2)
print("-" * 70)
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
To run this example, you can start the vLLM server
without any specific flags:
```bash
VLLM_USE_V1=0 vllm serve unsloth/Llama-3.2-1B-Instruct \
--guided-decoding-backend outlines
```
This example demonstrates how to generate chat completions
using the OpenAI Python client library.
"""
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"city": {
"type": "string",
"description": "The city to find the weather for"
", e.g. 'San Francisco'",
},
"state": {
"type": "string",
"description": (
"the two-letter abbreviation for the state that the "
"city is in, e.g. 'CA' which would mean 'California'"
),
},
"unit": {
"type": "string",
"description": "The unit to fetch the temperature in",
"enum": ["celsius", "fahrenheit"],
},
},
"required": ["city", "state", "unit"],
},
},
},
{
"type": "function",
"function": {
"name": "get_forecast",
"description": "Get the weather forecast for a given location",
"parameters": {
"type": "object",
"properties": {
"city": {
"type": "string",
"description": (
"The city to get the forecast for, e.g. 'New York'"
),
},
"state": {
"type": "string",
"description": (
"The two-letter abbreviation for the state, e.g. 'NY'"
),
},
"days": {
"type": "integer",
"description": "Number of days to get the forecast for (1-7)",
},
"unit": {
"type": "string",
"description": "The unit to fetch the temperature in",
"enum": ["celsius", "fahrenheit"],
},
},
"required": ["city", "state", "days", "unit"],
},
},
},
]
messages = [
{"role": "user", "content": "Hi! How are you doing today?"},
{"role": "assistant", "content": "I'm doing well! How can I help you?"},
{
"role": "user",
"content": "Can you tell me what the current weather is in Dallas \
and the forecast for the next 5 days, in fahrenheit?",
},
]
def main():
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
chat_completion = client.chat.completions.create(
messages=messages,
model=model,
tools=tools,
tool_choice="required",
stream=True, # Enable streaming response
)
for chunk in chat_completion:
if chunk.choices and chunk.choices[0].delta.tool_calls:
print(chunk.choices[0].delta.tool_calls)
chat_completion = client.chat.completions.create(
messages=messages, model=model, tools=tools, tool_choice="required"
)
print(chat_completion.choices[0].message.tool_calls)
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
"""
Set up this example by starting a vLLM OpenAI-compatible server with tool call
options enabled for xLAM-2 models:
vllm serve --model Salesforce/Llama-xLAM-2-8b-fc-r --enable-auto-tool-choice --tool-call-parser xlam
OR
vllm serve --model Salesforce/xLAM-2-3b-fc-r --enable-auto-tool-choice --tool-call-parser xlam
"""
import json
import time
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "empty"
openai_api_base = "http://localhost:8000/v1"
# Define tool functions
def get_weather(location: str, unit: str):
return f"Weather in {location} is 22 degrees {unit}."
def calculate_expression(expression: str):
try:
result = eval(expression)  # demo only: eval() is unsafe on untrusted input
return f"The result of {expression} is {result}"
except Exception as e:
return f"Could not calculate {expression}: {e}"
def translate_text(text: str, target_language: str):
return f"Translation of '{text}' to {target_language}: [translated content]"
# Define tools
tools = [
{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "City and state, e.g., 'San Francisco, CA'",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location", "unit"],
},
},
},
{
"type": "function",
"function": {
"name": "calculate_expression",
"description": "Calculate a mathematical expression",
"parameters": {
"type": "object",
"properties": {
"expression": {
"type": "string",
"description": "Mathematical expression to evaluate, needs to be a valid python expression",
}
},
"required": ["expression"],
},
},
},
{
"type": "function",
"function": {
"name": "translate_text",
"description": "Translate text to another language",
"parameters": {
"type": "object",
"properties": {
"text": {"type": "string", "description": "Text to translate"},
"target_language": {
"type": "string",
"description": "Target language for translation",
},
},
"required": ["text", "target_language"],
},
},
},
]
# Map of function names to implementations
tool_functions = {
"get_weather": get_weather,
"calculate_expression": calculate_expression,
"translate_text": translate_text,
}
def process_response(response, tool_functions, original_query):
"""Process a non-streaming response with possible tool calls"""
print("\n--- Response Output ---")
# Check if the response has content
if response.choices[0].message.content:
print(f"Content: {response.choices[0].message.content}")
# Check if the response has tool calls
if response.choices[0].message.tool_calls:
print("--------------------------------")
print(f"Tool calls: {response.choices[0].message.tool_calls}")
print("--------------------------------")
# Collect all tool calls and results before making follow-up request
tool_results = []
assistant_message = {"role": "assistant"}
if response.choices[0].message.content:
assistant_message["content"] = response.choices[0].message.content
assistant_tool_calls = []
# Process each tool call
for tool_call in response.choices[0].message.tool_calls:
function_name = tool_call.function.name
function_args = tool_call.function.arguments
function_id = tool_call.id
print(f"Function called: {function_name}")
print(f"Arguments: {function_args}")
print(f"Function ID: {function_id}")
# Execute the function
try:
# Parse the JSON arguments
args = json.loads(function_args)
# Call the function with the arguments
function_result = tool_functions[function_name](**args)
print(f"\n--- Function Result ---\n{function_result}\n")
# Add tool call to assistant message
assistant_tool_calls.append(
{
"id": function_id,
"type": "function",
"function": {"name": function_name, "arguments": function_args},
}
)
# Add tool result to tool_results
tool_results.append(
{
"role": "tool",
"tool_call_id": function_id,
"content": function_result,
}
)
except Exception as e:
print(f"Error executing function: {e}")
# Add tool_calls to assistant message
assistant_message["tool_calls"] = assistant_tool_calls
# Create a follow-up message with all function results
follow_up_messages = [
{"role": "user", "content": original_query},
assistant_message,
]
# Add all tool results to the messages
follow_up_messages.extend(tool_results)
# Get completion with all tool results in a single follow-up
follow_up_response = client.chat.completions.create(
model=client.models.list().data[0].id,
messages=follow_up_messages,
stream=False,
)
print("\n--- Follow-up Response ---")
print(follow_up_response.choices[0].message.content)
print("--- End Follow-up ---\n")
print("--- End Response ---\n")
def run_test_case(query, test_name):
"""Run a single test case with the given query"""
print(f"\n{'=' * 50}\nTEST CASE: {test_name}\n{'=' * 50}")
print(f"Query: '{query}'")
start_time = time.time()
# Create non-streaming chat completion request
response = client.chat.completions.create(
model=client.models.list().data[0].id,
messages=[{"role": "user", "content": query}],
tools=tools,
tool_choice="auto",
stream=False,
)
# Process the non-streaming response, passing the original query
process_response(response, tool_functions, query)
end_time = time.time()
print(f"Test completed in {end_time - start_time:.2f} seconds")
def main():
# Initialize OpenAI client
global client
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
# Run test cases
test_cases = [
("I want to know the weather in San Francisco", "Weather Information"),
("Calculate 25 * 17 + 31", "Math Calculation"),
("Translate 'Hello world' to Spanish", "Text Translation"),
("What is the weather in Tokyo and New York in celsius", "Multiple Tool Usage"),
]
# Execute all test cases
for query, test_name in test_cases:
run_test_case(query, test_name)
time.sleep(1) # Small delay between tests
print("\nAll tests completed.")
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
"""
Set up this example by starting a vLLM OpenAI-compatible server with tool call
options enabled for xLAM-2 models:
vllm serve --model Salesforce/Llama-xLAM-2-8b-fc-r --enable-auto-tool-choice --tool-call-parser xlam
OR
vllm serve --model Salesforce/xLAM-2-3b-fc-r --enable-auto-tool-choice --tool-call-parser xlam
This example demonstrates streaming tool calls with xLAM models.
"""
import json
import time
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "empty"
openai_api_base = "http://localhost:8000/v1"
# Define tool functions
def get_weather(location: str, unit: str):
return f"Weather in {location} is 22 degrees {unit}."
def calculate_expression(expression: str):
try:
result = eval(expression)  # demo only: eval() is unsafe on untrusted input
return f"The result of {expression} is {result}"
except Exception as e:
return f"Could not calculate {expression}: {e}"
def translate_text(text: str, target_language: str):
return f"Translation of '{text}' to {target_language}: [translated content]"
# Define tools
tools = [
{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "City and state, e.g., 'San Francisco, CA'",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location", "unit"],
},
},
},
{
"type": "function",
"function": {
"name": "calculate_expression",
"description": "Calculate a mathematical expression",
"parameters": {
"type": "object",
"properties": {
"expression": {
"type": "string",
"description": "Mathematical expression to evaluate, needs to be a valid Python expression",
}
},
"required": ["expression"],
},
},
},
{
"type": "function",
"function": {
"name": "translate_text",
"description": "Translate text to another language",
"parameters": {
"type": "object",
"properties": {
"text": {"type": "string", "description": "Text to translate"},
"target_language": {
"type": "string",
"description": "Target language for translation",
},
},
"required": ["text", "target_language"],
},
},
},
]
# Map of function names to implementations
tool_functions = {
"get_weather": get_weather,
"calculate_expression": calculate_expression,
"translate_text": translate_text,
}
def process_stream(response, tool_functions, original_query):
"""Process a streaming response with possible tool calls"""
# Track multiple tool calls
tool_calls = {} # Dictionary to store tool calls by ID
current_id = None
print("\n--- Stream Output ---")
for chunk in response:
# Handle tool calls in the stream
if chunk.choices[0].delta.tool_calls:
for tool_call_chunk in chunk.choices[0].delta.tool_calls:
# Get the tool call ID
if hasattr(tool_call_chunk, "id") and tool_call_chunk.id:
current_id = tool_call_chunk.id
if current_id not in tool_calls:
tool_calls[current_id] = {
"function_name": None,
"function_args": "",
"function_id": current_id,
}
# Extract function information as it comes in chunks
if (
hasattr(tool_call_chunk, "function")
and current_id
and current_id in tool_calls
):
if (
hasattr(tool_call_chunk.function, "name")
and tool_call_chunk.function.name
):
tool_calls[current_id]["function_name"] = (
tool_call_chunk.function.name
)
print(f"Function called: {tool_call_chunk.function.name}")
if (
hasattr(tool_call_chunk.function, "arguments")
and tool_call_chunk.function.arguments
):
tool_calls[current_id]["function_args"] += (
tool_call_chunk.function.arguments
)
print(f"Arguments chunk: {tool_call_chunk.function.arguments}")
# Handle regular content in the stream
elif chunk.choices[0].delta.content:
print(chunk.choices[0].delta.content, end="")
print("\n--- End Stream ---\n")
# Execute each function call and build messages for follow-up
follow_up_messages = [{"role": "user", "content": original_query}]
for tool_id, tool_data in tool_calls.items():
function_name = tool_data["function_name"]
function_args = tool_data["function_args"]
function_id = tool_data["function_id"]
if function_name and function_args:
try:
# Parse the JSON arguments
args = json.loads(function_args)
# Call the function with the arguments
function_result = tool_functions[function_name](**args)
print(
f"\n--- Function Result ({function_name}) ---\n{function_result}\n"
)
# Add the assistant message with tool call
follow_up_messages.append(
{
"role": "assistant",
"tool_calls": [
{
"id": function_id,
"type": "function",
"function": {
"name": function_name,
"arguments": function_args,
},
}
],
}
)
# Add the tool message with function result
follow_up_messages.append(
{
"role": "tool",
"tool_call_id": function_id,
"content": function_result,
}
)
except Exception as e:
print(f"Error executing function: {e}")
# Only send follow-up if we have results to process
if len(follow_up_messages) > 1:
# Create a follow-up message with all the function results
follow_up_response = client.chat.completions.create(
model=client.models.list().data[0].id,
messages=follow_up_messages,
stream=True,
)
print("\n--- Follow-up Response ---")
for chunk in follow_up_response:
if chunk.choices[0].delta.content:
print(chunk.choices[0].delta.content, end="")
print("\n--- End Follow-up ---\n")
def run_test_case(query, test_name):
"""Run a single test case with the given query"""
print(f"\n{'=' * 50}\nTEST CASE: {test_name}\n{'=' * 50}")
print(f"Query: '{query}'")
start_time = time.time()
# Create streaming chat completion request
response = client.chat.completions.create(
model=client.models.list().data[0].id,
messages=[{"role": "user", "content": query}],
tools=tools,
tool_choice="auto",
stream=True,
)
# Process the streaming response
process_stream(response, tool_functions, query)
end_time = time.time()
print(f"Test completed in {end_time - start_time:.2f} seconds")
def main():
# Initialize OpenAI client
global client
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
# Run test cases
test_cases = [
("I want to know the weather in San Francisco", "Weather Information"),
("Calculate 25 * 17 + 31", "Math Calculation"),
("Translate 'Hello world' to Spanish", "Text Translation"),
("What is the weather in Tokyo and New York in celsius", "Multiple Tool Usage"),
]
# Execute all test cases
for query, test_name in test_cases:
run_test_case(query, test_name)
time.sleep(1) # Small delay between tests
print("\nAll tests completed.")
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example demonstrates how to use tool calling with reasoning models
like QwQ-32B. The reasoning_content will not be parsed by the tool
calling process; only the final output will be parsed.
To run this example, you need to start the vLLM server with both
the reasoning parser and tool calling enabled.
```bash
vllm serve Qwen/QwQ-32B \
--reasoning-parser deepseek_r1 \
--enable-auto-tool-choice --tool-call-parser hermes
```
"""
from openai import OpenAI
# Now, simulate a tool call
def get_current_weather(city: str, state: str, unit: str):
return (
"The weather in Dallas, Texas is 85 degrees fahrenheit. It is "
"partly cloudy, with highs in the 90s."
)
available_tools = {"get_current_weather": get_current_weather}
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
properties = {
"city": {
"type": "string",
"description": "The city to find the weather for, e.g. 'San Francisco'",
},
"state": {
"type": "string",
"description": "the two-letter abbreviation for the state that the city is"
" in, e.g. 'CA' which would mean 'California'",
},
"unit": {
"type": "string",
"description": "The unit to fetch the temperature in",
"enum": ["celsius", "fahrenheit"],
},
}
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": properties,
"required": ["city", "state", "unit"],
},
},
}
]
messages = [
{"role": "user", "content": "Hi! How are you doing today?"},
{"role": "assistant", "content": "I'm doing well! How can I help you?"},
{
"role": "user",
"content": (
"Can you tell me what the temperate will be in Dallas, in fahrenheit?"
),
},
]
def extract_reasoning_and_calls(chunks: list):
reasoning_content = ""
tool_call_idx = -1
arguments = []
function_names = []
for chunk in chunks:
if chunk.choices[0].delta.tool_calls:
tool_call = chunk.choices[0].delta.tool_calls[0]
if tool_call.index != tool_call_idx:
tool_call_idx = chunk.choices[0].delta.tool_calls[0].index
arguments.append("")
function_names.append("")
if tool_call.function:
if tool_call.function.name:
function_names[tool_call_idx] = tool_call.function.name
if tool_call.function.arguments:
arguments[tool_call_idx] += tool_call.function.arguments
else:
if hasattr(chunk.choices[0].delta, "reasoning_content"):
reasoning_content += chunk.choices[0].delta.reasoning_content
return reasoning_content, arguments, function_names
def main():
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
print("---------Full Generate With Automatic Function Calling-------------")
tool_calls = client.chat.completions.create(
messages=messages, model=model, tools=tools
)
print(f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}")
print(f"function name: {tool_calls.choices[0].message.tool_calls[0].function.name}")
print(
f"function arguments: "
f"{tool_calls.choices[0].message.tool_calls[0].function.arguments}"
)
print("----------Stream Generate With Automatic Function Calling-----------")
tool_calls_stream = client.chat.completions.create(
messages=messages, model=model, tools=tools, stream=True
)
chunks = list(tool_calls_stream)
reasoning_content, arguments, function_names = extract_reasoning_and_calls(chunks)
print(f"reasoning_content: {reasoning_content}")
print(f"function name: {function_names[0]}")
print(f"function arguments: {arguments[0]}")
print("----------Full Generate With Named Function Calling-----------------")
tool_calls = client.chat.completions.create(
messages=messages,
model=model,
tools=tools,
tool_choice={"type": "function", "function": {"name": "get_current_weather"}},
)
tool_call = tool_calls.choices[0].message.tool_calls[0].function
print(f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}")
print(f"function name: {tool_call.name}")
print(f"function arguments: {tool_call.arguments}")
print("----------Stream Generate With Named Function Calling--------------")
tool_calls_stream = client.chat.completions.create(
messages=messages,
model=model,
tools=tools,
tool_choice={"type": "function", "function": {"name": "get_current_weather"}},
stream=True,
)
chunks = list(tool_calls_stream)
reasoning_content, arguments, function_names = extract_reasoning_and_calls(chunks)
print(f"reasoning_content: {reasoning_content}")
print(f"function name: {function_names[0]}")
print(f"function arguments: {arguments[0]}")
print("\n\n")
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to generate chat completions from reasoning models
like DeepSeek-R1.
To run this example, you need to start the vLLM server
with the reasoning parser:
```bash
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
--reasoning-parser deepseek_r1
```
This example demonstrates how to generate chat completions from reasoning models
using the OpenAI Python client library.
"""
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
def main():
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
# Round 1
messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
# ruff: noqa: E501
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
response = client.chat.completions.create(model=model, messages=messages)
reasoning_content = response.choices[0].message.reasoning_content
content = response.choices[0].message.content
print("reasoning_content for Round 1:", reasoning_content)
print("content for Round 1:", content)
# Round 2
messages.append({"role": "assistant", "content": content})
messages.append(
{
"role": "user",
"content": "How many Rs are there in the word 'strawberry'?",
}
)
response = client.chat.completions.create(model=model, messages=messages)
reasoning_content = response.choices[0].message.reasoning_content
content = response.choices[0].message.content
print("reasoning_content for Round 2:", reasoning_content)
print("content for Round 2:", content)
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to generate chat completions from reasoning models
like DeepSeek-R1.
To run this example, you need to start the vLLM server with the reasoning
parser:
```bash
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
--reasoning-parser deepseek_r1
```
Unlike openai_chat_completion_with_reasoning.py, this example demonstrates the
streaming chat completions feature.
The streaming chat completions feature allows you to receive chat completions
in real-time as they are generated by the model. This is useful for scenarios
where you want to display chat completions to the user as they are generated
by the model.
Remember to check that content and reasoning_content exist in
`ChatCompletionChunk`; content may be missing, which leads to errors if you
try to access it.
"""
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
def main():
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
# ruff: noqa: E501
# For granite: add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
stream = client.chat.completions.create(model=model, messages=messages, stream=True)
print("client: Start streaming chat completions...")
printed_reasoning_content = False
printed_content = False
for chunk in stream:
reasoning_content = None
content = None
# Check whether the delta carries reasoning_content or regular content
if hasattr(chunk.choices[0].delta, "reasoning_content"):
reasoning_content = chunk.choices[0].delta.reasoning_content
elif hasattr(chunk.choices[0].delta, "content"):
content = chunk.choices[0].delta.content
if reasoning_content is not None:
if not printed_reasoning_content:
printed_reasoning_content = True
print("reasoning_content:", end="", flush=True)
print(reasoning_content, end="", flush=True)
elif content is not None:
if not printed_content:
printed_content = True
print("\ncontent:", end="", flush=True)
# Extract and print the content
print(content, end="", flush=True)
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import base64
import io
import requests
from PIL import Image
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
def vlm2vec():
response = requests.post(
"http://localhost:8000/v1/embeddings",
json={
"model": "TIGER-Lab/VLM2Vec-Full",
"messages": [
{
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": image_url}},
{"type": "text", "text": "Represent the given image."},
],
}
],
"encoding_format": "float",
},
)
response.raise_for_status()
response_json = response.json()
print("Embedding output:", response_json["data"][0]["embedding"])
def dse_qwen2_vl(inp: dict):
# Embedding an Image
if inp["type"] == "image":
messages = [
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": inp["image_url"],
},
},
{"type": "text", "text": "What is shown in this image?"},
],
}
]
# Embedding a Text Query
else:
# MrLight/dse-qwen2-2b-mrl-v1 requires a placeholder image
# of the minimum input size
buffer = io.BytesIO()
image_placeholder = Image.new("RGB", (56, 56))
image_placeholder.save(buffer, "png")
buffer.seek(0)
image_placeholder = base64.b64encode(buffer.read()).decode("utf-8")
messages = [
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{image_placeholder}",
},
},
{"type": "text", "text": f"Query: {inp['content']}"},
],
}
]
response = requests.post(
"http://localhost:8000/v1/embeddings",
json={
"model": "MrLight/dse-qwen2-2b-mrl-v1",
"messages": messages,
"encoding_format": "float",
},
)
response.raise_for_status()
response_json = response.json()
print("Embedding output:", response_json["data"][0]["embedding"])
def parse_args():
parser = argparse.ArgumentParser(
"Script to call a specified VLM through the API. Make sure to serve "
"the model with --task embed before running this."
)
parser.add_argument(
"--model",
type=str,
choices=["vlm2vec", "dse_qwen2_vl"],
required=True,
help="Which model to call.",
)
return parser.parse_args()
def main(args):
if args.model == "vlm2vec":
vlm2vec()
elif args.model == "dse_qwen2_vl":
dse_qwen2_vl(
{
"type": "image",
"image_url": image_url,
}
)
dse_qwen2_vl(
{
"type": "text",
"content": "What is the weather like today?",
}
)
if __name__ == "__main__":
args = parse_args()
main(args)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import pprint
import requests
def post_http_request(payload: dict, api_url: str) -> requests.Response:
headers = {"User-Agent": "Test Client"}
response = requests.post(api_url, headers=headers, json=payload)
return response
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--host", type=str, default="localhost")
parser.add_argument("--port", type=int, default=8000)
parser.add_argument("--model", type=str, default="jason9693/Qwen2.5-1.5B-apeach")
return parser.parse_args()
def main(args):
host = args.host
port = args.port
model_name = args.model
api_url = f"http://{host}:{port}/classify"
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
payload = {
"model": model_name,
"input": prompts,
}
classify_response = post_http_request(payload=payload, api_url=api_url)
pprint.pprint(classify_response.json())
if __name__ == "__main__":
args = parse_args()
main(args)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
def parse_args():
parser = argparse.ArgumentParser(description="Client for vLLM API server")
parser.add_argument(
"--stream", action="store_true", help="Enable streaming response"
)
return parser.parse_args()
def main(args):
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
# Completion API
completion = client.completions.create(
model=model,
prompt="A robot may not injure a human being",
echo=False,
n=2,
stream=args.stream,
logprobs=3,
)
print("-" * 50)
print("Completion results:")
if args.stream:
for c in completion:
print(c)
else:
print(completion)
print("-" * 50)
if __name__ == "__main__":
args = parse_args()
main(args)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Example online usage of Score API.
Run `vllm serve <model> --task score` to start up the server in vLLM.
"""
import argparse
import pprint
import requests
def post_http_request(prompt: dict, api_url: str) -> requests.Response:
headers = {"User-Agent": "Test Client"}
response = requests.post(api_url, headers=headers, json=prompt)
return response
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--host", type=str, default="localhost")
parser.add_argument("--port", type=int, default=8000)
parser.add_argument("--model", type=str, default="BAAI/bge-reranker-v2-m3")
return parser.parse_args()
def main(args):
api_url = f"http://{args.host}:{args.port}/score"
model_name = args.model
text_1 = "What is the capital of Brazil?"
text_2 = "The capital of Brazil is Brasilia."
prompt = {"model": model_name, "text_1": text_1, "text_2": text_2}
score_response = post_http_request(prompt=prompt, api_url=api_url)
print("\nPrompt when text_1 and text_2 are both strings:")
pprint.pprint(prompt)
print("\nScore Response:")
pprint.pprint(score_response.json())
text_1 = "What is the capital of France?"
text_2 = ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]
prompt = {"model": model_name, "text_1": text_1, "text_2": text_2}
score_response = post_http_request(prompt=prompt, api_url=api_url)
print("\nPrompt when text_1 is string and text_2 is a list:")
pprint.pprint(prompt)
print("\nScore Response:")
pprint.pprint(score_response.json())
text_1 = ["What is the capital of Brazil?", "What is the capital of France?"]
text_2 = ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]
prompt = {"model": model_name, "text_1": text_1, "text_2": text_2}
score_response = post_http_request(prompt=prompt, api_url=api_url)
print("\nPrompt when text_1 and text_2 are both lists:")
pprint.pprint(prompt)
print("\nScore Response:")
pprint.pprint(score_response.json())
if __name__ == "__main__":
args = parse_args()
main(args)
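# For reference only (an assumption about the response schema, not captured
# output): each /score response is expected to contain one score per
# (text_1, text_2) pair, roughly of the form
#   {"data": [{"index": 0, "object": "score", "score": 0.98}, ...], ...}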