Merge tag 'v0.9.2' into v0.9.2-ori

99324e25 · zhuwenwen · cc7f22a8 · a5dd03c1 · 99324e25 · 99324e25
Commit 99324e25 authored Jul 12, 2025 by zhuwenwen
20 changed files
--- a/examples/offline_inference/vision_language_embedding.py
+++ b/examples/offline_inference/vision_language_embedding.py
@@ -94,6 +94,7 @@ def run_vlm2vec(query: Query) -> ModelRequestData:
    engine_args = EngineArgs(
        model="TIGER-Lab/VLM2Vec-Full",
        task="embed",
+        max_model_len=4096,
        trust_remote_code=True,
        mm_processor_kwargs={"num_crops": 4},
        limit_mm_per_prompt={"image": 1},

--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -289,6 +289,106 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
    )


+def load_llava(question: str, image_urls: list[str]) -> ModelRequestData:
+    """Build a multi-image request for ``llava-hf/llava-1.5-7b-hf``.
+
+    The prompt is produced by the model's Hugging Face processor chat template
+    from a single user turn holding one image entry per URL followed by the
+    question text. The images are fetched and returned alongside the prompt.
+    """
+    # NOTE: CAUTION! The original LLaVA models weren't really trained on
+    # multi-image inputs, so they will generate poor responses for them!
+    model_name = "llava-hf/llava-1.5-7b-hf"
+    engine_args = EngineArgs(
+        model=model_name,
+        max_num_seqs=16,
+        # Allow as many images per prompt as this request carries.
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    # One image entry per URL, placed before the question text.
+    placeholders = [{"type": "image", "image": url} for url in image_urls]
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                *placeholders,
+                {"type": "text", "text": question},
+            ],
+        }
+    ]
+
+    processor = AutoProcessor.from_pretrained(model_name)
+
+    prompt = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=[fetch_image(url) for url in image_urls],
+    )
+
+
+def load_llava_next(question: str, image_urls: list[str]) -> ModelRequestData:
+    """Build a multi-image request for ``llava-hf/llava-v1.6-mistral-7b-hf``.
+
+    The prompt is produced by the model's Hugging Face processor chat template
+    from a single user turn holding one image entry per URL followed by the
+    question text. The images are fetched and returned alongside the prompt.
+    """
+    model_name = "llava-hf/llava-v1.6-mistral-7b-hf"
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=8192,
+        max_num_seqs=16,
+        # Allow as many images per prompt as this request carries.
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    # One image entry per URL, placed before the question text.
+    placeholders = [{"type": "image", "image": url} for url in image_urls]
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                *placeholders,
+                {"type": "text", "text": question},
+            ],
+        }
+    ]
+
+    processor = AutoProcessor.from_pretrained(model_name)
+
+    prompt = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=[fetch_image(url) for url in image_urls],
+    )
+
+
+def load_llava_onevision(question: str, image_urls: list[str]) -> ModelRequestData:
+    """Build a multi-image request for ``llava-hf/llava-onevision-qwen2-7b-ov-hf``.
+
+    The prompt is produced by the model's Hugging Face processor chat template
+    from a single user turn holding one image entry per URL followed by the
+    question text. The images are fetched and returned alongside the prompt.
+    """
+    model_name = "llava-hf/llava-onevision-qwen2-7b-ov-hf"
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=16384,
+        max_num_seqs=16,
+        # Allow as many images per prompt as this request carries.
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    # One image entry per URL, placed before the question text.
+    placeholders = [{"type": "image", "image": url} for url in image_urls]
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                *placeholders,
+                {"type": "text", "text": question},
+            ],
+        }
+    ]
+
+    processor = AutoProcessor.from_pretrained(model_name)
+
+    prompt = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=[fetch_image(url) for url in image_urls],
+    )
+
+
 def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"

@@ -323,6 +423,43 @@ def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
    )


+def load_keye_vl(question: str, image_urls: list[str]) -> ModelRequestData:
+    """Build a multi-image request for ``Kwai-Keye/Keye-VL-8B-Preview``.
+
+    Both the engine and the Hugging Face processor are loaded with
+    ``trust_remote_code=True``. The prompt comes from the processor's chat
+    template applied to one user turn (one image entry per URL, then the
+    question), and the fetched images are returned with it.
+    """
+    model_name = "Kwai-Keye/Keye-VL-8B-Preview"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=8192,
+        max_num_seqs=5,
+        # Allow as many images per prompt as this request carries.
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    # One image entry per URL, placed before the question text.
+    placeholders = [{"type": "image", "image": url} for url in image_urls]
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                *placeholders,
+                {"type": "text", "text": question},
+            ],
+        },
+    ]
+
+    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
+
+    prompt = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+
+    image_data = [fetch_image(url) for url in image_urls]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=image_data,
+    )
+
+
 def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "moonshotai/Kimi-VL-A3B-Instruct"

@@ -368,6 +505,7 @@ def load_mistral3(question: str, image_urls: list[str]) -> ModelRequestData:
        max_num_seqs=2,
        tensor_parallel_size=2,
        limit_mm_per_prompt={"image": len(image_urls)},
+        ignore_patterns=["consolidated.safetensors"],
    )

    placeholders = "[IMG]" * len(image_urls)
@@ -728,6 +866,32 @@ def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData:
    )


+def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData:
+    """Build a multi-image request for ``omni-research/Tarsier2-Recap-7b``.
+
+    Unlike the loaders that use a processor chat template, the prompt here is
+    assembled by hand: a fixed system turn, then a user turn containing one
+    ``<|image_pad|>`` token per image inside a single
+    ``<|vision_start|>``/``<|vision_end|>`` pair, followed by the question.
+    """
+    model_name = "omni-research/Tarsier2-Recap-7b"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=32768,
+        # Allow as many images per prompt as this request carries.
+        limit_mm_per_prompt={"image": len(image_urls)},
+        # Override the architecture name read from the HF config.
+        hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]},
+    )
+
+    prompt = (
+        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+        f"<|im_start|>user\n<|vision_start|>{'<|image_pad|>' * len(image_urls)}"
+        f"<|vision_end|>{question}<|im_end|>\n"
+        "<|im_start|>assistant\n"
+    )
+    image_data = [fetch_image(url) for url in image_urls]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=image_data,
+    )
+
+
 model_example_map = {
    "aria": load_aria,
    "aya_vision": load_aya_vision,
@@ -736,7 +900,11 @@ model_example_map = {
    "h2ovl_chat": load_h2ovl,
    "idefics3": load_idefics3,
    "internvl_chat": load_internvl,
+    "keye_vl": load_keye_vl,
    "kimi_vl": load_kimi_vl,
+    "llava": load_llava,
+    "llava-next": load_llava_next,
+    "llava-onevision": load_llava_onevision,
    "llama4": load_llama4,
    "mistral3": load_mistral3,
    "mllama": load_mllama,
@@ -750,6 +918,7 @@ model_example_map = {
    "qwen2_5_vl": load_qwen2_5_vl,
    "smolvlm": load_smolvlm,
    "tarsier": load_tarsier,
+    "tarsier2": load_tarsier2,
 }



--- a/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh
+++ b/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh
+#!/bin/bash
+
+# =============================================================================
+# vLLM Disaggregated Serving Script - P2P NCCL XpYd Architecture
+# =============================================================================
+# This script demonstrates disaggregated prefill and decode serving using
+# P2P NCCL communication. The architecture supports various XpYd configurations:
+#
+# - 1P3D: 1 Prefill server + 3 Decode servers (current default)
+# - 3P1D: 3 Prefill servers + 1 Decode server
+# - etc.
+#
+# Configuration can be customized via environment variables:
+#   MODEL: Model to serve
+#   PREFILL_GPUS: Comma-separated GPU IDs for prefill servers
+#   DECODE_GPUS: Comma-separated GPU IDs for decode servers
+#   PREFILL_PORTS: Comma-separated ports for prefill servers
+#   DECODE_PORTS: Comma-separated ports for decode servers
+#   PROXY_PORT: Proxy server port used to setup XpYd connection.
+#   TIMEOUT_SECONDS: Server startup timeout
+# =============================================================================
+
+# Configuration - can be overridden via environment variables
+MODEL=${MODEL:-meta-llama/Llama-3.1-8B-Instruct}
+# Maximum time wait_for_server polls a single server before giving up.
+TIMEOUT_SECONDS=${TIMEOUT_SECONDS:-1200}
+# Passed to every vLLM server as the proxy's registration port. The proxy
+# script is launched without arguments, so this value has to match the port
+# that script listens on for registrations.
+PROXY_PORT=${PROXY_PORT:-30001}
+
+# Default 1P3D configuration (1 Prefill + 3 Decode)
+# GPU and port lists are matched by position: entry i of *_GPUS is served on
+# entry i of *_PORTS.
+PREFILL_GPUS=${PREFILL_GPUS:-0}
+DECODE_GPUS=${DECODE_GPUS:-1,2,3}
+PREFILL_PORTS=${PREFILL_PORTS:-20003}
+DECODE_PORTS=${DECODE_PORTS:-20005,20007,20009} 
+
+echo "Warning: P2P NCCL disaggregated prefill XpYd support for vLLM v1 is experimental and subject to change."
+echo ""
+echo "Architecture Configuration:"
+echo "  Model: $MODEL"
+echo "  Prefill GPUs: $PREFILL_GPUS, Ports: $PREFILL_PORTS"
+echo "  Decode GPUs: $DECODE_GPUS, Ports: $DECODE_PORTS"
+echo "  Proxy Port: $PROXY_PORT"
+echo "  Timeout: ${TIMEOUT_SECONDS}s"
+echo ""
+
+# PIDs of the background processes started by main (bookkeeping only;
+# cleanup signals the whole process group instead of iterating this list).
+PIDS=()
+
+# Switch to the directory of the current script
+cd "$(dirname "${BASH_SOURCE[0]}")"
+
+# Abort (exit 1) unless every helper file this script depends on exists in the
+# current directory (the script's own directory after the cd above).
+check_required_files() {
+    local files=("disagg_proxy_p2p_nccl_xpyd.py")
+    for file in "${files[@]}"; do
+        if [[ ! -f "$file" ]]; then
+            echo "Required file $file not found in $(pwd)"
+            exit 1
+        fi
+    done
+}
+
+# Abort (exit 1) unless HF_TOKEN is set and starts with "hf_".
+# This is only a prefix check; the token is not verified against the Hub.
+check_hf_token() {
+    if [ -z "$HF_TOKEN" ]; then
+        echo "HF_TOKEN is not set. Please set it to your Hugging Face token."
+        echo "Example: export HF_TOKEN=your_token_here"
+        exit 1
+    fi
+    if [[ "$HF_TOKEN" != hf_* ]]; then
+        echo "HF_TOKEN is not a valid Hugging Face token. Please set it to your Hugging Face token."
+        exit 1
+    fi
+    echo "HF_TOKEN is set and valid."
+}
+
+# Abort (exit 1) when nvidia-smi reports fewer than two GPUs.
+# NOTE: the threshold is a fixed minimum; it is not derived from the number of
+# GPUs listed in PREFILL_GPUS / DECODE_GPUS.
+check_num_gpus() {
+    # Check if the number of GPUs are >=2 via nvidia-smi
+    # One output line per GPU, so the line count is the GPU count.
+    num_gpus=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
+    if [ "$num_gpus" -lt 2 ]; then
+        echo "You need at least 2 GPUs to run disaggregated prefill."
+        exit 1
+    else
+        echo "Found $num_gpus GPUs."
+    fi
+}
+
+# Abort (exit 1) unless the Python module named by $1 can be imported with
+# python3. $1 is used both as the import name and in the pip hint.
+ensure_python_library_installed() {
+    echo "Checking if $1 is installed..."
+    if ! python3 -c "import $1" > /dev/null 2>&1; then
+        echo "$1 is not installed. Please install it via pip install $1."
+        exit 1
+    else
+        echo "$1 is installed."
+    fi
+}
+
+# Stop every process in this script's process group, reap the children and
+# exit.
+#   $1 (optional): exit status to finish with; defaults to 0 so the signal
+#                  traps and existing callers behave as before.
+cleanup() {
+    local exit_code=${1:-0}
+    echo "Stopping everything…"
+    # Ignore (rather than reset) the handled signals: the group-wide kill below
+    # is also delivered to this shell, and with the default disposition it
+    # would terminate us before `wait` and `exit` get a chance to run.
+    # This also prevents re-entrancy.
+    trap '' INT TERM USR1
+    kill -- -$$            # negative PID  ==  "this whole process-group"
+    wait                   # reap children so we don't leave zombies
+    exit "$exit_code"
+}
+
+# Poll localhost:$1 once per second until the server answers or
+# TIMEOUT_SECONDS elapse.
+#   $1: port to probe
+# Returns 0 once curl gets a response, 1 on timeout.
+wait_for_server() {
+  local port=$1
+  local timeout_seconds=$TIMEOUT_SECONDS
+  local start_time=$(date +%s)
+
+  echo "Waiting for server on port $port..."
+
+  while true; do
+    # curl is run without --fail, so any HTTP response (even an error status)
+    # counts as "ready"; only connection-level failures keep the loop going.
+    if curl -s "localhost:${port}/v1/completions" > /dev/null; then
+      echo "Server on port $port is ready."
+      return 0
+    fi
+
+    local now=$(date +%s)
+    if (( now - start_time >= timeout_seconds )); then
+      echo "Timeout waiting for server on port $port"
+      return 1
+    fi
+
+    sleep 1
+  done
+}
+
+# Entry point: validate the environment, start the proxy plus all prefill and
+# decode servers in the background, wait until every server answers, run the
+# serving benchmark through the proxy, then tear everything down.
+main() {
+    check_required_files
+    check_hf_token
+    check_num_gpus
+    ensure_python_library_installed pandas
+    ensure_python_library_installed datasets
+    ensure_python_library_installed vllm
+    ensure_python_library_installed quart
+
+    # Parse GPU and port arrays
+    IFS=',' read -ra PREFILL_GPU_ARRAY <<< "$PREFILL_GPUS"
+    IFS=',' read -ra DECODE_GPU_ARRAY <<< "$DECODE_GPUS"
+    IFS=',' read -ra PREFILL_PORT_ARRAY <<< "$PREFILL_PORTS"
+    IFS=',' read -ra DECODE_PORT_ARRAY <<< "$DECODE_PORTS"
+
+    # GPUs and ports are paired by index, so a length mismatch would launch a
+    # server with an empty --port. Fail before anything is started.
+    if (( ${#PREFILL_GPU_ARRAY[@]} != ${#PREFILL_PORT_ARRAY[@]} )); then
+        echo "PREFILL_GPUS and PREFILL_PORTS must contain the same number of entries."
+        exit 1
+    fi
+    if (( ${#DECODE_GPU_ARRAY[@]} != ${#DECODE_PORT_ARRAY[@]} )); then
+        echo "DECODE_GPUS and DECODE_PORTS must contain the same number of entries."
+        exit 1
+    fi
+
+    trap cleanup INT
+    trap cleanup USR1
+    trap cleanup TERM
+
+    echo "Launching disaggregated serving components..."
+    echo "Please check the log files for detailed output:"
+    echo "  - prefill*.log: Prefill server logs"
+    echo "  - decode*.log: Decode server logs"
+    echo "  - proxy.log: Proxy server log"
+
+    # =============================================================================
+    # Launch Proxy Server
+    # =============================================================================
+    echo ""
+    echo "Starting proxy server on port $PROXY_PORT..."
+    # Redirect to proxy.log so the log file announced above actually exists.
+    python3 disagg_proxy_p2p_nccl_xpyd.py > proxy.log 2>&1 &
+    PIDS+=($!)
+
+    # =============================================================================
+    # Launch Prefill Servers (X Producers)
+    # =============================================================================
+    echo ""
+    echo "Starting ${#PREFILL_GPU_ARRAY[@]} prefill server(s)..."
+    for i in "${!PREFILL_GPU_ARRAY[@]}"; do
+        local gpu_id=${PREFILL_GPU_ARRAY[$i]}
+        local port=${PREFILL_PORT_ARRAY[$i]}
+        # KV ports are assigned sequentially per prefill server.
+        local kv_port=$((21001 + i))
+
+        echo "  Prefill server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
+        CUDA_VISIBLE_DEVICES=$gpu_id VLLM_USE_V1=1 vllm serve "$MODEL" \
+        --enforce-eager \
+        --host 0.0.0.0 \
+        --port "$port" \
+        --tensor-parallel-size 1 \
+        --seed 1024 \
+        --dtype float16 \
+        --max-model-len 10000 \
+        --max-num-batched-tokens 10000 \
+        --max-num-seqs 256 \
+        --trust-remote-code \
+        --gpu-memory-utilization 0.9 \
+        --disable-log-request \
+        --kv-transfer-config \
+        "{\"kv_connector\":\"P2pNcclConnector\",\"kv_role\":\"kv_producer\",\"kv_buffer_size\":\"1e1\",\"kv_port\":\"$kv_port\",\"kv_connector_extra_config\":{\"proxy_ip\":\"0.0.0.0\",\"proxy_port\":\"$PROXY_PORT\",\"http_port\":\"$port\",\"send_type\":\"PUT_ASYNC\",\"nccl_num_channels\":\"16\"}}" > prefill$((i+1)).log 2>&1 &
+        PIDS+=($!)
+    done
+
+    # =============================================================================
+    # Launch Decode Servers (Y Decoders)
+    # =============================================================================
+    echo ""
+    echo "Starting ${#DECODE_GPU_ARRAY[@]} decode server(s)..."
+    for i in "${!DECODE_GPU_ARRAY[@]}"; do
+        local gpu_id=${DECODE_GPU_ARRAY[$i]}
+        local port=${DECODE_PORT_ARRAY[$i]}
+        # KV ports are assigned sequentially per decode server, in a range
+        # separate from the prefill servers.
+        local kv_port=$((22001 + i))
+
+        echo "  Decode server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
+        VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=$gpu_id vllm serve "$MODEL" \
+        --enforce-eager \
+        --host 0.0.0.0 \
+        --port "$port" \
+        --tensor-parallel-size 1 \
+        --seed 1024 \
+        --dtype float16 \
+        --max-model-len 10000 \
+        --max-num-batched-tokens 10000 \
+        --max-num-seqs 256 \
+        --trust-remote-code \
+        --gpu-memory-utilization 0.7 \
+        --disable-log-request \
+        --kv-transfer-config \
+        "{\"kv_connector\":\"P2pNcclConnector\",\"kv_role\":\"kv_consumer\",\"kv_buffer_size\":\"8e9\",\"kv_port\":\"$kv_port\",\"kv_connector_extra_config\":{\"proxy_ip\":\"0.0.0.0\",\"proxy_port\":\"$PROXY_PORT\",\"http_port\":\"$port\",\"send_type\":\"PUT_ASYNC\",\"nccl_num_channels\":\"16\"}}" > decode$((i+1)).log 2>&1 &
+        PIDS+=($!)
+    done
+
+    # =============================================================================
+    # Wait for All Servers to Start
+    # =============================================================================
+    echo ""
+    echo "Waiting for all servers to start..."
+    for port in "${PREFILL_PORT_ARRAY[@]}" "${DECODE_PORT_ARRAY[@]}"; do
+        if ! wait_for_server "$port"; then
+            echo "Failed to start server on port $port"
+            # Ask for a failing exit status (cleanup may ignore the argument).
+            # cleanup normally ends the script itself; the explicit exit is
+            # only a fallback in case it ever returns.
+            cleanup 1
+            exit 1
+        fi
+    done
+
+    echo ""
+    echo "All servers are up. Starting benchmark..."
+
+    # =============================================================================
+    # Run Benchmark
+    # =============================================================================
+    # Requests go to port 10001, the HTTP port the proxy script serves on.
+    cd ../../../benchmarks/
+    python3 benchmark_serving.py --port 10001 --seed $(date +%s) \
+        --model "$MODEL" \
+        --dataset-name random --random-input-len 7500 --random-output-len 200 \
+        --num-prompts 200 --burstiness 100 --request-rate 2 | tee benchmark.log
+
+    echo "Benchmarking done. Cleaning up..."
+
+    cleanup
+}
+
+main
\ No newline at end of file
--- a/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py
+++ b/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import os
+import socket
+import threading
+import uuid
+
+import aiohttp
+import msgpack
+import zmq
+from quart import Quart, make_response, request
+
+# Number of requests handled so far; used as the round-robin index when
+# picking a prefill and a decode instance.
+count = 0
+# Registries filled by the service-discovery listener thread.
+prefill_instances: dict[str, str] = {}  # http_address: zmq_address
+decode_instances: dict[str, str] = {}  # http_address: zmq_address
+
+# Guard the registries above. In this file they are only used as locks
+# (`with ...:`); nothing waits on or notifies them.
+prefill_cv = threading.Condition()
+decode_cv = threading.Condition()
+
+
+def _listen_for_register(poller, router_socket):
+    """Record prefill/decode instances that announce themselves over ZMQ.
+
+    Runs forever (it is the target of the discovery thread): every message
+    received on ``router_socket`` is msgpack-decoded and, depending on its
+    ``"type"`` field, stored in ``prefill_instances`` ("P") or
+    ``decode_instances`` ("D") as ``http_address -> zmq_address``. Messages of
+    any other type are reported on stdout and ignored.
+
+    Args:
+        poller: ZMQ poller on which ``router_socket`` is registered.
+        router_socket: ROUTER socket the instances send registrations to.
+    """
+    while True:
+        socks = dict(poller.poll())
+        if router_socket not in socks:
+            continue
+
+        remote_address, message = router_socket.recv_multipart()
+        # data: {"type": "P", "http_address": "ip:port",
+        #        "zmq_address": "ip:port"}
+        data = msgpack.loads(message)
+        # The registries are only mutated, never rebound, so no `global`
+        # declarations are needed here.
+        if data["type"] == "P":
+            with prefill_cv:
+                prefill_instances[data["http_address"]] = data["zmq_address"]
+        elif data["type"] == "D":
+            with decode_cv:
+                decode_instances[data["http_address"]] = data["zmq_address"]
+        else:
+            # print() does not interpolate %s placeholders, so format the
+            # message explicitly.
+            print(
+                f"Unexpected, Received message from {remote_address}, "
+                f"data: {data}"
+            )
+
+
+def start_service_discovery(hostname, port):
+    """Bind the registration socket and start the listener thread.
+
+    Args:
+        hostname: Address to bind; a falsy value falls back to
+            ``socket.gethostname()``.
+        port: TCP port to bind. Must not be 0.
+
+    Returns:
+        The started daemon thread running ``_listen_for_register``.
+
+    Raises:
+        ValueError: If ``port`` is 0.
+    """
+    if not hostname:
+        hostname = socket.gethostname()
+    if port == 0:
+        raise ValueError("Port cannot be 0")
+
+    context = zmq.Context()
+    router_socket = context.socket(zmq.ROUTER)
+    router_socket.bind(f"tcp://{hostname}:{port}")
+
+    poller = zmq.Poller()
+    poller.register(router_socket, zmq.POLLIN)
+
+    # Daemon thread: it loops forever and must not keep the process alive.
+    _listener_thread = threading.Thread(
+        target=_listen_for_register, args=[poller, router_socket], daemon=True
+    )
+    _listener_thread.start()
+    return _listener_thread
+
+
+# Total per-request timeout for forwarded calls: 6 hours, in seconds.
+AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
+
+# The Quart application that exposes the proxy's HTTP endpoint.
+app = Quart(__name__)
+
+
+def random_uuid() -> str:
+    """Return a random UUID4 as a 32-character hexadecimal string."""
+    return str(uuid.uuid4().hex)
+
+
+async def forward_request(url, data, request_id):
+    """POST ``data`` as JSON to ``url`` and stream the response body back.
+
+    Async generator. The request carries ``request_id`` in the
+    ``X-Request-Id`` header and a bearer token taken from the
+    ``OPENAI_API_KEY`` environment variable.
+
+    Args:
+        url: Full URL of the upstream endpoint.
+        data: JSON-serialisable request payload.
+        request_id: Identifier forwarded in the ``X-Request-Id`` header.
+
+    Yields:
+        Raw response chunks of up to 1024 bytes. Nothing is yielded when the
+        upstream server answers with a status other than 200; the failure is
+        reported on stdout instead.
+    """
+    headers = {
+        "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
+        "X-Request-Id": request_id,
+    }
+    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+        async with session.post(url=url, json=data, headers=headers) as response:
+            if response.status != 200:
+                # Previously dropped silently; make the failure visible while
+                # keeping the "yield nothing" contract for callers.
+                print(f"Request to {url} failed with HTTP status {response.status}")
+                return
+            async for chunk_bytes in response.content.iter_chunked(1024):
+                yield chunk_bytes
+
+
+@app.route("/v1/completions", methods=["POST"])
+async def handle_request():
+    """Serve a completion by running prefill and decode on separate instances.
+
+    A prefill and a decode instance are picked round-robin from the registered
+    instances. The request is first sent to the prefill instance with
+    ``max_tokens`` forced to 1 and its output is drained and discarded. The
+    original request is then sent to the decode instance and that response is
+    streamed back to the client. Both upstream calls share a request id that
+    embeds the ZMQ addresses of the chosen pair.
+
+    Returns:
+        The streamed decode response; a JSON error with status 503 when no
+        prefill or decode instance has registered yet; a JSON error with
+        status 500 when an unexpected exception occurs.
+    """
+    global count
+    try:
+        original_request_data = await request.get_json()
+
+        prefill_request = original_request_data.copy()
+        # change max_tokens = 1 to let it only do prefill
+        prefill_request["max_tokens"] = 1
+
+        # Snapshot both registries under their locks.
+        with prefill_cv:
+            prefill_list = list(prefill_instances.items())
+        with decode_cv:
+            decode_list = list(decode_instances.items())
+
+        # Without this guard the modulo below divides by zero until the first
+        # instances have registered.
+        if not prefill_list or not decode_list:
+            return {
+                "error": "No prefill or decode instance has registered yet."
+            }, 503
+
+        prefill_addr, prefill_zmq_addr = prefill_list[count % len(prefill_list)]
+        decode_addr, decode_zmq_addr = decode_list[count % len(decode_list)]
+
+        print(
+            f"handle_request count: {count}, [HTTP:{prefill_addr}, "
+            f"ZMQ:{prefill_zmq_addr}] 👉 [HTTP:{decode_addr}, "
+            f"ZMQ:{decode_zmq_addr}]"
+        )
+        count += 1
+
+        request_id = (
+            f"___prefill_addr_{prefill_zmq_addr}___decode_addr_"
+            f"{decode_zmq_addr}_{random_uuid()}"
+        )
+
+        # finish prefill
+        async for _ in forward_request(
+            f"http://{prefill_addr}/v1/completions", prefill_request, request_id
+        ):
+            continue
+
+        # return decode
+        generator = forward_request(
+            f"http://{decode_addr}/v1/completions", original_request_data, request_id
+        )
+        response = await make_response(generator)
+        response.timeout = None
+
+        return response
+
+    except Exception as e:
+        import traceback
+
+        print("Error occurred in disagg prefill proxy server")
+        print(e)
+        print(traceback.format_exc())
+        # A view must not return None; report the failure to the client.
+        return {"error": str(e)}, 500
+
+
+if __name__ == "__main__":
+    # Port 30001 receives instance registrations over ZMQ; port 10001 serves
+    # the HTTP completions endpoint.
+    t = start_service_discovery("0.0.0.0", 30001)
+    app.run(host="0.0.0.0", port=10001)
+    t.join()
--- a/examples/online_serving/multi_instance_data_parallel.py
+++ b/examples/online_serving/multi_instance_data_parallel.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import asyncio
 from typing import Optional


--- a/examples/online_serving/openai_chat_completion_client_with_tools_xlam.py
+++ b/examples/online_serving/openai_chat_completion_client_with_tools_xlam.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# ruff: noqa: E501
+"""
+Set up this example by starting a vLLM OpenAI-compatible server with tool call
+options enabled for xLAM-2 models:
+
+vllm serve Salesforce/Llama-xLAM-2-8b-fc-r --enable-auto-tool-choice --tool-call-parser xlam
+
+OR
+
+vllm serve Salesforce/xLAM-2-3b-fc-r --enable-auto-tool-choice --tool-call-parser xlam
+"""
+
+import json
+import time
+
+from openai import OpenAI
+
+# Modify OpenAI's API key and API base to use vLLM's API server.
+openai_api_key = "empty"
+openai_api_base = "http://localhost:8000/v1"
+
+
+# Define tool functions
+def get_weather(location: str, unit: str):
+    """Return a canned weather sentence for ``location`` (always 22 degrees)."""
+    return f"Weather in {location} is 22 degrees {unit}."
+
+
+def calculate_expression(expression: str):
+    """Evaluate ``expression`` as Python and return the result as a sentence.
+
+    Any exception raised during evaluation is caught and reported in the
+    returned string instead of propagating.
+    """
+    try:
+        # NOTE(review): eval() executes arbitrary Python, and the expression
+        # comes from model-generated tool-call arguments. Acceptable for a
+        # local demo only; do not reuse this on untrusted input.
+        result = eval(expression)
+        return f"The result of {expression} is {result}"
+    except Exception as e:
+        return f"Could not calculate {expression}: {e}"
+
+
+def translate_text(text: str, target_language: str):
+    """Return a placeholder string; no actual translation is performed."""
+    return f"Translation of '{text}' to {target_language}: [translated content]"
+
+
+# Tool schemas sent to the server with each chat completion request. Each
+# entry describes one function by name, description and JSON-schema
+# parameters; the names match the keys of `tool_functions`.
+tools = [
+    {
+        "type": "function",
+        "function": {
+            "name": "get_weather",
+            "description": "Get the current weather in a given location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {
+                        "type": "string",
+                        "description": "City and state, e.g., 'San Francisco, CA'",
+                    },
+                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
+                },
+                "required": ["location", "unit"],
+            },
+        },
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "calculate_expression",
+            "description": "Calculate a mathematical expression",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "expression": {
+                        "type": "string",
+                        "description": "Mathematical expression to evaluate, needs to be a valid python expression",
+                    }
+                },
+                "required": ["expression"],
+            },
+        },
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "translate_text",
+            "description": "Translate text to another language",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "text": {"type": "string", "description": "Text to translate"},
+                    "target_language": {
+                        "type": "string",
+                        "description": "Target language for translation",
+                    },
+                },
+                "required": ["text", "target_language"],
+            },
+        },
+    },
+]
+
+# Map of function names to implementations; used to dispatch a tool call
+# returned by the model to the matching local Python function.
+tool_functions = {
+    "get_weather": get_weather,
+    "calculate_expression": calculate_expression,
+    "translate_text": translate_text,
+}
+
+
+def process_response(response, tool_functions, original_query):
+    """Print a non-streaming response and resolve any tool calls it contains.
+
+    When the first choice carries tool calls, each one is executed through
+    ``tool_functions`` and a single follow-up chat completion is requested
+    with the original query, the assistant's tool-call message and all tool
+    results. The follow-up answer is printed. Relies on the module-level
+    ``client`` created in ``main``.
+
+    Args:
+        response: Chat completion returned by the server.
+        tool_functions: Mapping from tool name to the callable implementing it.
+        original_query: User query that produced ``response``.
+    """
+
+    print("\n--- Response Output ---")
+
+    # Check if the response has content
+    if response.choices[0].message.content:
+        print(f"Content: {response.choices[0].message.content}")
+
+    # Check if the response has tool calls
+    if response.choices[0].message.tool_calls:
+        print("--------------------------------")
+        print(f"Tool calls: {response.choices[0].message.tool_calls}")
+        print("--------------------------------")
+
+        # Collect all tool calls and results before making follow-up request
+        tool_results = []
+        assistant_message = {"role": "assistant"}
+
+        if response.choices[0].message.content:
+            assistant_message["content"] = response.choices[0].message.content
+
+        assistant_tool_calls = []
+
+        # Process each tool call
+        for tool_call in response.choices[0].message.tool_calls:
+            function_name = tool_call.function.name
+            function_args = tool_call.function.arguments
+            function_id = tool_call.id
+
+            print(f"Function called: {function_name}")
+            print(f"Arguments: {function_args}")
+            print(f"Function ID: {function_id}")
+
+            # Execute the function
+            try:
+                # Parse the JSON arguments
+                args = json.loads(function_args)
+
+                # Call the function with the arguments
+                function_result = tool_functions[function_name](**args)
+                print(f"\n--- Function Result ---\n{function_result}\n")
+
+                # Add tool call to assistant message
+                assistant_tool_calls.append(
+                    {
+                        "id": function_id,
+                        "type": "function",
+                        "function": {"name": function_name, "arguments": function_args},
+                    }
+                )
+
+                # Add tool result to tool_results
+                tool_results.append(
+                    {
+                        "role": "tool",
+                        "tool_call_id": function_id,
+                        "content": function_result,
+                    }
+                )
+
+            except Exception as e:
+                # A failing call (bad JSON, unknown tool, tool error) is only
+                # reported; it is left out of both the assistant tool calls
+                # and the tool results, so the follow-up stays consistent.
+                print(f"Error executing function: {e}")
+
+        # Add tool_calls to assistant message
+        assistant_message["tool_calls"] = assistant_tool_calls
+
+        # Create a follow-up message with all function results
+        follow_up_messages = [
+            {"role": "user", "content": original_query},
+            assistant_message,
+        ]
+
+        # Add all tool results to the messages
+        follow_up_messages.extend(tool_results)
+
+        # Get completion with all tool results in a single follow-up
+        follow_up_response = client.chat.completions.create(
+            # Use the first model the server reports.
+            model=client.models.list().data[0].id,
+            messages=follow_up_messages,
+            stream=False,
+        )
+
+        print("\n--- Follow-up Response ---")
+        print(follow_up_response.choices[0].message.content)
+        print("--- End Follow-up ---\n")
+
+    print("--- End Response ---\n")
+
+
+def run_test_case(query, test_name):
+    """Send ``query`` with the tool schemas attached and print the outcome.
+
+    Issues one non-streaming chat completion with ``tool_choice="auto"``,
+    hands the response to ``process_response`` and prints the elapsed
+    wall-clock time. Relies on the module-level ``client`` created in ``main``.
+
+    Args:
+        query: User message to send.
+        test_name: Label printed in the test-case banner.
+    """
+    print(f"\n{'=' * 50}\nTEST CASE: {test_name}\n{'=' * 50}")
+    print(f"Query: '{query}'")
+
+    start_time = time.time()
+
+    # Create non-streaming chat completion request
+    response = client.chat.completions.create(
+        # Use the first model the server reports.
+        model=client.models.list().data[0].id,
+        messages=[{"role": "user", "content": query}],
+        tools=tools,
+        tool_choice="auto",
+        stream=False,
+    )
+
+    # Process the non-streaming response, passing the original query
+    process_response(response, tool_functions, query)
+
+    end_time = time.time()
+    print(f"Test completed in {end_time - start_time:.2f} seconds")
+
+
+def main():
+    """Create the shared OpenAI client and run every example query in turn."""
+    # Initialize OpenAI client
+    # Stored as a module global because run_test_case and process_response
+    # read `client` directly instead of taking it as a parameter.
+    global client
+    client = OpenAI(
+        api_key=openai_api_key,
+        base_url=openai_api_base,
+    )
+
+    # Run test cases
+    # Each entry is (query, test name).
+    test_cases = [
+        ("I want to know the weather in San Francisco", "Weather Information"),
+        ("Calculate 25 * 17 + 31", "Math Calculation"),
+        ("Translate 'Hello world' to Spanish", "Text Translation"),
+        ("What is the weather in Tokyo and New York in celsius", "Multiple Tool Usage"),
+    ]
+
+    # Execute all test cases
+    for query, test_name in test_cases:
+        run_test_case(query, test_name)
+        time.sleep(1)  # Small delay between tests
+
+    print("\nAll tests completed.")
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/online_serving/openai_chat_completion_client_with_tools_xlam_streaming.py
+++ b/examples/online_serving/openai_chat_completion_client_with_tools_xlam_streaming.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# ruff: noqa: E501
+"""
+Set up this example by starting a vLLM OpenAI-compatible server with tool call
+options enabled for xLAM-2 models:
+
+vllm serve Salesforce/Llama-xLAM-2-8b-fc-r --enable-auto-tool-choice --tool-call-parser xlam
+
+OR
+
+vllm serve Salesforce/xLAM-2-3b-fc-r --enable-auto-tool-choice --tool-call-parser xlam
+
+This example demonstrates streaming tool calls with xLAM models.
+"""
+
+import json
+import time
+
+from openai import OpenAI
+
+# Modify OpenAI's API key and API base to use vLLM's API server.
+openai_api_key = "empty"
+openai_api_base = "http://localhost:8000/v1"
+
+
+# Define tool functions
+def get_weather(location: str, unit: str):
+    """Return a canned weather sentence for ``location`` (always 22 degrees)."""
+    return f"Weather in {location} is 22 degrees {unit}."
+
+
+def calculate_expression(expression: str):
+    """Evaluate ``expression`` as Python and return the result as a sentence.
+
+    Any exception raised during evaluation is caught and reported in the
+    returned string instead of propagating.
+    """
+    try:
+        # NOTE(review): eval() executes arbitrary Python, and the expression
+        # comes from model-generated tool-call arguments. Acceptable for a
+        # local demo only; do not reuse this on untrusted input.
+        result = eval(expression)
+        return f"The result of {expression} is {result}"
+    except Exception as e:
+        return f"Could not calculate {expression}: {e}"
+
+
+def translate_text(text: str, target_language: str):
+    """Return a placeholder string; no actual translation is performed."""
+    return f"Translation of '{text}' to {target_language}: [translated content]"
+
+
+# Tool schemas describing the functions the model may call. Each entry gives
+# one function's name, description and JSON-schema parameters; the names
+# match the keys of `tool_functions`.
+tools = [
+    {
+        "type": "function",
+        "function": {
+            "name": "get_weather",
+            "description": "Get the current weather in a given location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {
+                        "type": "string",
+                        "description": "City and state, e.g., 'San Francisco, CA'",
+                    },
+                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
+                },
+                "required": ["location", "unit"],
+            },
+        },
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "calculate_expression",
+            "description": "Calculate a mathematical expression",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "expression": {
+                        "type": "string",
+                        "description": "Mathematical expression to evaluate, needs to be a valid Python expression",
+                    }
+                },
+                "required": ["expression"],
+            },
+        },
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "translate_text",
+            "description": "Translate text to another language",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "text": {"type": "string", "description": "Text to translate"},
+                    "target_language": {
+                        "type": "string",
+                        "description": "Target language for translation",
+                    },
+                },
+                "required": ["text", "target_language"],
+            },
+        },
+    },
+]
+
+# Map of function names to implementations
+tool_functions = {
+    "get_weather": get_weather,
+    "calculate_expression": calculate_expression,
+    "translate_text": translate_text,
+}
+
+
+def process_stream(response, tool_functions, original_query):
+    """Process a streaming response with possible tool calls.
+
+    Prints streamed text and tool-call fragments as they arrive, runs every
+    completed tool call, and, when at least one call produced a result, sends
+    the results back to the model and prints the streamed follow-up answer.
+
+    Args:
+        response: Iterable of streamed chat completion chunks.
+        tool_functions: Mapping from tool name to the callable implementing it.
+        original_query: The user's query, replayed as the first follow-up message.
+
+    Note:
+        The follow-up request is sent through the module-level ``client`` that
+        ``main()`` initializes.
+    """
+    # Tool calls accumulated from the stream, keyed by tool call ID
+    tool_calls = {}
+    current_id = None
+
+    print("\n--- Stream Output ---")
+    for chunk in response:
+        # A chunk may carry no choices at all (e.g. a usage-only chunk);
+        # indexing choices[0] on it would raise IndexError, so skip it.
+        if not chunk.choices:
+            continue
+        delta = chunk.choices[0].delta
+
+        # Handle tool calls in the stream
+        if delta.tool_calls:
+            for tool_call_chunk in delta.tool_calls:
+                # A fragment that carries an ID starts (or resumes) that tool
+                # call; ID-less fragments belong to the most recent ID seen.
+                chunk_id = getattr(tool_call_chunk, "id", None)
+                if chunk_id:
+                    current_id = chunk_id
+                    tool_calls.setdefault(
+                        current_id,
+                        {
+                            "function_name": None,
+                            "function_args": "",
+                            "function_id": current_id,
+                        },
+                    )
+
+                # Extract function information as it comes in chunks
+                function = getattr(tool_call_chunk, "function", None)
+                if function is None or current_id not in tool_calls:
+                    continue
+
+                name = getattr(function, "name", None)
+                if name:
+                    tool_calls[current_id]["function_name"] = name
+                    print(f"Function called: {name}")
+
+                arguments = getattr(function, "arguments", None)
+                if arguments:
+                    # Arguments arrive as pieces of one JSON string.
+                    tool_calls[current_id]["function_args"] += arguments
+                    print(f"Arguments chunk: {arguments}")
+
+        # Handle regular content in the stream
+        elif delta.content:
+            print(delta.content, end="")
+
+    print("\n--- End Stream ---\n")
+
+    # Execute each function call and build messages for follow-up
+    follow_up_messages = [{"role": "user", "content": original_query}]
+
+    for tool_data in tool_calls.values():
+        function_name = tool_data["function_name"]
+        function_args = tool_data["function_args"]
+        function_id = tool_data["function_id"]
+
+        # Skip calls whose name or arguments never arrived.
+        if not (function_name and function_args):
+            continue
+
+        try:
+            # Parse the JSON arguments
+            args = json.loads(function_args)
+
+            # Call the function with the arguments
+            function_result = tool_functions[function_name](**args)
+            print(f"\n--- Function Result ({function_name}) ---\n{function_result}\n")
+        except Exception as e:
+            # Best effort: report the failure and move on to the next call.
+            print(f"Error executing function: {e}")
+            continue
+
+        # Add the assistant message with tool call
+        follow_up_messages.append(
+            {
+                "role": "assistant",
+                "tool_calls": [
+                    {
+                        "id": function_id,
+                        "type": "function",
+                        "function": {
+                            "name": function_name,
+                            "arguments": function_args,
+                        },
+                    }
+                ],
+            }
+        )
+
+        # Add the tool message with function result
+        follow_up_messages.append(
+            {
+                "role": "tool",
+                "tool_call_id": function_id,
+                "content": function_result,
+            }
+        )
+
+    # Only send follow-up if we have results to process
+    if len(follow_up_messages) > 1:
+        # Create a follow-up message with all the function results
+        follow_up_response = client.chat.completions.create(
+            model=client.models.list().data[0].id,
+            messages=follow_up_messages,
+            stream=True,
+        )
+
+        print("\n--- Follow-up Response ---")
+        for chunk in follow_up_response:
+            if chunk.choices and chunk.choices[0].delta.content:
+                print(chunk.choices[0].delta.content, end="")
+        print("\n--- End Follow-up ---\n")
+
+
+def run_test_case(query, test_name):
+    """Send ``query`` as a streaming, tool-enabled chat request and time it."""
+    banner = "=" * 50
+    print(f"\n{banner}\nTEST CASE: {test_name}\n{banner}")
+    print(f"Query: '{query}'")
+
+    started = time.time()
+
+    # Ask the first model served by the endpoint, streaming the reply
+    model_id = client.models.list().data[0].id
+    user_message = {"role": "user", "content": query}
+    stream = client.chat.completions.create(
+        model=model_id,
+        messages=[user_message],
+        tools=tools,
+        tool_choice="auto",
+        stream=True,
+    )
+
+    # Consume the stream, executing any tool calls it contains
+    process_stream(stream, tool_functions, query)
+
+    elapsed = time.time() - started
+    print(f"Test completed in {elapsed:.2f} seconds")
+
+
+def main():
+    """Create the shared OpenAI client and run every demo query in turn."""
+    # The client is module-global because the helper functions read it directly
+    global client
+    client = OpenAI(api_key=openai_api_key, base_url=openai_api_base)
+
+    # (query, test name) pairs to exercise
+    scenarios = (
+        ("I want to know the weather in San Francisco", "Weather Information"),
+        ("Calculate 25 * 17 + 31", "Math Calculation"),
+        ("Translate 'Hello world' to Spanish", "Text Translation"),
+        ("What is the weather in Tokyo and New York in celsius", "Multiple Tool Usage"),
+    )
+
+    for query, test_name in scenarios:
+        run_test_case(query, test_name)
+        time.sleep(1)  # Small delay between tests
+
+    print("\nAll tests completed.")
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/online_serving/openai_chat_completion_structured_outputs.py
+++ b/examples/online_serving/openai_chat_completion_structured_outputs.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-To run this example, you need to start the vLLM server:
-
-```bash
-vllm serve Qwen/Qwen2.5-3B-Instruct
-```
-"""
-
-from enum import Enum
-
-from openai import BadRequestError, OpenAI
-from pydantic import BaseModel
-
-openai_api_key = "EMPTY"
-openai_api_base = "http://localhost:8000/v1"
-
-
-# Guided decoding by Choice (list of possible options)
-def guided_choice_completion(client: OpenAI, model: str):
-    completion = client.chat.completions.create(
-        model=model,
-        messages=[
-            {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
-        ],
-        extra_body={"guided_choice": ["positive", "negative"]},
-    )
-    return completion.choices[0].message.content
-
-
-# Guided decoding by Regex
-def guided_regex_completion(client: OpenAI, model: str):
-    prompt = (
-        "Generate an email address for Alan Turing, who works in Enigma."
-        "End in .com and new line. Example result:"
-        "alan.turing@enigma.com\n"
-    )
-
-    completion = client.chat.completions.create(
-        model=model,
-        messages=[
-            {
-                "role": "user",
-                "content": prompt,
-            }
-        ],
-        extra_body={"guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"]},
-    )
-    return completion.choices[0].message.content
-
-
-# Guided decoding by JSON using Pydantic schema
-class CarType(str, Enum):
-    sedan = "sedan"
-    suv = "SUV"
-    truck = "Truck"
-    coupe = "Coupe"
-
-
-class CarDescription(BaseModel):
-    brand: str
-    model: str
-    car_type: CarType
-
-
-def guided_json_completion(client: OpenAI, model: str):
-    json_schema = CarDescription.model_json_schema()
-
-    prompt = (
-        "Generate a JSON with the brand, model and car_type of"
-        "the most iconic car from the 90's"
-    )
-    completion = client.chat.completions.create(
-        model=model,
-        messages=[
-            {
-                "role": "user",
-                "content": prompt,
-            }
-        ],
-        extra_body={"guided_json": json_schema},
-    )
-    return completion.choices[0].message.content
-
-
-# Guided decoding by Grammar
-def guided_grammar_completion(client: OpenAI, model: str):
-    simplified_sql_grammar = """
-        root ::= select_statement
-
-        select_statement ::= "SELECT " column " from " table " where " condition
-
-        column ::= "col_1 " | "col_2 "
-
-        table ::= "table_1 " | "table_2 "
-
-        condition ::= column "= " number
-
-        number ::= "1 " | "2 "
-    """
-
-    prompt = (
-        "Generate an SQL query to show the 'username' and 'email'"
-        "from the 'users' table."
-    )
-    completion = client.chat.completions.create(
-        model=model,
-        messages=[
-            {
-                "role": "user",
-                "content": prompt,
-            }
-        ],
-        extra_body={"guided_grammar": simplified_sql_grammar},
-    )
-    return completion.choices[0].message.content
-
-
-# Extra backend options
-def extra_backend_options_completion(client: OpenAI, model: str):
-    prompt = (
-        "Generate an email address for Alan Turing, who works in Enigma."
-        "End in .com and new line. Example result:"
-        "alan.turing@enigma.com\n"
-    )
-
-    try:
-        # The guided_decoding_disable_fallback option forces vLLM to use
-        # xgrammar, so when it fails you get a 400 with the reason why
-        completion = client.chat.completions.create(
-            model=model,
-            messages=[
-                {
-                    "role": "user",
-                    "content": prompt,
-                }
-            ],
-            extra_body={
-                "guided_regex": r"\w+@\w+\.com\n",
-                "stop": ["\n"],
-                "guided_decoding_disable_fallback": True,
-            },
-        )
-        return completion.choices[0].message.content
-    except BadRequestError as e:
-        print("This error is expected:", e)
-
-
-def main():
-    client: OpenAI = OpenAI(
-        base_url=openai_api_base,
-        api_key=openai_api_key,
-    )
-
-    model = client.models.list().data[0].id
-
-    print("Guided Choice Completion:")
-    print(guided_choice_completion(client, model))
-
-    print("\nGuided Regex Completion:")
-    print(guided_regex_completion(client, model))
-
-    print("\nGuided JSON Completion:")
-    print(guided_json_completion(client, model))
-
-    print("\nGuided Grammar Completion:")
-    print(guided_grammar_completion(client, model))
-
-    print("\nExtra Backend Options Completion:")
-    print(extra_backend_options_completion(client, model))
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py
+++ b/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from openai import OpenAI
-
-# This example demonstrates the `structural_tag` response format.
-# It can be used to specify a structured output format that occurs between
-# specific tags in the response. This example shows how it could be used
-# to enforce the format of a tool call response, but it could be used for
-# any structured output within a subset of the response.
-
-openai_api_key = "EMPTY"
-openai_api_base = "http://localhost:8000/v1"
-
-
-def main():
-    client = OpenAI(
-        base_url=openai_api_base,
-        api_key=openai_api_key,
-    )
-
-    messages = [
-        {
-            "role": "user",
-            "content": """
-You have access to the following function to retrieve the weather in a city:
-
-    {
-        "name": "get_weather",
-        "parameters": {
-            "city": {
-                "param_type": "string",
-                "description": "The city to get the weather for",
-                "required": True
-            }
-        }
-    }
-
-If a you choose to call a function ONLY reply in the following format:
-<{start_tag}={function_name}>{parameters}{end_tag}
-where
-
-start_tag => `<function`
-parameters => a JSON dict with the function argument name as key and function
-              argument value as value.
-end_tag => `</function>`
-
-Here is an example,
-<function=example_function_name>{"example_name": "example_value"}</function>
-
-Reminder:
- Function calls MUST follow the specified format
- Required parameters MUST be specified
- Only call one function at a time
- Put the entire function call reply on one line
- Always add your sources when using search results to answer the user query
-
-You are a helpful assistant.
-
-Given the previous instructions, what is the weather in New York City, Boston,
-and San Francisco?
-""",
-        }
-    ]
-
-    response = client.chat.completions.create(
-        model=client.models.list().data[0].id,
-        messages=messages,
-        response_format={
-            "type": "structural_tag",
-            "structures": [
-                {
-                    "begin": "<function=get_weather>",
-                    "schema": {
-                        "type": "object",
-                        "properties": {"city": {"type": "string"}},
-                    },
-                    "end": "</function>",
-                }
-            ],
-            "triggers": ["<function="],
-        },
-    )
-    print(response)
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
+++ b/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-An example shows how to generate structured outputs from reasoning models
-like DeepSeekR1. The thinking process will not be guided by the JSON
-schema provided by the user. Only the final output will be structured.
-
-To run this example, you need to start the vLLM server with the reasoning
-parser:
-
-```bash
-vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
-    --reasoning-parser deepseek_r1
-```
-
-This example demonstrates how to generate chat completions from reasoning models
-using the OpenAI Python client library.
-"""
-
-from enum import Enum
-
-from openai import OpenAI
-from pydantic import BaseModel
-
-# Modify OpenAI's API key and API base to use vLLM's API server.
-openai_api_key = "EMPTY"
-openai_api_base = "http://localhost:8000/v1"
-
-
-def print_completion_details(completion):
-    print("reasoning_content: ", completion.choices[0].message.reasoning_content)
-    print("content: ", completion.choices[0].message.content)
-
-
-# Guided decoding by Regex
-def guided_regex_completion(client: OpenAI, model: str):
-    prompt = "What is the capital of France?"
-
-    completion = client.chat.completions.create(
-        model=model,
-        messages=[
-            {
-                "role": "user",
-                "content": prompt,
-            }
-        ],
-        extra_body={
-            "guided_regex": "(Paris|London)",
-        },
-    )
-    print_completion_details(completion)
-
-
-class People(BaseModel):
-    name: str
-    age: int
-
-
-def guided_json_completion(client: OpenAI, model: str):
-    json_schema = People.model_json_schema()
-
-    prompt = "Generate a JSON with the name and age of one random person."
-    completion = client.chat.completions.create(
-        model=model,
-        messages=[
-            {
-                "role": "user",
-                "content": prompt,
-            }
-        ],
-        extra_body={"guided_json": json_schema},
-    )
-    print_completion_details(completion)
-
-
-# Guided decoding by JSON using Pydantic schema
-class CarType(str, Enum):
-    sedan = "sedan"
-    suv = "SUV"
-    truck = "Truck"
-    coupe = "Coupe"
-
-
-class CarDescription(BaseModel):
-    brand: str
-    model: str
-    car_type: CarType
-
-
-def guided_car_json_completion(client: OpenAI, model: str):
-    json_schema = CarDescription.model_json_schema()
-
-    prompt = (
-        "Generate a JSON with the brand, model and car_type of"
-        "the most iconic car from the 90's"
-    )
-    completion = client.chat.completions.create(
-        model=model,
-        messages=[
-            {
-                "role": "user",
-                "content": prompt,
-            }
-        ],
-        extra_body={"guided_json": json_schema},
-    )
-    print_completion_details(completion)
-
-
-# Guided decoding by Grammar
-def guided_grammar_completion(client: OpenAI, model: str):
-    simplified_sql_grammar = """
-        root ::= select_statement
-
-        select_statement ::= "SELECT " column " from " table " where " condition
-
-        column ::= "col_1 " | "col_2 "
-
-        table ::= "table_1 " | "table_2 "
-
-        condition ::= column "= " number
-
-        number ::= "1 " | "2 "
-    """
-
-    # This may be very slow https://github.com/vllm-project/vllm/issues/12122
-    prompt = (
-        "Generate an SQL query to show the 'username' and 'email'"
-        "from the 'users' table."
-    )
-    completion = client.chat.completions.create(
-        model=model,
-        messages=[
-            {
-                "role": "user",
-                "content": prompt,
-            }
-        ],
-        extra_body={"guided_grammar": simplified_sql_grammar},
-    )
-    print_completion_details(completion)
-
-
-def main():
-    client: OpenAI = OpenAI(
-        api_key=openai_api_key,
-        base_url=openai_api_base,
-    )
-
-    models = client.models.list()
-    model: str = models.data[0].id
-
-    print("Guided Regex Completion:")
-    guided_regex_completion(client, model)
-
-    print("\nGuided JSON Completion (People):")
-    guided_json_completion(client, model)
-
-    print("\nGuided JSON Completion (CarDescription):")
-    guided_car_json_completion(client, model)
-
-    print("\nGuided Grammar Completion:")
-    guided_grammar_completion(client, model)
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/online_serving/openai_transcription_client.py
+++ b/examples/online_serving/openai_transcription_client.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import asyncio
-import json
+"""
+This script demonstrates how to use the vLLM API server to perform audio
+transcription with the `openai/whisper-large-v3` model.

-import httpx
-from openai import OpenAI
+Before running this script, you must start the vLLM server with the following command:

-from vllm.assets.audio import AudioAsset
+    vllm serve openai/whisper-large-v3
+
+Requirements:
+- vLLM with audio support
+- openai Python SDK
+- httpx for streaming support
+
+The script performs:
+1. Synchronous transcription using OpenAI-compatible API.
+2. Streaming transcription using the async OpenAI-compatible client (`AsyncOpenAI`).
+"""
+
+import asyncio

-mary_had_lamb = AudioAsset("mary_had_lamb").get_local_path()
-winning_call = AudioAsset("winning_call").get_local_path()
+from openai import AsyncOpenAI, OpenAI

-# Modify OpenAI's API key and API base to use vLLM's API server.
-openai_api_key = "EMPTY"
-openai_api_base = "http://localhost:8000/v1"
-client = OpenAI(
-    api_key=openai_api_key,
-    base_url=openai_api_base,
-)
+from vllm.assets.audio import AudioAsset


-def sync_openai():
-    with open(str(mary_had_lamb), "rb") as f:
+def sync_openai(audio_path: str, client: OpenAI):
+    """
+    Perform synchronous transcription using OpenAI-compatible API.
+    """
+    with open(audio_path, "rb") as f:
        transcription = client.audio.transcriptions.create(
            file=f,
            model="openai/whisper-large-v3",
@@ -37,38 +45,53 @@ def sync_openai():
        print("transcription result:", transcription.text)


-sync_openai()
-
-
-# OpenAI Transcription API client does not support streaming.
-async def stream_openai_response():
-    data = {
-        "language": "en",
-        "stream": True,
-        "model": "openai/whisper-large-v3",
-    }
-    url = openai_api_base + "/audio/transcriptions"
-    headers = {"Authorization": f"Bearer {openai_api_key}"}
-    print("transcription result:", end=" ")
-    async with httpx.AsyncClient() as client:
-        with open(str(winning_call), "rb") as f:
-            async with client.stream(
-                "POST", url, files={"file": f}, data=data, headers=headers
-            ) as response:
-                async for line in response.aiter_lines():
-                    # Each line is a JSON object prefixed with 'data: '
-                    if line:
-                        if line.startswith("data: "):
-                            line = line[len("data: ") :]
-                        # Last chunk, stream ends
-                        if line.strip() == "[DONE]":
-                            break
-                        # Parse the JSON response
-                        chunk = json.loads(line)
-                        # Extract and print the content
-                        content = chunk["choices"][0].get("delta", {}).get("content")
-                        print(content, end="")
-
-
-# Run the asynchronous function
-asyncio.run(stream_openai_response())
+async def stream_openai_response(audio_path: str, client: AsyncOpenAI):
+    """
+    Stream a transcription of ``audio_path`` using the async OpenAI-compatible API.
+
+    The text of each streamed chunk is printed as soon as it arrives. Chunks
+    without choices, or whose delta carries no text, are skipped.
+    """
+    print("\ntranscription result:", end=" ")
+    with open(audio_path, "rb") as f:
+        transcription = await client.audio.transcriptions.create(
+            file=f,
+            model="openai/whisper-large-v3",
+            language="en",
+            response_format="json",
+            temperature=0.0,
+            # Additional sampling params not provided by OpenAI API.
+            extra_body=dict(
+                seed=420,
+                top_p=0.6,
+            ),
+            stream=True,
+        )
+        async for chunk in transcription:
+            if not chunk.choices:
+                continue
+            content = chunk.choices[0].get("delta", {}).get("content")
+            # A delta without text yields None; skip it so the literal
+            # string "None" never ends up in the printed transcript.
+            if content:
+                print(content, end="", flush=True)
+
+    print()  # Final newline after stream ends
+
+
+def main():
+    """Run the blocking transcription example, then the streaming one."""
+    mary_had_lamb = str(AudioAsset("mary_had_lamb").get_local_path())
+    winning_call = str(AudioAsset("winning_call").get_local_path())
+
+    # Modify OpenAI's API key and API base to use vLLM's API server.
+    openai_api_key = "EMPTY"
+    openai_api_base = "http://localhost:8000/v1"
+    connection = {"api_key": openai_api_key, "base_url": openai_api_base}
+
+    # Blocking request first
+    sync_client = OpenAI(**connection)
+    sync_openai(mary_had_lamb, sync_client)
+
+    # Then the streaming request on its own event loop
+    async_client = AsyncOpenAI(**connection)
+    asyncio.run(stream_openai_response(winning_call, async_client))
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/online_serving/openai_translation_client.py
+++ b/examples/online_serving/openai_translation_client.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import asyncio
+import json
+
+import httpx
+from openai import OpenAI
+
+from vllm.assets.audio import AudioAsset
+
+
+def sync_openai(audio_path: str, client: OpenAI):
+    """Translate the audio file at ``audio_path`` with a blocking request and print the text."""
+    # Additional params not provided by OpenAI API.
+    vllm_extras = {
+        "language": "it",
+        "seed": 4419,
+        "repetition_penalty": 1.3,
+    }
+    with open(audio_path, "rb") as audio_file:
+        translation = client.audio.translations.create(
+            file=audio_file,
+            model="openai/whisper-large-v3",
+            response_format="json",
+            temperature=0.0,
+            extra_body=vllm_extras,
+        )
+        print("translation result:", translation.text)
+
+
+async def stream_openai_response(audio_path: str, base_url: str, api_key: str):
+    """Stream a translation of ``audio_path`` over a raw HTTP request.
+
+    Posts the audio file to ``{base_url}/audio/translations`` with streaming
+    enabled and prints the text of each server-sent chunk as it arrives.
+
+    Args:
+        audio_path: Path of the audio file to upload.
+        base_url: Base URL of the API server; the endpoint path is appended to it.
+        api_key: Token sent as the ``Authorization: Bearer`` header.
+    """
+    data = {
+        "language": "it",
+        "stream": True,
+        "model": "openai/whisper-large-v3",
+    }
+    url = base_url + "/audio/translations"
+    headers = {"Authorization": f"Bearer {api_key}"}
+    print("translation result:", end=" ")
+    # OpenAI translation API client does not support streaming.
+    async with httpx.AsyncClient() as client:
+        with open(audio_path, "rb") as f:
+            async with client.stream(
+                "POST", url, files={"file": f}, data=data, headers=headers
+            ) as response:
+                async for line in response.aiter_lines():
+                    # Blank lines separate events; nothing to parse
+                    if not line:
+                        continue
+                    # Each line is a JSON object prefixed with 'data: '
+                    if line.startswith("data: "):
+                        line = line[len("data: ") :]
+                    # Last chunk, stream ends
+                    if line.strip() == "[DONE]":
+                        break
+                    # Parse the JSON response
+                    chunk = json.loads(line)
+                    # Extract and print the content
+                    content = chunk["choices"][0].get("delta", {}).get("content")
+                    # A delta without text yields None; skip it so the literal
+                    # string "None" is never printed into the translation.
+                    if content:
+                        print(content, end="")
+
+    print()  # Final newline after stream ends
+
+
+def main():
+    """Run the blocking translation example, then the streaming one."""
+    audio_file = str(AudioAsset("azacinto_foscolo").get_local_path())
+
+    # Modify OpenAI's API key and API base to use vLLM's API server.
+    openai_api_key = "EMPTY"
+    openai_api_base = "http://localhost:8000/v1"
+
+    # Blocking request through the OpenAI SDK
+    sync_client = OpenAI(api_key=openai_api_key, base_url=openai_api_base)
+    sync_openai(audio_file, sync_client)
+
+    # Streaming request through raw HTTP, driven on its own event loop
+    streaming_job = stream_openai_response(audio_file, openai_api_base, openai_api_key)
+    asyncio.run(streaming_job)
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/online_serving/opentelemetry/README.md
+++ b/examples/online_serving/opentelemetry/README.md
@@ -2,7 +2,7 @@

 1. Install OpenTelemetry packages:

-    ```console
+    ```bash
    pip install \
      'opentelemetry-sdk>=1.26.0,<1.27.0' \
      'opentelemetry-api>=1.26.0,<1.27.0' \
@@ -12,7 +12,7 @@

 1. Start Jaeger in a docker container:

-    ```console
+    ```bash
    # From: https://www.jaegertracing.io/docs/1.57/getting-started/
    docker run --rm --name jaeger \
        -e COLLECTOR_ZIPKIN_HOST_PORT=:9411 \
@@ -31,14 +31,14 @@

 1. In a new shell, export Jaeger IP:

-    ```console
+    ```bash
    export JAEGER_IP=$(docker inspect   --format '{{ .NetworkSettings.IPAddress }}' jaeger)
    export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
    ```

    Then set vLLM's service name for OpenTelemetry, enable insecure connections to Jaeger and run vLLM:

-    ```console
+    ```bash
    export OTEL_SERVICE_NAME="vllm-server"
    export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true
    vllm serve facebook/opt-125m --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"
@@ -46,7 +46,7 @@

 1. In a new shell, send requests with trace context from a dummy client

-    ```console
+    ```bash
    export JAEGER_IP=$(docker inspect --format '{{ .NetworkSettings.IPAddress }}' jaeger)
    export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
    export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true
@@ -67,7 +67,7 @@
 OpenTelemetry supports either `grpc` or `http/protobuf` as the transport protocol for trace data in the exporter.
 By default, `grpc` is used. To set `http/protobuf` as the protocol, configure the `OTEL_EXPORTER_OTLP_TRACES_PROTOCOL` environment variable as follows:

-```console
+```bash
 export OTEL_EXPORTER_OTLP_TRACES_PROTOCOL=http/protobuf
 export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://$JAEGER_IP:4318/v1/traces
 vllm serve facebook/opt-125m --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"
@@ -79,13 +79,13 @@ OpenTelemetry allows automatic instrumentation of FastAPI.

 1. Install the instrumentation library

-    ```console
+    ```bash
    pip install opentelemetry-instrumentation-fastapi
    ```

 1. Run vLLM with `opentelemetry-instrument`

-    ```console
+    ```bash
    opentelemetry-instrument vllm serve facebook/opt-125m
    ```


--- a/examples/online_serving/streamlit_openai_chatbot_webserver.py
+++ b/examples/online_serving/streamlit_openai_chatbot_webserver.py
@@ -11,6 +11,7 @@ Features:
 - Streaming response display
 - Configurable API endpoint
 - Real-time chat history
+- Reasoning Display: Optional thinking process visualization

 Requirements:
    pip install streamlit openai
@@ -51,13 +52,33 @@ if "messages" not in st.session_state:
 if "active_session" not in st.session_state:
    st.session_state.active_session = None

+# Add new session state for reasoning
+if "show_reasoning" not in st.session_state:
+    st.session_state.show_reasoning = {}
+
 # Initialize session state for API base URL
 if "api_base_url" not in st.session_state:
    st.session_state.api_base_url = openai_api_base


 def create_new_chat_session():
-    """Create a new chat session with timestamp as ID"""
+    """Create a new chat session with timestamp as unique identifier.
+
+    This function initializes a new chat session by:
+    1. Generating a timestamp-based session ID
+    2. Creating an empty message list for the new session
+    3. Setting the new session as both current and active session
+    4. Resetting the messages list for the new session
+
+    Returns:
+        None
+
+    Session State Updates:
+        - sessions: Adds new empty message list with timestamp key
+        - current_session: Sets to new session ID
+        - active_session: Sets to new session ID
+        - messages: Resets to empty list
+    """
    session_id = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    st.session_state.sessions[session_id] = []
    st.session_state.current_session = session_id
@@ -66,30 +87,98 @@ def create_new_chat_session():


 def switch_to_chat_session(session_id):
-    """Switch to a different chat session"""
+    """Switch the active chat context to a different session.
+
+    Args:
+        session_id (str): The timestamp ID of the session to switch to
+
+    This function handles chat session switching by:
+    1. Setting the specified session as current
+    2. Updating the active session marker
+    3. Loading the messages history from the specified session
+
+    Session State Updates:
+        - current_session: Updated to specified session_id
+        - active_session: Updated to specified session_id
+        - messages: Loaded from sessions[session_id]
+    """
    st.session_state.current_session = session_id
    st.session_state.active_session = session_id
    st.session_state.messages = st.session_state.sessions[session_id]


-def get_llm_response(messages, model):
-    """Get streaming response from llm
+def get_llm_response(messages, model, reason, content_ph=None, reasoning_ph=None):
+    """Generate and stream LLM response with optional reasoning process.

    Args:
-        messages: List of message dictionaries
-        model: Name of model
+        messages (list): List of conversation message dicts with 'role' and 'content'
+        model (str): The model identifier to use for generation
+        reason (bool): Whether to enable and collect the reasoning process
+        content_ph (streamlit.empty): Placeholder for streaming response content;
+            when omitted the text is still collected and returned, just not shown
+        reasoning_ph (streamlit.empty): Placeholder for streaming reasoning process;
+            when omitted the reasoning is still collected and returned, just not shown

    Returns:
-        Streaming response object or error message string
+        tuple: (str, str)
+            - First string contains the complete response text, or an
+              "Error: ..." message when the API call fails
+            - Second string contains the complete reasoning text (empty when
+              reasoning is disabled, absent, or the call fails)
+
+    Features:
+        - Streams both reasoning and response text in real-time
+        - Reports API errors via st.error instead of raising
+        - Supports live updating of thinking process
+        - Maintains separate content and reasoning displays
+
+    Note:
+        The function uses streamlit placeholders for live updates.
+        When reason=True, the reasoning process appears above the response.
+        Exceptions from the API call or the stream are caught here; callers
+        never see them and receive the error text as the first tuple element.
    """
+    full_text = ""
+    think_text = ""
+    live_think = None
+    # Build request parameters
+    params = {"model": model, "messages": messages, "stream": True}
+    if reason:
+        params["extra_body"] = {"chat_template_kwargs": {"enable_thinking": True}}
+
    try:
-        response = client.chat.completions.create(
-            model=model, messages=messages, stream=True
-        )
-        return response
+        response = client.chat.completions.create(**params)
+        if isinstance(response, str):
+            if content_ph is not None:
+                content_ph.markdown(response)
+            return response, ""
+
+        # Prepare reasoning expander above content
+        if reason and reasoning_ph is not None:
+            exp = reasoning_ph.expander("💭 Thinking Process (live)", expanded=True)
+            live_think = exp.empty()
+
+        # Stream chunks
+        for chunk in response:
+            # A chunk may carry no choices at all; indexing [0] would raise
+            # and throw away everything streamed so far.
+            if not chunk.choices:
+                continue
+            delta = chunk.choices[0].delta
+            # Stream reasoning first. Accumulation is independent of display so
+            # the returned text is complete even without a placeholder.
+            rc = getattr(delta, "reasoning_content", None) if reason else None
+            if rc:
+                think_text += rc
+                if live_think is not None:
+                    live_think.markdown(think_text + "▌")
+            # Then stream content
+            piece = getattr(delta, "content", None)
+            if piece:
+                full_text += piece
+                if content_ph is not None:
+                    content_ph.markdown(full_text + "▌")
+
+        # Finalize displays: reasoning remains above, content below
+        if live_think is not None:
+            live_think.markdown(think_text)
+        if content_ph is not None:
+            content_ph.markdown(full_text)
+
+        return full_text, think_text
    except Exception as e:
        st.error(f"Error details: {str(e)}")
-        return f"Error: {str(e)}"
+        return f"Error: {str(e)}", ""


 # Sidebar - API Settings first
@@ -108,6 +197,7 @@ st.sidebar.title("Chat Sessions")
 if st.sidebar.button("New Session"):
    create_new_chat_session()

+
 # Display all sessions in reverse chronological order
 for session_id in sorted(st.session_state.sessions.keys(), reverse=True):
    # Mark the active session with a pinned button
@@ -143,47 +233,79 @@ if st.session_state.current_session is None:
    create_new_chat_session()
    st.session_state.active_session = st.session_state.current_session

-# Display chat history for current session
-for message in st.session_state.messages:
-    with st.chat_message(message["role"]):
-        st.write(message["content"])
+# Update the chat history display section
+for idx, msg in enumerate(st.session_state.messages):
+    # Render user messages normally
+    if msg["role"] == "user":
+        with st.chat_message("user"):
+            st.write(msg["content"])
+    # Render assistant messages with reasoning above
+    else:
+        # If reasoning exists for this assistant message, show it above the content
+        if idx in st.session_state.show_reasoning:
+            with st.expander("💭 Thinking Process", expanded=False):
+                st.markdown(st.session_state.show_reasoning[idx])
+        with st.chat_message("assistant"):
+            st.write(msg["content"])
+
+
+# Setup & Cache reasoning support check
+@st.cache_data(show_spinner=False)
+def server_supports_reasoning():
+    """Probe the server to see whether its replies carry reasoning text.
+
+    Sends a single non-streaming "Hi" chat request for the configured model
+    and inspects the first choice of the reply. The result is memoized by
+    ``st.cache_data``.
+
+    Returns:
+        bool: True when the reply message exposes a non-empty
+            ``reasoning_content`` attribute, False otherwise
+    """
+    probe = client.chat.completions.create(
+        model=model,
+        messages=[{"role": "user", "content": "Hi"}],
+        stream=False,
+    )
+    first_message = probe.choices[0].message
+    # A missing attribute and an empty value both mean "no reasoning".
+    reasoning = getattr(first_message, "reasoning_content", None)
+    return bool(reasoning)

-# Handle user input and generate llm response
+
+# Check support
+supports_reasoning = server_supports_reasoning()
+
+# Add reasoning toggle in sidebar if supported
+reason = False  # Default to False
+if supports_reasoning:
+    reason = st.sidebar.checkbox("Enable Reasoning", value=False)
+else:
+    st.sidebar.markdown(
+        "<span style='color:gray;'>Reasoning unavailable for this model.</span>",
+        unsafe_allow_html=True,
+    )
+    # reason remains False
+
+# Update the input handling section
 if prompt := st.chat_input("Type your message here..."):
-    # Save user message to session
+    # Save and display user message
    st.session_state.messages.append({"role": "user", "content": prompt})
    st.session_state.sessions[st.session_state.current_session] = (
        st.session_state.messages
    )
-
-    # Display user message
    with st.chat_message("user"):
        st.write(prompt)

-    # Prepare messages for llm
-    messages_for_llm = [
+    # Prepare LLM messages
+    msgs = [
        {"role": m["role"], "content": m["content"]} for m in st.session_state.messages
    ]

-    # Generate and display llm response
+    # Stream assistant response
    with st.chat_message("assistant"):
-        message_placeholder = st.empty()
-        full_response = ""
-
-        # Get streaming response from llm
-        response = get_llm_response(messages_for_llm, model)
-        if isinstance(response, str):
-            message_placeholder.markdown(response)
-            full_response = response
-        else:
-            for chunk in response:
-                if hasattr(chunk.choices[0].delta, "content"):
-                    content = chunk.choices[0].delta.content
-                    if content:
-                        full_response += content
-                        message_placeholder.markdown(full_response + "▌")
-
-            message_placeholder.markdown(full_response)
-
-    # Save llm response to session history
-    st.session_state.messages.append({"role": "assistant", "content": full_response})
+        # Placeholders: reasoning above, content below
+        reason_ph = st.empty()
+        content_ph = st.empty()
+        full, think = get_llm_response(msgs, model, reason, content_ph, reason_ph)
+        # Determine index for this new assistant message
+        message_index = len(st.session_state.messages)
+        # Save assistant reply
+        st.session_state.messages.append({"role": "assistant", "content": full})
+        # Persist reasoning in session state if any
+        if reason and think:
+            st.session_state.show_reasoning[message_index] = think
--- a/examples/online_serving/structured_outputs/README.md
+++ b/examples/online_serving/structured_outputs/README.md
+# Structured Outputs
+
+This script demonstrates various structured output capabilities of vLLM's OpenAI-compatible server.
+It can run an individual constraint type or all of them.
+It supports both streaming responses and concurrent non-streaming requests.
+
+To use this example, you must start a vLLM server with any model of your choice.
+
+```bash
+vllm serve Qwen/Qwen2.5-3B-Instruct
+```
+
+To serve a reasoning model, you can use the following command:
+
+```bash
+vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B \
+    --reasoning-parser deepseek_r1
+```
+
+If you want to run this script standalone with `uv`, you can use the following:
+
+```bash
+uvx --from git+https://github.com/vllm-project/vllm#subdirectory=examples/online_serving/structured_outputs \
+    structured-outputs
+```
+
+See [feature docs](https://docs.vllm.ai/en/latest/features/structured_outputs.html) for more information.
+
+!!! tip
+    If vLLM is running remotely, then set `OPENAI_BASE_URL=<remote_url>` before running the script.
+
+## Usage
+
+Run all constraints, non-streaming:
+
+```bash
+uv run structured_outputs.py
+```
+
+Run all constraints, streaming:
+
+```bash
+uv run structured_outputs.py --stream
+```
+
+Run certain constraints, for example `structural_tag` and `regex`, streaming:
+
+```bash
+uv run structured_outputs.py \
+    --constraint structural_tag regex \
+    --stream
+```
+
+Run all constraints, with reasoning models and streaming:
+
+```bash
+uv run structured_outputs.py --reasoning --stream
+```
--- a/examples/online_serving/structured_outputs/pyproject.toml
+++ b/examples/online_serving/structured_outputs/pyproject.toml
+[project]
+name = "examples-online-structured-outputs"
+requires-python = ">=3.9, <3.13"
+dependencies = ["openai==1.78.1", "pydantic==2.11.4"]
+version = "0.0.0"
+
+[project.scripts]
+structured-outputs = "structured_outputs:main"
--- a/examples/online_serving/structured_outputs/structured_outputs.py
+++ b/examples/online_serving/structured_outputs/structured_outputs.py
+# ruff: noqa: E501
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import enum
+import os
+from typing import TYPE_CHECKING, Any, Literal
+
+import openai
+import pydantic
+
+if TYPE_CHECKING:
+    from openai.types.chat import ChatCompletionChunk
+
+
+ConstraintsFormat = Literal[
+    "choice",
+    "regex",
+    "json",
+    "grammar",
+    "structural_tag",
+]
+
+
+async def print_stream_response(
+    stream_response: openai.AsyncStream[ChatCompletionChunk],
+    title: str,
+    args: argparse.Namespace,
+):
+    """Print a streamed chat completion to stdout as it arrives.
+
+    Writes a ``<title> (Streaming):`` heading, then the streamed text. Content
+    deltas are printed after a one-time ``Content:`` label. When
+    ``args.reasoning`` is set, ``reasoning_content`` deltas are printed too,
+    after a one-time ``Reasoning:`` label, and a line break separates the
+    reasoning from the content that follows it.
+
+    Args:
+        stream_response: Async iterable of chat completion chunks.
+        title: Heading printed before the streamed output.
+        args: Parsed CLI namespace; only ``args.reasoning`` is read.
+    """
+    print(f"\n\n{title} (Streaming):")
+
+    reasoning_header_printed = False
+    content_header_printed = False
+
+    async for chunk in stream_response:
+        # A chunk without choices has nothing to print; indexing [0] would raise.
+        if not chunk.choices:
+            continue
+        delta = chunk.choices[0].delta
+
+        # `reasoning_content` is not guaranteed to exist on the delta object.
+        reasoning_text: str | None = getattr(delta, "reasoning_content", None)
+        content_text = delta.content
+
+        if args.reasoning and reasoning_text:
+            if not reasoning_header_printed:
+                print("  Reasoning: ", end="")
+                reasoning_header_printed = True
+            print(reasoning_text, end="", flush=True)
+
+        if content_text:
+            if not content_header_printed:
+                # Only break the line when reasoning text was printed before.
+                if reasoning_header_printed:
+                    print()
+                print("  Content: ", end="")
+                content_header_printed = True
+            print(content_text, end="", flush=True)
+    print()
+
+
+class CarType(str, enum.Enum):
+    # Closed set of values allowed for CarDescription.car_type.
+    # The `str` mix-in makes each member also a plain string equal to its value.
+    SEDAN = "SEDAN"
+    SUV = "SUV"
+    TRUCK = "TRUCK"
+    COUPE = "COUPE"
+
+
+class CarDescription(pydantic.BaseModel):
+    # Shape of the object requested by the "json" constraint: its
+    # model_json_schema() is sent as the response_format schema in PARAMS.
+    # NOTE(review): documented with comments rather than a docstring because a
+    # docstring would presumably surface in the generated schema — confirm.
+    brand: str
+    model: str
+    car_type: CarType
+
+
+PARAMS: dict[ConstraintsFormat, dict[str, Any]] = {
+    "choice": {
+        "messages": [
+            {
+                "role": "user",
+                "content": "Classify this sentiment: vLLM is wonderful!",
+            }
+        ],
+        "extra_body": {"guided_choice": ["positive", "negative"]},
+    },
+    "regex": {
+        "messages": [
+            {
+                "role": "user",
+                "content": "Generate an email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: 'alan.turing@enigma.com\n'",
+            }
+        ],
+        "extra_body": {
+            "guided_regex": r"[a-z0-9.]{1,20}@\w{6,10}\.com\n",
+        },
+    },
+    "json": {
+        "messages": [
+            {
+                "role": "user",
+                "content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's",
+            }
+        ],
+        "response_format": {
+            "type": "json_schema",
+            "json_schema": {
+                "name": "car-description",
+                "schema": CarDescription.model_json_schema(),
+            },
+        },
+    },
+    "grammar": {
+        "messages": [
+            {
+                "role": "user",
+                "content": "Generate an SQL query to show the 'username' and 'email'from the 'users' table.",
+            }
+        ],
+        "extra_body": {
+            "guided_grammar": """
+root ::= select_statement
+
+select_statement ::= "SELECT " column " from " table " where " condition
+
+column ::= "col_1 " | "col_2 "
+
+table ::= "table_1 " | "table_2 "
+
+condition ::= column "= " number
+
+number ::= "1 " | "2 "
+""",
+        },
+    },
+    "structural_tag": {
+        "messages": [
+            {
+                "role": "user",
+                "content": """
+You have access to the following function to retrieve the weather in a city:
+
+{
+    "name": "get_weather",
+    "parameters": {
+        "city": {
+            "param_type": "string",
+            "description": "The city to get the weather for",
+            "required": True
+        }
+    }
+}
+
+If a you choose to call a function ONLY reply in the following format:
+<{start_tag}={function_name}>{parameters}{end_tag}
+where
+
+start_tag => `<function`
+parameters => a JSON dict with the function argument name as key and function
+              argument value as value.
+end_tag => `</function>`
+
+Here is an example,
+<function=example_function_name>{"example_name": "example_value"}</function>
+
+Reminder:
+- Function calls MUST follow the specified format
+- Required parameters MUST be specified
+- Only call one function at a time
+- Put the entire function call reply on one line
+- Always add your sources when using search results to answer the user query
+
+You are a helpful assistant.
+
+Given the previous instructions, what is the weather in New York City, Boston,
+and San Francisco?""",
+            },
+        ],
+        "response_format": {
+            "type": "structural_tag",
+            "structures": [
+                {
+                    "begin": "<function=get_weather>",
+                    "schema": {
+                        "type": "object",
+                        "properties": {"city": {"type": "string"}},
+                        "required": ["city"],
+                    },
+                    "end": "</function>",
+                }
+            ],
+            "triggers": ["<function="],
+        },
+    },
+}
+
+
+async def cli():
+    """Parse CLI flags, run one chat completion per selected constraint, print results.
+
+    Flags:
+        --constraint: One or more keys of ``PARAMS``, or ``*`` (default) for all.
+        --stream / --no-stream: Stream the responses instead of waiting for them.
+        --reasoning / --no-reasoning: Also print reasoning traces when present.
+
+    The server URL is read from ``OPENAI_BASE_URL`` (default
+    ``http://localhost:8000/v1``) and the first model the server lists is used.
+    All requests are issued concurrently; results are printed in the order the
+    constraints were selected.
+    """
+    parser = argparse.ArgumentParser(
+        description="Run OpenAI Chat Completion with various structured outputs capabilities",
+    )
+    _ = parser.add_argument(
+        "--constraint",
+        type=str,
+        nargs="+",
+        choices=[*list(PARAMS), "*"],
+        default=["*"],
+        help="Specify which constraint(s) to run.",
+    )
+    _ = parser.add_argument(
+        "--stream",
+        action=argparse.BooleanOptionalAction,
+        default=False,
+        help="Enable streaming output",
+    )
+    _ = parser.add_argument(
+        "--reasoning",
+        action=argparse.BooleanOptionalAction,
+        default=False,
+        help="Enable printing of reasoning traces if available.",
+    )
+    args = parser.parse_args()
+
+    base_url = os.getenv("OPENAI_BASE_URL", "http://localhost:8000/v1")
+    client = openai.AsyncOpenAI(base_url=base_url, api_key="EMPTY")
+    # dict.fromkeys drops duplicates while keeping the command-line order; a
+    # set would make the request and print order vary from run to run.
+    if "*" in args.constraint:
+        constraints = list(PARAMS)
+    else:
+        constraints = list(dict.fromkeys(args.constraint))
+    model = (await client.models.list()).data[0].id
+
+    # Fan out all requests at once; the two modes differ only in `stream`.
+    results = await asyncio.gather(
+        *[
+            client.chat.completions.create(
+                model=model,
+                max_tokens=1024,
+                stream=args.stream,
+                **PARAMS[name],
+            )
+            for name in constraints
+        ]
+    )
+
+    if args.stream:
+        for constraint, stream in zip(constraints, results):
+            await print_stream_response(stream, constraint, args)
+        return
+
+    for constraint, response in zip(constraints, results):
+        print(f"\n\n{constraint}:")
+        message = response.choices[0].message
+        if args.reasoning and hasattr(message, "reasoning_content"):
+            print(f"  Reasoning: {message.reasoning_content or ''}")
+        print(f"  Content: {message.content!r}")
+
+
+def main():
+    """Synchronous entry point: run the async ``cli`` coroutine to completion."""
+    runner = cli()
+    asyncio.run(runner)
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/others/lmcache/cpu_offload_lmcache.py
+++ b/examples/others/lmcache/cpu_offload_lmcache.py
@@ -17,7 +17,8 @@ Usage:
            (Without enable_chunked_prefill)

 Note that `lmcache` is needed to run this example.
-Requirements: Linux, Python: 3.10 or higher, CUDA: 12.1
+Requirements:
+https://docs.lmcache.ai/getting_started/installation.html#prerequisites
 Learn more about LMCache environment setup, please refer to:
 https://docs.lmcache.ai/getting_started/installation.html
 """
@@ -28,8 +29,8 @@ import os
 import time
 from dataclasses import asdict

-from lmcache.experimental.cache_engine import LMCacheEngineBuilder
 from lmcache.integration.vllm.utils import ENGINE_NAME
+from lmcache.v1.cache_engine import LMCacheEngineBuilder

 from vllm import LLM, SamplingParams
 from vllm.config import KVTransferConfig

--- a/examples/others/lmcache/kv_cache_sharing_lmcache_v1.py
+++ b/examples/others/lmcache/kv_cache_sharing_lmcache_v1.py
@@ -17,8 +17,8 @@ import subprocess
 import time
 from multiprocessing import Event, Process

-from lmcache.experimental.cache_engine import LMCacheEngineBuilder
 from lmcache.integration.vllm.utils import ENGINE_NAME
+from lmcache.v1.cache_engine import LMCacheEngineBuilder

 from vllm import LLM, SamplingParams
 from vllm.config import KVTransferConfig
@@ -105,7 +105,7 @@ def run_retrieve(store_done, prompts, timeout=1):

 def run_lmcache_server(port):
    server_proc = subprocess.Popen(
-        ["python", "-m", "lmcache.experimental.server", "localhost", str(port)]
+        ["python", "-m", "lmcache.v1.server", "localhost", str(port)]
    )
    return server_proc


--- a/examples/others/logging_configuration.md
+++ b/examples/others/logging_configuration.md
@@ -55,33 +55,33 @@ STDOUT of the console in JSON format with a log level of `INFO`.

 To begin, first, create an appropriate JSON logging configuration file:

-**/path/to/logging_config.json:**
-
-```json
-{
-  "formatters": {
-    "json": {
-      "class": "pythonjsonlogger.jsonlogger.JsonFormatter"
-    }
-  },
-  "handlers": {
-    "console": {
-      "class" : "logging.StreamHandler",
-      "formatter": "json",
-      "level": "INFO",
-      "stream": "ext://sys.stdout"
+??? note "/path/to/logging_config.json"
+
+    ```json
+    {
+      "formatters": {
+        "json": {
+          "class": "pythonjsonlogger.jsonlogger.JsonFormatter"
+        }
+      },
+      "handlers": {
+        "console": {
+          "class" : "logging.StreamHandler",
+          "formatter": "json",
+          "level": "INFO",
+          "stream": "ext://sys.stdout"
+        }
+      },
+      "loggers": {
+        "vllm": {
+          "handlers": ["console"],
+          "level": "INFO",
+          "propagate": false
+        }
+      },
+      "version": 1
    }
-  },
-  "loggers": {
-    "vllm": {
-      "handlers": ["console"],
-      "level": "INFO",
-      "propagate": false
-    }
-  },
-  "version": 1
-}
-```
+    ```

 Finally, run vLLM with the `VLLM_LOGGING_CONFIG_PATH` environment variable set
 to the path of the custom logging configuration JSON file:
@@ -104,38 +104,38 @@ configuration overrides the built-in default logging configuration used by vLLM.
 First, create an appropriate JSON logging configuration file that includes
 configuration for the root vLLM logger and for the logger you wish to silence:

-**/path/to/logging_config.json:**
-
-```json
-{
-  "formatters": {
-    "vllm": {
-      "class": "vllm.logging_utils.NewLineFormatter",
-      "datefmt": "%m-%d %H:%M:%S",
-      "format": "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s"
-    }
-  },
-  "handlers": {
-    "vllm": {
-      "class" : "logging.StreamHandler",
-      "formatter": "vllm",
-      "level": "INFO",
-      "stream": "ext://sys.stdout"
+??? note "/path/to/logging_config.json"
+
+    ```json
+    {
+      "formatters": {
+        "vllm": {
+          "class": "vllm.logging_utils.NewLineFormatter",
+          "datefmt": "%m-%d %H:%M:%S",
+          "format": "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s"
+        }
+      },
+      "handlers": {
+        "vllm": {
+          "class" : "logging.StreamHandler",
+          "formatter": "vllm",
+          "level": "INFO",
+          "stream": "ext://sys.stdout"
+        }
+      },
+      "loggers": {
+        "vllm": {
+          "handlers": ["vllm"],
+          "level": "DEBUG",
+          "propagate": false
+        },
+        "vllm.example_noisy_logger": {
+          "propagate": false
+        }
+      },
+      "version": 1
    }
-  },
-  "loggers": {
-    "vllm": {
-      "handlers": ["vllm"],
-      "level": "DEBUG",
-      "propagate": false
-    },
-    "vllm.example_noisy_logger": {
-      "propagate": false
-    }
-  },
-  "version": 1
-}
-```
+    ```

 Finally, run vLLM with the `VLLM_LOGGING_CONFIG_PATH` environment variable set
 to the path of the custom logging configuration JSON file: