Commit 99324e25 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.9.2' into v0.9.2-ori

parents cc7f22a8 a5dd03c1
......@@ -94,6 +94,7 @@ def run_vlm2vec(query: Query) -> ModelRequestData:
engine_args = EngineArgs(
model="TIGER-Lab/VLM2Vec-Full",
task="embed",
max_model_len=4096,
trust_remote_code=True,
mm_processor_kwargs={"num_crops": 4},
limit_mm_per_prompt={"image": 1},
......
......@@ -289,6 +289,106 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
)
def load_llava(question: str, image_urls: list[str]) -> ModelRequestData:
# NOTE: CAUTION! Original Llava models wasn't really trained on multi-image inputs,
# it will generate poor response for multi-image inputs!
model_name = "llava-hf/llava-1.5-7b-hf"
engine_args = EngineArgs(
model=model_name,
max_num_seqs=16,
limit_mm_per_prompt={"image": len(image_urls)},
)
placeholders = [{"type": "image", "image": url} for url in image_urls]
messages = [
{
"role": "user",
"content": [
*placeholders,
{"type": "text", "text": question},
],
}
]
processor = AutoProcessor.from_pretrained(model_name)
prompt = processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image_data=[fetch_image(url) for url in image_urls],
)
def load_llava_next(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "llava-hf/llava-v1.6-mistral-7b-hf"
engine_args = EngineArgs(
model=model_name,
max_model_len=8192,
max_num_seqs=16,
limit_mm_per_prompt={"image": len(image_urls)},
)
placeholders = [{"type": "image", "image": url} for url in image_urls]
messages = [
{
"role": "user",
"content": [
*placeholders,
{"type": "text", "text": question},
],
}
]
processor = AutoProcessor.from_pretrained(model_name)
prompt = processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image_data=[fetch_image(url) for url in image_urls],
)
def load_llava_onevision(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "llava-hf/llava-onevision-qwen2-7b-ov-hf"
engine_args = EngineArgs(
model=model_name,
max_model_len=16384,
max_num_seqs=16,
limit_mm_per_prompt={"image": len(image_urls)},
)
placeholders = [{"type": "image", "image": url} for url in image_urls]
messages = [
{
"role": "user",
"content": [
*placeholders,
{"type": "text", "text": question},
],
}
]
processor = AutoProcessor.from_pretrained(model_name)
prompt = processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image_data=[fetch_image(url) for url in image_urls],
)
def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
......@@ -323,6 +423,43 @@ def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
)
def load_keye_vl(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "Kwai-Keye/Keye-VL-8B-Preview"
engine_args = EngineArgs(
model=model_name,
trust_remote_code=True,
max_model_len=8192,
max_num_seqs=5,
limit_mm_per_prompt={"image": len(image_urls)},
)
placeholders = [{"type": "image", "image": url} for url in image_urls]
messages = [
{
"role": "user",
"content": [
*placeholders,
{"type": "text", "text": question},
],
},
]
processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
prompt = processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
image_data = [fetch_image(url) for url in image_urls]
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image_data=image_data,
)
def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "moonshotai/Kimi-VL-A3B-Instruct"
......@@ -368,6 +505,7 @@ def load_mistral3(question: str, image_urls: list[str]) -> ModelRequestData:
max_num_seqs=2,
tensor_parallel_size=2,
limit_mm_per_prompt={"image": len(image_urls)},
ignore_patterns=["consolidated.safetensors"],
)
placeholders = "[IMG]" * len(image_urls)
......@@ -728,6 +866,32 @@ def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData:
)
def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "omni-research/Tarsier2-Recap-7b"
engine_args = EngineArgs(
model=model_name,
trust_remote_code=True,
max_model_len=32768,
limit_mm_per_prompt={"image": len(image_urls)},
hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]},
)
prompt = (
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
f"<|im_start|>user\n<|vision_start|>{'<|image_pad|>' * len(image_urls)}"
f"<|vision_end|>{question}<|im_end|>\n"
"<|im_start|>assistant\n"
)
image_data = [fetch_image(url) for url in image_urls]
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image_data=image_data,
)
model_example_map = {
"aria": load_aria,
"aya_vision": load_aya_vision,
......@@ -736,7 +900,11 @@ model_example_map = {
"h2ovl_chat": load_h2ovl,
"idefics3": load_idefics3,
"internvl_chat": load_internvl,
"keye_vl": load_keye_vl,
"kimi_vl": load_kimi_vl,
"llava": load_llava,
"llava-next": load_llava_next,
"llava-onevision": load_llava_onevision,
"llama4": load_llama4,
"mistral3": load_mistral3,
"mllama": load_mllama,
......@@ -750,6 +918,7 @@ model_example_map = {
"qwen2_5_vl": load_qwen2_5_vl,
"smolvlm": load_smolvlm,
"tarsier": load_tarsier,
"tarsier2": load_tarsier2,
}
......
#!/bin/bash
# =============================================================================
# vLLM Disaggregated Serving Script - P2P NCCL XpYd Architecture
# =============================================================================
# This script demonstrates disaggregated prefill and decode serving using
# P2P NCCL communication. The architecture supports various XpYd configurations:
#
# - 1P3D: 1 Prefill server + 3 Decode servers (current default)
# - 3P1D: 3 Prefill servers + 1 Decode server
# - etc.
#
# Configuration can be customized via environment variables:
# MODEL: Model to serve
# PREFILL_GPUS: Comma-separated GPU IDs for prefill servers
# DECODE_GPUS: Comma-separated GPU IDs for decode servers
# PREFILL_PORTS: Comma-separated ports for prefill servers
# DECODE_PORTS: Comma-separated ports for decode servers
# PROXY_PORT: Proxy server port used to setup XpYd connection.
# TIMEOUT_SECONDS: Server startup timeout
# =============================================================================
# Configuration - can be overridden via environment variables
MODEL=${MODEL:-meta-llama/Llama-3.1-8B-Instruct}
TIMEOUT_SECONDS=${TIMEOUT_SECONDS:-1200}
PROXY_PORT=${PROXY_PORT:-30001}
# Default 1P3D configuration (1 Prefill + 3 Decode)
PREFILL_GPUS=${PREFILL_GPUS:-0}
DECODE_GPUS=${DECODE_GPUS:-1,2,3}
PREFILL_PORTS=${PREFILL_PORTS:-20003}
DECODE_PORTS=${DECODE_PORTS:-20005,20007,20009}
echo "Warning: P2P NCCL disaggregated prefill XpYd support for vLLM v1 is experimental and subject to change."
echo ""
echo "Architecture Configuration:"
echo " Model: $MODEL"
echo " Prefill GPUs: $PREFILL_GPUS, Ports: $PREFILL_PORTS"
echo " Decode GPUs: $DECODE_GPUS, Ports: $DECODE_PORTS"
echo " Proxy Port: $PROXY_PORT"
echo " Timeout: ${TIMEOUT_SECONDS}s"
echo ""
PIDS=()
# Switch to the directory of the current script
cd "$(dirname "${BASH_SOURCE[0]}")"
check_required_files() {
local files=("disagg_proxy_p2p_nccl_xpyd.py")
for file in "${files[@]}"; do
if [[ ! -f "$file" ]]; then
echo "Required file $file not found in $(pwd)"
exit 1
fi
done
}
check_hf_token() {
if [ -z "$HF_TOKEN" ]; then
echo "HF_TOKEN is not set. Please set it to your Hugging Face token."
echo "Example: export HF_TOKEN=your_token_here"
exit 1
fi
if [[ "$HF_TOKEN" != hf_* ]]; then
echo "HF_TOKEN is not a valid Hugging Face token. Please set it to your Hugging Face token."
exit 1
fi
echo "HF_TOKEN is set and valid."
}
check_num_gpus() {
# Check if the number of GPUs are >=2 via nvidia-smi
num_gpus=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
if [ "$num_gpus" -lt 2 ]; then
echo "You need at least 2 GPUs to run disaggregated prefill."
exit 1
else
echo "Found $num_gpus GPUs."
fi
}
ensure_python_library_installed() {
echo "Checking if $1 is installed..."
if ! python3 -c "import $1" > /dev/null 2>&1; then
echo "$1 is not installed. Please install it via pip install $1."
exit 1
else
echo "$1 is installed."
fi
}
cleanup() {
echo "Stopping everything…"
trap - INT TERM # prevent re-entrancy
kill -- -$$ # negative PID == "this whole process-group"
wait # reap children so we don't leave zombies
exit 0
}
wait_for_server() {
local port=$1
local timeout_seconds=$TIMEOUT_SECONDS
local start_time=$(date +%s)
echo "Waiting for server on port $port..."
while true; do
if curl -s "localhost:${port}/v1/completions" > /dev/null; then
echo "Server on port $port is ready."
return 0
fi
local now=$(date +%s)
if (( now - start_time >= timeout_seconds )); then
echo "Timeout waiting for server on port $port"
return 1
fi
sleep 1
done
}
main() {
check_required_files
check_hf_token
check_num_gpus
ensure_python_library_installed pandas
ensure_python_library_installed datasets
ensure_python_library_installed vllm
ensure_python_library_installed quart
trap cleanup INT
trap cleanup USR1
trap cleanup TERM
echo "Launching disaggregated serving components..."
echo "Please check the log files for detailed output:"
echo " - prefill*.log: Prefill server logs"
echo " - decode*.log: Decode server logs"
echo " - proxy.log: Proxy server log"
# =============================================================================
# Launch Proxy Server
# =============================================================================
echo ""
echo "Starting proxy server on port $PROXY_PORT..."
python3 disagg_proxy_p2p_nccl_xpyd.py &
PIDS+=($!)
# Parse GPU and port arrays
IFS=',' read -ra PREFILL_GPU_ARRAY <<< "$PREFILL_GPUS"
IFS=',' read -ra DECODE_GPU_ARRAY <<< "$DECODE_GPUS"
IFS=',' read -ra PREFILL_PORT_ARRAY <<< "$PREFILL_PORTS"
IFS=',' read -ra DECODE_PORT_ARRAY <<< "$DECODE_PORTS"
# =============================================================================
# Launch Prefill Servers (X Producers)
# =============================================================================
echo ""
echo "Starting ${#PREFILL_GPU_ARRAY[@]} prefill server(s)..."
for i in "${!PREFILL_GPU_ARRAY[@]}"; do
local gpu_id=${PREFILL_GPU_ARRAY[$i]}
local port=${PREFILL_PORT_ARRAY[$i]}
local kv_port=$((21001 + i))
echo " Prefill server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
CUDA_VISIBLE_DEVICES=$gpu_id VLLM_USE_V1=1 vllm serve $MODEL \
--enforce-eager \
--host 0.0.0.0 \
--port $port \
--tensor-parallel-size 1 \
--seed 1024 \
--dtype float16 \
--max-model-len 10000 \
--max-num-batched-tokens 10000 \
--max-num-seqs 256 \
--trust-remote-code \
--gpu-memory-utilization 0.9 \
--disable-log-request \
--kv-transfer-config \
"{\"kv_connector\":\"P2pNcclConnector\",\"kv_role\":\"kv_producer\",\"kv_buffer_size\":\"1e1\",\"kv_port\":\"$kv_port\",\"kv_connector_extra_config\":{\"proxy_ip\":\"0.0.0.0\",\"proxy_port\":\"$PROXY_PORT\",\"http_port\":\"$port\",\"send_type\":\"PUT_ASYNC\",\"nccl_num_channels\":\"16\"}}" > prefill$((i+1)).log 2>&1 &
PIDS+=($!)
done
# =============================================================================
# Launch Decode Servers (Y Decoders)
# =============================================================================
echo ""
echo "Starting ${#DECODE_GPU_ARRAY[@]} decode server(s)..."
for i in "${!DECODE_GPU_ARRAY[@]}"; do
local gpu_id=${DECODE_GPU_ARRAY[$i]}
local port=${DECODE_PORT_ARRAY[$i]}
local kv_port=$((22001 + i))
echo " Decode server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
--enforce-eager \
--host 0.0.0.0 \
--port $port \
--tensor-parallel-size 1 \
--seed 1024 \
--dtype float16 \
--max-model-len 10000 \
--max-num-batched-tokens 10000 \
--max-num-seqs 256 \
--trust-remote-code \
--gpu-memory-utilization 0.7 \
--disable-log-request \
--kv-transfer-config \
"{\"kv_connector\":\"P2pNcclConnector\",\"kv_role\":\"kv_consumer\",\"kv_buffer_size\":\"8e9\",\"kv_port\":\"$kv_port\",\"kv_connector_extra_config\":{\"proxy_ip\":\"0.0.0.0\",\"proxy_port\":\"$PROXY_PORT\",\"http_port\":\"$port\",\"send_type\":\"PUT_ASYNC\",\"nccl_num_channels\":\"16\"}}" > decode$((i+1)).log 2>&1 &
PIDS+=($!)
done
# =============================================================================
# Wait for All Servers to Start
# =============================================================================
echo ""
echo "Waiting for all servers to start..."
for port in "${PREFILL_PORT_ARRAY[@]}" "${DECODE_PORT_ARRAY[@]}"; do
if ! wait_for_server $port; then
echo "Failed to start server on port $port"
cleanup
exit 1
fi
done
echo ""
echo "All servers are up. Starting benchmark..."
# =============================================================================
# Run Benchmark
# =============================================================================
cd ../../../benchmarks/
python3 benchmark_serving.py --port 10001 --seed $(date +%s) \
--model $MODEL \
--dataset-name random --random-input-len 7500 --random-output-len 200 \
--num-prompts 200 --burstiness 100 --request-rate 2 | tee benchmark.log
echo "Benchmarking done. Cleaning up..."
cleanup
}
main
\ No newline at end of file
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import socket
import threading
import uuid
import aiohttp
import msgpack
import zmq
from quart import Quart, make_response, request
count = 0
prefill_instances: dict[str, str] = {} # http_address: zmq_address
decode_instances: dict[str, str] = {} # http_address: zmq_address
prefill_cv = threading.Condition()
decode_cv = threading.Condition()
def _listen_for_register(poller, router_socket):
while True:
socks = dict(poller.poll())
if router_socket in socks:
remote_address, message = router_socket.recv_multipart()
# data: {"type": "P", "http_address": "ip:port",
# "zmq_address": "ip:port"}
data = msgpack.loads(message)
if data["type"] == "P":
global prefill_instances
global prefill_cv
with prefill_cv:
prefill_instances[data["http_address"]] = data["zmq_address"]
elif data["type"] == "D":
global decode_instances
global decode_cv
with decode_cv:
decode_instances[data["http_address"]] = data["zmq_address"]
else:
print(
"Unexpected, Received message from %s, data: %s",
remote_address,
data,
)
def start_service_discovery(hostname, port):
if not hostname:
hostname = socket.gethostname()
if port == 0:
raise ValueError("Port cannot be 0")
context = zmq.Context()
router_socket = context.socket(zmq.ROUTER)
router_socket.bind(f"tcp://{hostname}:{port}")
poller = zmq.Poller()
poller.register(router_socket, zmq.POLLIN)
_listener_thread = threading.Thread(
target=_listen_for_register, args=[poller, router_socket], daemon=True
)
_listener_thread.start()
return _listener_thread
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
app = Quart(__name__)
def random_uuid() -> str:
return str(uuid.uuid4().hex)
async def forward_request(url, data, request_id):
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
headers = {
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
"X-Request-Id": request_id,
}
async with session.post(url=url, json=data, headers=headers) as response:
if response.status == 200:
if True:
async for chunk_bytes in response.content.iter_chunked(1024):
yield chunk_bytes
else:
content = await response.read()
yield content
@app.route("/v1/completions", methods=["POST"])
async def handle_request():
try:
original_request_data = await request.get_json()
prefill_request = original_request_data.copy()
# change max_tokens = 1 to let it only do prefill
prefill_request["max_tokens"] = 1
global count
global prefill_instances
global prefill_cv
with prefill_cv:
prefill_list = list(prefill_instances.items())
prefill_addr, prefill_zmq_addr = prefill_list[count % len(prefill_list)]
global decode_instances
global decode_cv
with decode_cv:
decode_list = list(decode_instances.items())
decode_addr, decode_zmq_addr = decode_list[count % len(decode_list)]
print(
f"handle_request count: {count}, [HTTP:{prefill_addr}, "
f"ZMQ:{prefill_zmq_addr}] 👉 [HTTP:{decode_addr}, "
f"ZMQ:{decode_zmq_addr}]"
)
count += 1
request_id = (
f"___prefill_addr_{prefill_zmq_addr}___decode_addr_"
f"{decode_zmq_addr}_{random_uuid()}"
)
# finish prefill
async for _ in forward_request(
f"http://{prefill_addr}/v1/completions", prefill_request, request_id
):
continue
# return decode
generator = forward_request(
f"http://{decode_addr}/v1/completions", original_request_data, request_id
)
response = await make_response(generator)
response.timeout = None
return response
except Exception as e:
import sys
import traceback
exc_info = sys.exc_info()
print("Error occurred in disagg prefill proxy server")
print(e)
print("".join(traceback.format_exception(*exc_info)))
if __name__ == "__main__":
t = start_service_discovery("0.0.0.0", 30001)
app.run(host="0.0.0.0", port=10001)
t.join()
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
from typing import Optional
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
"""
Set up this example by starting a vLLM OpenAI-compatible server with tool call
options enabled for xLAM-2 models:
vllm serve --model Salesforce/Llama-xLAM-2-8b-fc-r --enable-auto-tool-choice --tool-call-parser xlam
OR
vllm serve --model Salesforce/xLAM-2-3b-fc-r --enable-auto-tool-choice --tool-call-parser xlam
"""
import json
import time
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "empty"
openai_api_base = "http://localhost:8000/v1"
# Define tool functions
def get_weather(location: str, unit: str):
return f"Weather in {location} is 22 degrees {unit}."
def calculate_expression(expression: str):
try:
result = eval(expression)
return f"The result of {expression} is {result}"
except Exception as e:
return f"Could not calculate {expression}: {e}"
def translate_text(text: str, target_language: str):
return f"Translation of '{text}' to {target_language}: [translated content]"
# Define tools
tools = [
{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "City and state, e.g., 'San Francisco, CA'",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location", "unit"],
},
},
},
{
"type": "function",
"function": {
"name": "calculate_expression",
"description": "Calculate a mathematical expression",
"parameters": {
"type": "object",
"properties": {
"expression": {
"type": "string",
"description": "Mathematical expression to evaluate, needs to be a valid python expression",
}
},
"required": ["expression"],
},
},
},
{
"type": "function",
"function": {
"name": "translate_text",
"description": "Translate text to another language",
"parameters": {
"type": "object",
"properties": {
"text": {"type": "string", "description": "Text to translate"},
"target_language": {
"type": "string",
"description": "Target language for translation",
},
},
"required": ["text", "target_language"],
},
},
},
]
# Map of function names to implementations
tool_functions = {
"get_weather": get_weather,
"calculate_expression": calculate_expression,
"translate_text": translate_text,
}
def process_response(response, tool_functions, original_query):
"""Process a non-streaming response with possible tool calls"""
print("\n--- Response Output ---")
# Check if the response has content
if response.choices[0].message.content:
print(f"Content: {response.choices[0].message.content}")
# Check if the response has tool calls
if response.choices[0].message.tool_calls:
print("--------------------------------")
print(f"Tool calls: {response.choices[0].message.tool_calls}")
print("--------------------------------")
# Collect all tool calls and results before making follow-up request
tool_results = []
assistant_message = {"role": "assistant"}
if response.choices[0].message.content:
assistant_message["content"] = response.choices[0].message.content
assistant_tool_calls = []
# Process each tool call
for tool_call in response.choices[0].message.tool_calls:
function_name = tool_call.function.name
function_args = tool_call.function.arguments
function_id = tool_call.id
print(f"Function called: {function_name}")
print(f"Arguments: {function_args}")
print(f"Function ID: {function_id}")
# Execute the function
try:
# Parse the JSON arguments
args = json.loads(function_args)
# Call the function with the arguments
function_result = tool_functions[function_name](**args)
print(f"\n--- Function Result ---\n{function_result}\n")
# Add tool call to assistant message
assistant_tool_calls.append(
{
"id": function_id,
"type": "function",
"function": {"name": function_name, "arguments": function_args},
}
)
# Add tool result to tool_results
tool_results.append(
{
"role": "tool",
"tool_call_id": function_id,
"content": function_result,
}
)
except Exception as e:
print(f"Error executing function: {e}")
# Add tool_calls to assistant message
assistant_message["tool_calls"] = assistant_tool_calls
# Create a follow-up message with all function results
follow_up_messages = [
{"role": "user", "content": original_query},
assistant_message,
]
# Add all tool results to the messages
follow_up_messages.extend(tool_results)
# Get completion with all tool results in a single follow-up
follow_up_response = client.chat.completions.create(
model=client.models.list().data[0].id,
messages=follow_up_messages,
stream=False,
)
print("\n--- Follow-up Response ---")
print(follow_up_response.choices[0].message.content)
print("--- End Follow-up ---\n")
print("--- End Response ---\n")
def run_test_case(query, test_name):
"""Run a single test case with the given query"""
print(f"\n{'=' * 50}\nTEST CASE: {test_name}\n{'=' * 50}")
print(f"Query: '{query}'")
start_time = time.time()
# Create non-streaming chat completion request
response = client.chat.completions.create(
model=client.models.list().data[0].id,
messages=[{"role": "user", "content": query}],
tools=tools,
tool_choice="auto",
stream=False,
)
# Process the non-streaming response, passing the original query
process_response(response, tool_functions, query)
end_time = time.time()
print(f"Test completed in {end_time - start_time:.2f} seconds")
def main():
# Initialize OpenAI client
global client
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
# Run test cases
test_cases = [
("I want to know the weather in San Francisco", "Weather Information"),
("Calculate 25 * 17 + 31", "Math Calculation"),
("Translate 'Hello world' to Spanish", "Text Translation"),
("What is the weather in Tokyo and New York in celsius", "Multiple Tool Usage"),
]
# Execute all test cases
for query, test_name in test_cases:
run_test_case(query, test_name)
time.sleep(1) # Small delay between tests
print("\nAll tests completed.")
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
"""
Set up this example by starting a vLLM OpenAI-compatible server with tool call
options enabled for xLAM-2 models:
vllm serve --model Salesforce/Llama-xLAM-2-8b-fc-r --enable-auto-tool-choice --tool-call-parser xlam
OR
vllm serve --model Salesforce/xLAM-2-3b-fc-r --enable-auto-tool-choice --tool-call-parser xlam
This example demonstrates streaming tool calls with xLAM models.
"""
import json
import time
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "empty"
openai_api_base = "http://localhost:8000/v1"
# Define tool functions
def get_weather(location: str, unit: str):
return f"Weather in {location} is 22 degrees {unit}."
def calculate_expression(expression: str):
try:
result = eval(expression)
return f"The result of {expression} is {result}"
except Exception as e:
return f"Could not calculate {expression}: {e}"
def translate_text(text: str, target_language: str):
return f"Translation of '{text}' to {target_language}: [translated content]"
# Define tools
tools = [
{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "City and state, e.g., 'San Francisco, CA'",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location", "unit"],
},
},
},
{
"type": "function",
"function": {
"name": "calculate_expression",
"description": "Calculate a mathematical expression",
"parameters": {
"type": "object",
"properties": {
"expression": {
"type": "string",
"description": "Mathematical expression to evaluate, needs to be a valid Python expression",
}
},
"required": ["expression"],
},
},
},
{
"type": "function",
"function": {
"name": "translate_text",
"description": "Translate text to another language",
"parameters": {
"type": "object",
"properties": {
"text": {"type": "string", "description": "Text to translate"},
"target_language": {
"type": "string",
"description": "Target language for translation",
},
},
"required": ["text", "target_language"],
},
},
},
]
# Map of function names to implementations
tool_functions = {
"get_weather": get_weather,
"calculate_expression": calculate_expression,
"translate_text": translate_text,
}
def process_stream(response, tool_functions, original_query):
"""Process a streaming response with possible tool calls"""
# Track multiple tool calls
tool_calls = {} # Dictionary to store tool calls by ID
current_id = None
print("\n--- Stream Output ---")
for chunk in response:
# Handle tool calls in the stream
if chunk.choices[0].delta.tool_calls:
for tool_call_chunk in chunk.choices[0].delta.tool_calls:
# Get the tool call ID
if hasattr(tool_call_chunk, "id") and tool_call_chunk.id:
current_id = tool_call_chunk.id
if current_id not in tool_calls:
tool_calls[current_id] = {
"function_name": None,
"function_args": "",
"function_id": current_id,
}
# Extract function information as it comes in chunks
if (
hasattr(tool_call_chunk, "function")
and current_id
and current_id in tool_calls
):
if (
hasattr(tool_call_chunk.function, "name")
and tool_call_chunk.function.name
):
tool_calls[current_id]["function_name"] = (
tool_call_chunk.function.name
)
print(f"Function called: {tool_call_chunk.function.name}")
if (
hasattr(tool_call_chunk.function, "arguments")
and tool_call_chunk.function.arguments
):
tool_calls[current_id]["function_args"] += (
tool_call_chunk.function.arguments
)
print(f"Arguments chunk: {tool_call_chunk.function.arguments}")
# Handle regular content in the stream
elif chunk.choices[0].delta.content:
print(chunk.choices[0].delta.content, end="")
print("\n--- End Stream ---\n")
# Execute each function call and build messages for follow-up
follow_up_messages = [{"role": "user", "content": original_query}]
for tool_id, tool_data in tool_calls.items():
function_name = tool_data["function_name"]
function_args = tool_data["function_args"]
function_id = tool_data["function_id"]
if function_name and function_args:
try:
# Parse the JSON arguments
args = json.loads(function_args)
# Call the function with the arguments
function_result = tool_functions[function_name](**args)
print(
f"\n--- Function Result ({function_name}) ---\n{function_result}\n"
)
# Add the assistant message with tool call
follow_up_messages.append(
{
"role": "assistant",
"tool_calls": [
{
"id": function_id,
"type": "function",
"function": {
"name": function_name,
"arguments": function_args,
},
}
],
}
)
# Add the tool message with function result
follow_up_messages.append(
{
"role": "tool",
"tool_call_id": function_id,
"content": function_result,
}
)
except Exception as e:
print(f"Error executing function: {e}")
# Only send follow-up if we have results to process
if len(follow_up_messages) > 1:
# Create a follow-up message with all the function results
follow_up_response = client.chat.completions.create(
model=client.models.list().data[0].id,
messages=follow_up_messages,
stream=True,
)
print("\n--- Follow-up Response ---")
for chunk in follow_up_response:
if chunk.choices[0].delta.content:
print(chunk.choices[0].delta.content, end="")
print("\n--- End Follow-up ---\n")
def run_test_case(query, test_name):
"""Run a single test case with the given query"""
print(f"\n{'=' * 50}\nTEST CASE: {test_name}\n{'=' * 50}")
print(f"Query: '{query}'")
start_time = time.time()
# Create streaming chat completion request
response = client.chat.completions.create(
model=client.models.list().data[0].id,
messages=[{"role": "user", "content": query}],
tools=tools,
tool_choice="auto",
stream=True,
)
# Process the streaming response
process_stream(response, tool_functions, query)
end_time = time.time()
print(f"Test completed in {end_time - start_time:.2f} seconds")
def main():
# Initialize OpenAI client
global client
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
# Run test cases
test_cases = [
("I want to know the weather in San Francisco", "Weather Information"),
("Calculate 25 * 17 + 31", "Math Calculation"),
("Translate 'Hello world' to Spanish", "Text Translation"),
("What is the weather in Tokyo and New York in celsius", "Multiple Tool Usage"),
]
# Execute all test cases
for query, test_name in test_cases:
run_test_case(query, test_name)
time.sleep(1) # Small delay between tests
print("\nAll tests completed.")
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
To run this example, you need to start the vLLM server:
```bash
vllm serve Qwen/Qwen2.5-3B-Instruct
```
"""
from enum import Enum
from openai import BadRequestError, OpenAI
from pydantic import BaseModel
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
# Guided decoding by Choice (list of possible options)
def guided_choice_completion(client: OpenAI, model: str):
completion = client.chat.completions.create(
model=model,
messages=[
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
],
extra_body={"guided_choice": ["positive", "negative"]},
)
return completion.choices[0].message.content
# Guided decoding by Regex
def guided_regex_completion(client: OpenAI, model: str):
prompt = (
"Generate an email address for Alan Turing, who works in Enigma."
"End in .com and new line. Example result:"
"alan.turing@enigma.com\n"
)
completion = client.chat.completions.create(
model=model,
messages=[
{
"role": "user",
"content": prompt,
}
],
extra_body={"guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"]},
)
return completion.choices[0].message.content
# Guided decoding by JSON using Pydantic schema
class CarType(str, Enum):
sedan = "sedan"
suv = "SUV"
truck = "Truck"
coupe = "Coupe"
class CarDescription(BaseModel):
brand: str
model: str
car_type: CarType
def guided_json_completion(client: OpenAI, model: str):
json_schema = CarDescription.model_json_schema()
prompt = (
"Generate a JSON with the brand, model and car_type of"
"the most iconic car from the 90's"
)
completion = client.chat.completions.create(
model=model,
messages=[
{
"role": "user",
"content": prompt,
}
],
extra_body={"guided_json": json_schema},
)
return completion.choices[0].message.content
# Guided decoding by Grammar
def guided_grammar_completion(client: OpenAI, model: str):
simplified_sql_grammar = """
root ::= select_statement
select_statement ::= "SELECT " column " from " table " where " condition
column ::= "col_1 " | "col_2 "
table ::= "table_1 " | "table_2 "
condition ::= column "= " number
number ::= "1 " | "2 "
"""
prompt = (
"Generate an SQL query to show the 'username' and 'email'"
"from the 'users' table."
)
completion = client.chat.completions.create(
model=model,
messages=[
{
"role": "user",
"content": prompt,
}
],
extra_body={"guided_grammar": simplified_sql_grammar},
)
return completion.choices[0].message.content
# Extra backend options
def extra_backend_options_completion(client: OpenAI, model: str):
prompt = (
"Generate an email address for Alan Turing, who works in Enigma."
"End in .com and new line. Example result:"
"alan.turing@enigma.com\n"
)
try:
# The guided_decoding_disable_fallback option forces vLLM to use
# xgrammar, so when it fails you get a 400 with the reason why
completion = client.chat.completions.create(
model=model,
messages=[
{
"role": "user",
"content": prompt,
}
],
extra_body={
"guided_regex": r"\w+@\w+\.com\n",
"stop": ["\n"],
"guided_decoding_disable_fallback": True,
},
)
return completion.choices[0].message.content
except BadRequestError as e:
print("This error is expected:", e)
def main():
client: OpenAI = OpenAI(
base_url=openai_api_base,
api_key=openai_api_key,
)
model = client.models.list().data[0].id
print("Guided Choice Completion:")
print(guided_choice_completion(client, model))
print("\nGuided Regex Completion:")
print(guided_regex_completion(client, model))
print("\nGuided JSON Completion:")
print(guided_json_completion(client, model))
print("\nGuided Grammar Completion:")
print(guided_grammar_completion(client, model))
print("\nExtra Backend Options Completion:")
print(extra_backend_options_completion(client, model))
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from openai import OpenAI
# This example demonstrates the `structural_tag` response format.
# It can be used to specify a structured output format that occurs between
# specific tags in the response. This example shows how it could be used
# to enforce the format of a tool call response, but it could be used for
# any structured output within a subset of the response.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
def main():
client = OpenAI(
base_url=openai_api_base,
api_key=openai_api_key,
)
messages = [
{
"role": "user",
"content": """
You have access to the following function to retrieve the weather in a city:
{
"name": "get_weather",
"parameters": {
"city": {
"param_type": "string",
"description": "The city to get the weather for",
"required": True
}
}
}
If a you choose to call a function ONLY reply in the following format:
<{start_tag}={function_name}>{parameters}{end_tag}
where
start_tag => `<function`
parameters => a JSON dict with the function argument name as key and function
argument value as value.
end_tag => `</function>`
Here is an example,
<function=example_function_name>{"example_name": "example_value"}</function>
Reminder:
- Function calls MUST follow the specified format
- Required parameters MUST be specified
- Only call one function at a time
- Put the entire function call reply on one line
- Always add your sources when using search results to answer the user query
You are a helpful assistant.
Given the previous instructions, what is the weather in New York City, Boston,
and San Francisco?
""",
}
]
response = client.chat.completions.create(
model=client.models.list().data[0].id,
messages=messages,
response_format={
"type": "structural_tag",
"structures": [
{
"begin": "<function=get_weather>",
"schema": {
"type": "object",
"properties": {"city": {"type": "string"}},
},
"end": "</function>",
}
],
"triggers": ["<function="],
},
)
print(response)
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
An example shows how to generate structured outputs from reasoning models
like DeepSeekR1. The thinking process will not be guided by the JSON
schema provided by the user. Only the final output will be structured.
To run this example, you need to start the vLLM server with the reasoning
parser:
```bash
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
--reasoning-parser deepseek_r1
```
This example demonstrates how to generate chat completions from reasoning models
using the OpenAI Python client library.
"""
from enum import Enum
from openai import OpenAI
from pydantic import BaseModel
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
def print_completion_details(completion):
print("reasoning_content: ", completion.choices[0].message.reasoning_content)
print("content: ", completion.choices[0].message.content)
# Guided decoding by Regex
def guided_regex_completion(client: OpenAI, model: str):
prompt = "What is the capital of France?"
completion = client.chat.completions.create(
model=model,
messages=[
{
"role": "user",
"content": prompt,
}
],
extra_body={
"guided_regex": "(Paris|London)",
},
)
print_completion_details(completion)
class People(BaseModel):
name: str
age: int
def guided_json_completion(client: OpenAI, model: str):
json_schema = People.model_json_schema()
prompt = "Generate a JSON with the name and age of one random person."
completion = client.chat.completions.create(
model=model,
messages=[
{
"role": "user",
"content": prompt,
}
],
extra_body={"guided_json": json_schema},
)
print_completion_details(completion)
# Guided decoding by JSON using Pydantic schema
class CarType(str, Enum):
sedan = "sedan"
suv = "SUV"
truck = "Truck"
coupe = "Coupe"
class CarDescription(BaseModel):
brand: str
model: str
car_type: CarType
def guided_car_json_completion(client: OpenAI, model: str):
json_schema = CarDescription.model_json_schema()
prompt = (
"Generate a JSON with the brand, model and car_type of"
"the most iconic car from the 90's"
)
completion = client.chat.completions.create(
model=model,
messages=[
{
"role": "user",
"content": prompt,
}
],
extra_body={"guided_json": json_schema},
)
print_completion_details(completion)
# Guided decoding by Grammar
def guided_grammar_completion(client: OpenAI, model: str):
simplified_sql_grammar = """
root ::= select_statement
select_statement ::= "SELECT " column " from " table " where " condition
column ::= "col_1 " | "col_2 "
table ::= "table_1 " | "table_2 "
condition ::= column "= " number
number ::= "1 " | "2 "
"""
# This may be very slow https://github.com/vllm-project/vllm/issues/12122
prompt = (
"Generate an SQL query to show the 'username' and 'email'"
"from the 'users' table."
)
completion = client.chat.completions.create(
model=model,
messages=[
{
"role": "user",
"content": prompt,
}
],
extra_body={"guided_grammar": simplified_sql_grammar},
)
print_completion_details(completion)
def main():
client: OpenAI = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model: str = models.data[0].id
print("Guided Regex Completion:")
guided_regex_completion(client, model)
print("\nGuided JSON Completion (People):")
guided_json_completion(client, model)
print("\nGuided JSON Completion (CarDescription):")
guided_car_json_completion(client, model)
print("\nGuided Grammar Completion:")
guided_grammar_completion(client, model)
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import json
"""
This script demonstrates how to use the vLLM API server to perform audio
transcription with the `openai/whisper-large-v3` model.
import httpx
from openai import OpenAI
Before running this script, you must start the vLLM server with the following command:
from vllm.assets.audio import AudioAsset
vllm serve openai/whisper-large-v3
Requirements:
- vLLM with audio support
- openai Python SDK
- httpx for streaming support
The script performs:
1. Synchronous transcription using OpenAI-compatible API.
2. Streaming transcription using raw HTTP request to the vLLM server.
"""
import asyncio
mary_had_lamb = AudioAsset("mary_had_lamb").get_local_path()
winning_call = AudioAsset("winning_call").get_local_path()
from openai import AsyncOpenAI, OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
from vllm.assets.audio import AudioAsset
def sync_openai():
with open(str(mary_had_lamb), "rb") as f:
def sync_openai(audio_path: str, client: OpenAI):
"""
Perform synchronous transcription using OpenAI-compatible API.
"""
with open(audio_path, "rb") as f:
transcription = client.audio.transcriptions.create(
file=f,
model="openai/whisper-large-v3",
......@@ -37,38 +45,53 @@ def sync_openai():
print("transcription result:", transcription.text)
sync_openai()
# OpenAI Transcription API client does not support streaming.
async def stream_openai_response():
data = {
"language": "en",
"stream": True,
"model": "openai/whisper-large-v3",
}
url = openai_api_base + "/audio/transcriptions"
headers = {"Authorization": f"Bearer {openai_api_key}"}
print("transcription result:", end=" ")
async with httpx.AsyncClient() as client:
with open(str(winning_call), "rb") as f:
async with client.stream(
"POST", url, files={"file": f}, data=data, headers=headers
) as response:
async for line in response.aiter_lines():
# Each line is a JSON object prefixed with 'data: '
if line:
if line.startswith("data: "):
line = line[len("data: ") :]
# Last chunk, stream ends
if line.strip() == "[DONE]":
break
# Parse the JSON response
chunk = json.loads(line)
# Extract and print the content
content = chunk["choices"][0].get("delta", {}).get("content")
print(content, end="")
# Run the asynchronous function
asyncio.run(stream_openai_response())
async def stream_openai_response(audio_path: str, client: AsyncOpenAI):
"""
Perform asynchronous transcription using OpenAI-compatible API.
"""
print("\ntranscription result:", end=" ")
with open(audio_path, "rb") as f:
transcription = await client.audio.transcriptions.create(
file=f,
model="openai/whisper-large-v3",
language="en",
response_format="json",
temperature=0.0,
# Additional sampling params not provided by OpenAI API.
extra_body=dict(
seed=420,
top_p=0.6,
),
stream=True,
)
async for chunk in transcription:
if chunk.choices:
content = chunk.choices[0].get("delta", {}).get("content")
print(content, end="", flush=True)
print() # Final newline after stream ends
def main():
mary_had_lamb = str(AudioAsset("mary_had_lamb").get_local_path())
winning_call = str(AudioAsset("winning_call").get_local_path())
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
sync_openai(mary_had_lamb, client)
# Run the asynchronous function
client = AsyncOpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
asyncio.run(stream_openai_response(winning_call, client))
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import json
import httpx
from openai import OpenAI
from vllm.assets.audio import AudioAsset
def sync_openai(audio_path: str, client: OpenAI):
with open(audio_path, "rb") as f:
translation = client.audio.translations.create(
file=f,
model="openai/whisper-large-v3",
response_format="json",
temperature=0.0,
# Additional params not provided by OpenAI API.
extra_body=dict(
language="it",
seed=4419,
repetition_penalty=1.3,
),
)
print("translation result:", translation.text)
async def stream_openai_response(audio_path: str, base_url: str, api_key: str):
data = {
"language": "it",
"stream": True,
"model": "openai/whisper-large-v3",
}
url = base_url + "/audio/translations"
headers = {"Authorization": f"Bearer {api_key}"}
print("translation result:", end=" ")
# OpenAI translation API client does not support streaming.
async with httpx.AsyncClient() as client:
with open(audio_path, "rb") as f:
async with client.stream(
"POST", url, files={"file": f}, data=data, headers=headers
) as response:
async for line in response.aiter_lines():
# Each line is a JSON object prefixed with 'data: '
if line:
if line.startswith("data: "):
line = line[len("data: ") :]
# Last chunk, stream ends
if line.strip() == "[DONE]":
break
# Parse the JSON response
chunk = json.loads(line)
# Extract and print the content
content = chunk["choices"][0].get("delta", {}).get("content")
print(content, end="")
def main():
foscolo = str(AudioAsset("azacinto_foscolo").get_local_path())
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
sync_openai(foscolo, client)
# Run the asynchronous function
asyncio.run(stream_openai_response(foscolo, openai_api_base, openai_api_key))
if __name__ == "__main__":
main()
......@@ -2,7 +2,7 @@
1. Install OpenTelemetry packages:
```console
```bash
pip install \
'opentelemetry-sdk>=1.26.0,<1.27.0' \
'opentelemetry-api>=1.26.0,<1.27.0' \
......@@ -12,7 +12,7 @@
1. Start Jaeger in a docker container:
```console
```bash
# From: https://www.jaegertracing.io/docs/1.57/getting-started/
docker run --rm --name jaeger \
-e COLLECTOR_ZIPKIN_HOST_PORT=:9411 \
......@@ -31,14 +31,14 @@
1. In a new shell, export Jaeger IP:
```console
```bash
export JAEGER_IP=$(docker inspect --format '{{ .NetworkSettings.IPAddress }}' jaeger)
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
```
Then set vLLM's service name for OpenTelemetry, enable insecure connections to Jaeger and run vLLM:
```console
```bash
export OTEL_SERVICE_NAME="vllm-server"
export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true
vllm serve facebook/opt-125m --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"
......@@ -46,7 +46,7 @@
1. In a new shell, send requests with trace context from a dummy client
```console
```bash
export JAEGER_IP=$(docker inspect --format '{{ .NetworkSettings.IPAddress }}' jaeger)
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true
......@@ -67,7 +67,7 @@
OpenTelemetry supports either `grpc` or `http/protobuf` as the transport protocol for trace data in the exporter.
By default, `grpc` is used. To set `http/protobuf` as the protocol, configure the `OTEL_EXPORTER_OTLP_TRACES_PROTOCOL` environment variable as follows:
```console
```bash
export OTEL_EXPORTER_OTLP_TRACES_PROTOCOL=http/protobuf
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://$JAEGER_IP:4318/v1/traces
vllm serve facebook/opt-125m --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"
......@@ -79,13 +79,13 @@ OpenTelemetry allows automatic instrumentation of FastAPI.
1. Install the instrumentation library
```console
```bash
pip install opentelemetry-instrumentation-fastapi
```
1. Run vLLM with `opentelemetry-instrument`
```console
```bash
opentelemetry-instrument vllm serve facebook/opt-125m
```
......
......@@ -11,6 +11,7 @@ Features:
- Streaming response display
- Configurable API endpoint
- Real-time chat history
- Reasoning Display: Optional thinking process visualization
Requirements:
pip install streamlit openai
......@@ -51,13 +52,33 @@ if "messages" not in st.session_state:
if "active_session" not in st.session_state:
st.session_state.active_session = None
# Add new session state for reasoning
if "show_reasoning" not in st.session_state:
st.session_state.show_reasoning = {}
# Initialize session state for API base URL
if "api_base_url" not in st.session_state:
st.session_state.api_base_url = openai_api_base
def create_new_chat_session():
"""Create a new chat session with timestamp as ID"""
"""Create a new chat session with timestamp as unique identifier.
This function initializes a new chat session by:
1. Generating a timestamp-based session ID
2. Creating an empty message list for the new session
3. Setting the new session as both current and active session
4. Resetting the messages list for the new session
Returns:
None
Session State Updates:
- sessions: Adds new empty message list with timestamp key
- current_session: Sets to new session ID
- active_session: Sets to new session ID
- messages: Resets to empty list
"""
session_id = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
st.session_state.sessions[session_id] = []
st.session_state.current_session = session_id
......@@ -66,30 +87,98 @@ def create_new_chat_session():
def switch_to_chat_session(session_id):
"""Switch to a different chat session"""
"""Switch the active chat context to a different session.
Args:
session_id (str): The timestamp ID of the session to switch to
This function handles chat session switching by:
1. Setting the specified session as current
2. Updating the active session marker
3. Loading the messages history from the specified session
Session State Updates:
- current_session: Updated to specified session_id
- active_session: Updated to specified session_id
- messages: Loaded from sessions[session_id]
"""
st.session_state.current_session = session_id
st.session_state.active_session = session_id
st.session_state.messages = st.session_state.sessions[session_id]
def get_llm_response(messages, model):
"""Get streaming response from llm
def get_llm_response(messages, model, reason, content_ph=None, reasoning_ph=None):
"""Generate and stream LLM response with optional reasoning process.
Args:
messages: List of message dictionaries
model: Name of model
messages (list): List of conversation message dicts with 'role' and 'content'
model (str): The model identifier to use for generation
reason (bool): Whether to enable and display reasoning process
content_ph (streamlit.empty): Placeholder for streaming response content
reasoning_ph (streamlit.empty): Placeholder for streaming reasoning process
Returns:
Streaming response object or error message string
tuple: (str, str)
- First string contains the complete response text
- Second string contains the complete reasoning text (if enabled)
Features:
- Streams both reasoning and response text in real-time
- Handles model API errors gracefully
- Supports live updating of thinking process
- Maintains separate content and reasoning displays
Raises:
Exception: Wrapped in error message if API call fails
Note:
The function uses streamlit placeholders for live updates.
When reason=True, the reasoning process appears above the response.
"""
full_text = ""
think_text = ""
live_think = None
# Build request parameters
params = {"model": model, "messages": messages, "stream": True}
if reason:
params["extra_body"] = {"chat_template_kwargs": {"enable_thinking": True}}
try:
response = client.chat.completions.create(
model=model, messages=messages, stream=True
)
return response
response = client.chat.completions.create(**params)
if isinstance(response, str):
if content_ph:
content_ph.markdown(response)
return response, ""
# Prepare reasoning expander above content
if reason and reasoning_ph:
exp = reasoning_ph.expander("💭 Thinking Process (live)", expanded=True)
live_think = exp.empty()
# Stream chunks
for chunk in response:
delta = chunk.choices[0].delta
# Stream reasoning first
if reason and hasattr(delta, "reasoning_content") and live_think:
rc = delta.reasoning_content
if rc:
think_text += rc
live_think.markdown(think_text + "▌")
# Then stream content
if hasattr(delta, "content") and delta.content and content_ph:
full_text += delta.content
content_ph.markdown(full_text + "▌")
# Finalize displays: reasoning remains above, content below
if reason and live_think:
live_think.markdown(think_text)
if content_ph:
content_ph.markdown(full_text)
return full_text, think_text
except Exception as e:
st.error(f"Error details: {str(e)}")
return f"Error: {str(e)}"
return f"Error: {str(e)}", ""
# Sidebar - API Settings first
......@@ -108,6 +197,7 @@ st.sidebar.title("Chat Sessions")
if st.sidebar.button("New Session"):
create_new_chat_session()
# Display all sessions in reverse chronological order
for session_id in sorted(st.session_state.sessions.keys(), reverse=True):
# Mark the active session with a pinned button
......@@ -143,47 +233,79 @@ if st.session_state.current_session is None:
create_new_chat_session()
st.session_state.active_session = st.session_state.current_session
# Display chat history for current session
for message in st.session_state.messages:
with st.chat_message(message["role"]):
st.write(message["content"])
# Update the chat history display section
for idx, msg in enumerate(st.session_state.messages):
# Render user messages normally
if msg["role"] == "user":
with st.chat_message("user"):
st.write(msg["content"])
# Render assistant messages with reasoning above
else:
# If reasoning exists for this assistant message, show it above the content
if idx in st.session_state.show_reasoning:
with st.expander("💭 Thinking Process", expanded=False):
st.markdown(st.session_state.show_reasoning[idx])
with st.chat_message("assistant"):
st.write(msg["content"])
# Setup & Cache reasoning support check
@st.cache_data(show_spinner=False)
def server_supports_reasoning():
"""Check if the current model supports reasoning capability.
Returns:
bool: True if the model supports reasoning, False otherwise
"""
resp = client.chat.completions.create(
model=model,
messages=[{"role": "user", "content": "Hi"}],
stream=False,
)
return hasattr(resp.choices[0].message, "reasoning_content") and bool(
resp.choices[0].message.reasoning_content
)
# Handle user input and generate llm response
# Check support
supports_reasoning = server_supports_reasoning()
# Add reasoning toggle in sidebar if supported
reason = False # Default to False
if supports_reasoning:
reason = st.sidebar.checkbox("Enable Reasoning", value=False)
else:
st.sidebar.markdown(
"<span style='color:gray;'>Reasoning unavailable for this model.</span>",
unsafe_allow_html=True,
)
# reason remains False
# Update the input handling section
if prompt := st.chat_input("Type your message here..."):
# Save user message to session
# Save and display user message
st.session_state.messages.append({"role": "user", "content": prompt})
st.session_state.sessions[st.session_state.current_session] = (
st.session_state.messages
)
# Display user message
with st.chat_message("user"):
st.write(prompt)
# Prepare messages for llm
messages_for_llm = [
# Prepare LLM messages
msgs = [
{"role": m["role"], "content": m["content"]} for m in st.session_state.messages
]
# Generate and display llm response
# Stream assistant response
with st.chat_message("assistant"):
message_placeholder = st.empty()
full_response = ""
# Get streaming response from llm
response = get_llm_response(messages_for_llm, model)
if isinstance(response, str):
message_placeholder.markdown(response)
full_response = response
else:
for chunk in response:
if hasattr(chunk.choices[0].delta, "content"):
content = chunk.choices[0].delta.content
if content:
full_response += content
message_placeholder.markdown(full_response + "▌")
message_placeholder.markdown(full_response)
# Save llm response to session history
st.session_state.messages.append({"role": "assistant", "content": full_response})
# Placeholders: reasoning above, content below
reason_ph = st.empty()
content_ph = st.empty()
full, think = get_llm_response(msgs, model, reason, content_ph, reason_ph)
# Determine index for this new assistant message
message_index = len(st.session_state.messages)
# Save assistant reply
st.session_state.messages.append({"role": "assistant", "content": full})
# Persist reasoning in session state if any
if reason and think:
st.session_state.show_reasoning[message_index] = think
# Structured Outputs
This script demonstrates various structured output capabilities of vLLM's OpenAI-compatible server.
It can run individual constraint type or all of them.
It supports both streaming responses and concurrent non-streaming requests.
To use this example, you must start an vLLM server with any model of your choice.
```bash
vllm serve Qwen/Qwen2.5-3B-Instruct
```
To serve a reasoning model, you can use the following command:
```bash
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B \
--reasoning-parser deepseek_r1
```
If you want to run this script standalone with `uv`, you can use the following:
```bash
uvx --from git+https://github.com/vllm-project/vllm#subdirectory=examples/online_serving/structured_outputs \
structured-output
```
See [feature docs](https://docs.vllm.ai/en/latest/features/structured_outputs.html) for more information.
!!! tip
If vLLM is running remotely, then set `OPENAI_BASE_URL=<remote_url>` before running the script.
## Usage
Run all constraints, non-streaming:
```bash
uv run structured_outputs.py
```
Run all constraints, streaming:
```bash
uv run structured_outputs.py --stream
```
Run certain constraints, for example `structural_tag` and `regex`, streaming:
```bash
uv run structured_outputs.py \
--constraint structural_tag regex \
--stream
```
Run all constraints, with reasoning models and streaming:
```bash
uv run structured_outputs.py --reasoning --stream
```
[project]
name = "examples-online-structured-outputs"
requires-python = ">=3.9, <3.13"
dependencies = ["openai==1.78.1", "pydantic==2.11.4"]
version = "0.0.0"
[project.scripts]
structured-outputs = "structured_outputs:main"
# ruff: noqa: E501
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations
import argparse
import asyncio
import enum
import os
from typing import TYPE_CHECKING, Any, Literal
import openai
import pydantic
if TYPE_CHECKING:
from openai.types.chat import ChatCompletionChunk
ConstraintsFormat = Literal[
"choice",
"regex",
"json",
"grammar",
"structural_tag",
]
async def print_stream_response(
stream_response: openai.AsyncStream[ChatCompletionChunk],
title: str,
args: argparse.Namespace,
):
print(f"\n\n{title} (Streaming):")
local_reasoning_header_printed = False
local_content_header_printed = False
async for chunk in stream_response:
delta = chunk.choices[0].delta
reasoning_chunk_text: str | None = getattr(delta, "reasoning_content", None)
content_chunk_text = delta.content
if args.reasoning:
if reasoning_chunk_text:
if not local_reasoning_header_printed:
print(" Reasoning: ", end="")
local_reasoning_header_printed = True
print(reasoning_chunk_text, end="", flush=True)
if content_chunk_text:
if not local_content_header_printed:
if local_reasoning_header_printed:
print()
print(" Content: ", end="")
local_content_header_printed = True
print(content_chunk_text, end="", flush=True)
else:
if content_chunk_text:
if not local_content_header_printed:
print(" Content: ", end="")
local_content_header_printed = True
print(content_chunk_text, end="", flush=True)
print()
class CarType(str, enum.Enum):
SEDAN = "SEDAN"
SUV = "SUV"
TRUCK = "TRUCK"
COUPE = "COUPE"
class CarDescription(pydantic.BaseModel):
brand: str
model: str
car_type: CarType
PARAMS: dict[ConstraintsFormat, dict[str, Any]] = {
"choice": {
"messages": [
{
"role": "user",
"content": "Classify this sentiment: vLLM is wonderful!",
}
],
"extra_body": {"guided_choice": ["positive", "negative"]},
},
"regex": {
"messages": [
{
"role": "user",
"content": "Generate an email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: 'alan.turing@enigma.com\n'",
}
],
"extra_body": {
"guided_regex": r"[a-z0-9.]{1,20}@\w{6,10}\.com\n",
},
},
"json": {
"messages": [
{
"role": "user",
"content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's",
}
],
"response_format": {
"type": "json_schema",
"json_schema": {
"name": "car-description",
"schema": CarDescription.model_json_schema(),
},
},
},
"grammar": {
"messages": [
{
"role": "user",
"content": "Generate an SQL query to show the 'username' and 'email'from the 'users' table.",
}
],
"extra_body": {
"guided_grammar": """
root ::= select_statement
select_statement ::= "SELECT " column " from " table " where " condition
column ::= "col_1 " | "col_2 "
table ::= "table_1 " | "table_2 "
condition ::= column "= " number
number ::= "1 " | "2 "
""",
},
},
"structural_tag": {
"messages": [
{
"role": "user",
"content": """
You have access to the following function to retrieve the weather in a city:
{
"name": "get_weather",
"parameters": {
"city": {
"param_type": "string",
"description": "The city to get the weather for",
"required": True
}
}
}
If a you choose to call a function ONLY reply in the following format:
<{start_tag}={function_name}>{parameters}{end_tag}
where
start_tag => `<function`
parameters => a JSON dict with the function argument name as key and function
argument value as value.
end_tag => `</function>`
Here is an example,
<function=example_function_name>{"example_name": "example_value"}</function>
Reminder:
- Function calls MUST follow the specified format
- Required parameters MUST be specified
- Only call one function at a time
- Put the entire function call reply on one line
- Always add your sources when using search results to answer the user query
You are a helpful assistant.
Given the previous instructions, what is the weather in New York City, Boston,
and San Francisco?""",
},
],
"response_format": {
"type": "structural_tag",
"structures": [
{
"begin": "<function=get_weather>",
"schema": {
"type": "object",
"properties": {"city": {"type": "string"}},
"required": ["city"],
},
"end": "</function>",
}
],
"triggers": ["<function="],
},
},
}
async def cli():
parser = argparse.ArgumentParser(
description="Run OpenAI Chat Completion with various structured outputs capabilities",
)
_ = parser.add_argument(
"--constraint",
type=str,
nargs="+",
choices=[*list(PARAMS), "*"],
default=["*"],
help="Specify which constraint(s) to run.",
)
_ = parser.add_argument(
"--stream",
action=argparse.BooleanOptionalAction,
default=False,
help="Enable streaming output",
)
_ = parser.add_argument(
"--reasoning",
action=argparse.BooleanOptionalAction,
default=False,
help="Enable printing of reasoning traces if available.",
)
args = parser.parse_args()
base_url = os.getenv("OPENAI_BASE_URL", "http://localhost:8000/v1")
client = openai.AsyncOpenAI(base_url=base_url, api_key="EMPTY")
constraints = list(PARAMS) if "*" in args.constraint else list(set(args.constraint))
model = (await client.models.list()).data[0].id
if args.stream:
results = await asyncio.gather(
*[
client.chat.completions.create(
model=model,
max_tokens=1024,
stream=True,
**PARAMS[name],
)
for name in constraints
]
)
for constraint, stream in zip(constraints, results):
await print_stream_response(stream, constraint, args)
else:
results = await asyncio.gather(
*[
client.chat.completions.create(
model=model,
max_tokens=1024,
stream=False,
**PARAMS[name],
)
for name in constraints
]
)
for constraint, response in zip(constraints, results):
print(f"\n\n{constraint}:")
message = response.choices[0].message
if args.reasoning and hasattr(message, "reasoning_content"):
print(f" Reasoning: {message.reasoning_content or ''}")
print(f" Content: {message.content!r}")
def main():
asyncio.run(cli())
if __name__ == "__main__":
main()
......@@ -17,7 +17,8 @@ Usage:
(Without enable_chunked_prefill)
Note that `lmcache` is needed to run this example.
Requirements: Linux, Python: 3.10 or higher, CUDA: 12.1
Requirements:
https://docs.lmcache.ai/getting_started/installation.html#prerequisites
Learn more about LMCache environment setup, please refer to:
https://docs.lmcache.ai/getting_started/installation.html
"""
......@@ -28,8 +29,8 @@ import os
import time
from dataclasses import asdict
from lmcache.experimental.cache_engine import LMCacheEngineBuilder
from lmcache.integration.vllm.utils import ENGINE_NAME
from lmcache.v1.cache_engine import LMCacheEngineBuilder
from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig
......
......@@ -17,8 +17,8 @@ import subprocess
import time
from multiprocessing import Event, Process
from lmcache.experimental.cache_engine import LMCacheEngineBuilder
from lmcache.integration.vllm.utils import ENGINE_NAME
from lmcache.v1.cache_engine import LMCacheEngineBuilder
from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig
......@@ -105,7 +105,7 @@ def run_retrieve(store_done, prompts, timeout=1):
def run_lmcache_server(port):
server_proc = subprocess.Popen(
["python", "-m", "lmcache.experimental.server", "localhost", str(port)]
["python", "-m", "lmcache.v1.server", "localhost", str(port)]
)
return server_proc
......
......@@ -55,33 +55,33 @@ STDOUT of the console in JSON format with a log level of `INFO`.
To begin, first, create an appropriate JSON logging configuration file:
**/path/to/logging_config.json:**
```json
{
"formatters": {
"json": {
"class": "pythonjsonlogger.jsonlogger.JsonFormatter"
}
},
"handlers": {
"console": {
"class" : "logging.StreamHandler",
"formatter": "json",
"level": "INFO",
"stream": "ext://sys.stdout"
??? note "/path/to/logging_config.json"
```json
{
"formatters": {
"json": {
"class": "pythonjsonlogger.jsonlogger.JsonFormatter"
}
},
"handlers": {
"console": {
"class" : "logging.StreamHandler",
"formatter": "json",
"level": "INFO",
"stream": "ext://sys.stdout"
}
},
"loggers": {
"vllm": {
"handlers": ["console"],
"level": "INFO",
"propagate": false
}
},
"version": 1
}
},
"loggers": {
"vllm": {
"handlers": ["console"],
"level": "INFO",
"propagate": false
}
},
"version": 1
}
```
```
Finally, run vLLM with the `VLLM_LOGGING_CONFIG_PATH` environment variable set
to the path of the custom logging configuration JSON file:
......@@ -104,38 +104,38 @@ configuration overrides the built-in default logging configuration used by vLLM.
First, create an appropriate JSON logging configuration file that includes
configuration for the root vLLM logger and for the logger you wish to silence:
**/path/to/logging_config.json:**
```json
{
"formatters": {
"vllm": {
"class": "vllm.logging_utils.NewLineFormatter",
"datefmt": "%m-%d %H:%M:%S",
"format": "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s"
}
},
"handlers": {
"vllm": {
"class" : "logging.StreamHandler",
"formatter": "vllm",
"level": "INFO",
"stream": "ext://sys.stdout"
??? note "/path/to/logging_config.json"
```json
{
"formatters": {
"vllm": {
"class": "vllm.logging_utils.NewLineFormatter",
"datefmt": "%m-%d %H:%M:%S",
"format": "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s"
}
},
"handlers": {
"vllm": {
"class" : "logging.StreamHandler",
"formatter": "vllm",
"level": "INFO",
"stream": "ext://sys.stdout"
}
},
"loggers": {
"vllm": {
"handlers": ["vllm"],
"level": "DEBUG",
"propagate": false
},
"vllm.example_noisy_logger": {
"propagate": false
}
},
"version": 1
}
},
"loggers": {
"vllm": {
"handlers": ["vllm"],
"level": "DEBUG",
"propagate": false
},
"vllm.example_noisy_logger": {
"propagate": false
}
},
"version": 1
}
```
```
Finally, run vLLM with the `VLLM_LOGGING_CONFIG_PATH` environment variable set
to the path of the custom logging configuration JSON file:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment