Merge tag 'v0.8.5' into v0.8.5-ori

081057de · zhuwenwen · 7cf5d5c4 · ba41cc90 · 081057de · 081057de
Commit 081057de authored Apr 29, 2025 by zhuwenwen
20 changed files
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -150,7 +150,7 @@ def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
    engine_args = EngineArgs(
        model="microsoft/Florence-2-large",
-        tokenizer="facebook/bart-large",
+        tokenizer="Isotr0py/Florence-2-tokenizer",
        max_model_len=4096,
        max_num_seqs=2,
        trust_remote_code=True,
@@ -364,6 +364,29 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
    )
+# Kimi-VL
+def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
+    """Build engine arguments and image prompts for Kimi-VL-A3B-Instruct.
+
+    Only the "image" modality is handled. Each question is wrapped in the
+    user/assistant chat markers with one image placeholder in front of it.
+    """
+    assert modality == "image"
+    # Fixed text placed before and after every question.
+    image_turn = ("<|im_user|>user<|im_middle|><|media_start|>image"
+                  "<|media_content|><|media_pad|><|media_end|>")
+    assistant_turn = "<|im_end|><|im_assistant|>assistant<|im_middle|>"
+    chat_prompts = [
+        f"{image_turn}{question}{assistant_turn}" for question in questions
+    ]
+    kimi_engine_args = EngineArgs(
+        model="moonshotai/Kimi-VL-A3B-Instruct",
+        trust_remote_code=True,
+        max_model_len=4096,
+        limit_mm_per_prompt={"image": 1},
+    )
+    return ModelRequestData(engine_args=kimi_engine_args,
+                            prompts=chat_prompts)
 # LLaVA-1.5
 def run_llava(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
@@ -791,10 +814,13 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
    engine_args = EngineArgs(
        model=model_path,
        trust_remote_code=True,
-        max_model_len=4096,
+        max_model_len=5120,
        max_num_seqs=2,
+        max_num_batched_tokens=12800,
        enable_lora=True,
        max_lora_rank=320,
+        # Note - mm_processor_kwargs can also be passed to generate/chat calls
+        mm_processor_kwargs={"dynamic_hd": 16},
        limit_mm_per_prompt={"image": 1},
    )
@@ -918,6 +944,42 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
    )
+# Qwen2.5-Omni
+def run_qwen2_5_omni(questions: list[str],
+                     modality: str) -> ModelRequestData:
+    """Build engine arguments and prompts for Qwen2.5-Omni-7B.
+
+    Args:
+        questions: Questions to ask; one prompt is produced per question.
+        modality: "image" or "video"; selects the vision placeholder token.
+
+    Returns:
+        The engine arguments together with the formatted prompts.
+
+    Raises:
+        ValueError: If ``modality`` is neither "image" nor "video".
+    """
+    # Resolve the placeholder up front so an unsupported modality fails with
+    # a clear error rather than an UnboundLocalError when prompts are built.
+    placeholders = {"image": "<|IMAGE|>", "video": "<|VIDEO|>"}
+    if modality not in placeholders:
+        raise ValueError(
+            f"Unsupported modality for Qwen2.5-Omni: {modality}")
+    placeholder = placeholders[modality]
+    model_name = "Qwen/Qwen2.5-Omni-7B"
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=4096,
+        max_num_seqs=5,
+        mm_processor_kwargs={
+            "min_pixels": 28 * 28,
+            "max_pixels": 1280 * 28 * 28,
+            "fps": [1],
+        },
+        limit_mm_per_prompt={"image": 1},
+    )
+    default_system = (
+        "You are Qwen, a virtual human developed by the Qwen Team, Alibaba "
+        "Group, capable of perceiving auditory and visual inputs, as well as "
+        "generating text and speech.")
+    # System turn, then a user turn holding the vision placeholder and the
+    # question, then the opening of the assistant turn.
+    prompts = [(f"<|im_start|>system\n{default_system}<|im_end|>\n"
+                f"<|im_start|>user\n<|vision_bos|>{placeholder}<|vision_eos|>"
+                f"{question}<|im_end|>\n"
+                "<|im_start|>assistant\n") for question in questions]
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
 # SkyworkR1V
 def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
@@ -966,6 +1028,7 @@ model_example_map = {
    "h2ovl_chat": run_h2ovl,
    "idefics3": run_idefics3,
    "internvl_chat": run_internvl,
+    "kimi_vl": run_kimi_vl,
    "llava": run_llava,
    "llava-next": run_llava_next,
    "llava-next-video": run_llava_next_video,
@@ -986,6 +1049,7 @@ model_example_map = {
    "qwen_vl": run_qwen_vl,
    "qwen2_vl": run_qwen2_vl,
    "qwen2_5_vl": run_qwen2_5_vl,
+    "qwen2_5_omni": run_qwen2_5_omni,
    "skywork_chat": run_skyworkr1v,
    "smolvlm": run_smolvlm,
 }
@@ -1073,6 +1137,59 @@ def time_counter(enable: bool):
        yield
+def parse_args():
+    """Define the command-line options of this example and parse them.
+
+    Returns the namespace produced by ``parser.parse_args()``.
+    """
+    parser = FlexibleArgumentParser(
+        description='Demo on using vLLM for offline inference with '
+        'vision language models for text generation')
+    # Valid model types are exactly the keys of model_example_map.
+    parser.add_argument('--model-type',
+                        '-m',
+                        type=str,
+                        default="llava",
+                        choices=model_example_map.keys(),
+                        help='Huggingface "model_type".')
+    parser.add_argument('--num-prompts',
+                        type=int,
+                        default=4,
+                        help='Number of prompts to run.')
+    parser.add_argument('--modality',
+                        type=str,
+                        default="image",
+                        choices=['image', 'video'],
+                        help='Modality of the input.')
+    parser.add_argument('--num-frames',
+                        type=int,
+                        default=16,
+                        help='Number of frames to extract from the video.')
+    parser.add_argument("--seed",
+                        type=int,
+                        default=None,
+                        help="Set the seed when initializing `vllm.LLM`.")
+    parser.add_argument(
+        '--image-repeat-prob',
+        type=float,
+        default=None,
+        help='Simulates the hit-ratio for multi-modal preprocessor cache'
+        ' (if enabled)')
+    parser.add_argument(
+        '--disable-mm-preprocessor-cache',
+        action='store_true',
+        help='If True, disables caching of multi-modal preprocessor/mapper.')
+    parser.add_argument(
+        '--time-generate',
+        action='store_true',
+        help='If True, then print the total generate() call time')
+    parser.add_argument(
+        '--use-different-prompt-per-request',
+        action='store_true',
+        help='If True, then use different prompt (with the same multi-modal '
+        'data) for each request.')
+    return parser.parse_args()
 def main(args):
    model = args.model_type
    if model not in model_example_map:
@@ -1151,55 +1268,5 @@ def main(args):
 if __name__ == "__main__":
-    parser = FlexibleArgumentParser(
+    args = parse_args()
-        description='Demo on using vLLM for offline inference with '
-        'vision language models for text generation')
-    parser.add_argument('--model-type',
-                        '-m',
-                        type=str,
-                        default="llava",
-                        choices=model_example_map.keys(),
-                        help='Huggingface "model_type".')
-    parser.add_argument('--num-prompts',
-                        type=int,
-                        default=4,
-                        help='Number of prompts to run.')
-    parser.add_argument('--modality',
-                        type=str,
-                        default="image",
-                        choices=['image', 'video'],
-                        help='Modality of the input.')
-    parser.add_argument('--num-frames',
-                        type=int,
-                        default=16,
-                        help='Number of frames to extract from the video.')
-    parser.add_argument("--seed",
-                        type=int,
-                        default=None,
-                        help="Set the seed when initializing `vllm.LLM`.")
-    parser.add_argument(
-        '--image-repeat-prob',
-        type=float,
-        default=None,
-        help='Simulates the hit-ratio for multi-modal preprocessor cache'
-        ' (if enabled)')
-    parser.add_argument(
-        '--disable-mm-preprocessor-cache',
-        action='store_true',
-        help='If True, disables caching of multi-modal preprocessor/mapper.')
-    parser.add_argument(
-        '--time-generate',
-        action='store_true',
-        help='If True, then print the total generate() call time')
-    parser.add_argument(
-        '--use-different-prompt-per-request',
-        action='store_true',
-        help='If True, then use different prompt (with the same multi-modal '
-        'data) for each request.')
-    args = parser.parse_args()
    main(args)
--- a/examples/offline_inference/vision_language_embedding.py
+++ b/examples/offline_inference/vision_language_embedding.py
@@ -156,16 +156,13 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
        print("-" * 50)
-def main(args: Namespace):
-    run_encode(args.model_name, args.modality, args.seed)
 model_example_map = {
    "e5_v": run_e5_v,
    "vlm2vec": run_vlm2vec,
 }
-if __name__ == "__main__":
+def parse_args():
    parser = FlexibleArgumentParser(
        description='Demo on using vLLM for offline inference with '
        'vision language models for multimodal embedding')
@@ -184,6 +181,13 @@ if __name__ == "__main__":
                        type=int,
                        default=None,
                        help="Set the seed when initializing `vllm.LLM`.")
+    return parser.parse_args()
-    args = parser.parse_args()
+def main(args: Namespace):
+    """Run the embedding demo with the parsed model name, modality and seed."""
+    run_encode(args.model_name, args.modality, args.seed)
+if __name__ == "__main__":
+    args = parse_args()
    main(args)
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -326,6 +326,44 @@ def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
    )
+def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:
+    """Prepare a multi-image request for Kimi-VL-A3B-Instruct.
+
+    The prompt is rendered with the processor's chat template from a single
+    user turn: one image entry per URL followed by the text question.
+    """
+    checkpoint = "moonshotai/Kimi-VL-A3B-Instruct"
+    kimi_engine_args = EngineArgs(
+        model=checkpoint,
+        trust_remote_code=True,
+        max_model_len=4096,
+        max_num_seqs=4,
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+    # Image entries come first, then the text question.
+    user_content = [{"type": "image", "image": url} for url in image_urls]
+    user_content.append({"type": "text", "text": question})
+    chat = [{"role": "user", "content": user_content}]
+    chat_processor = AutoProcessor.from_pretrained(checkpoint,
+                                                   trust_remote_code=True)
+    rendered_prompt = chat_processor.apply_chat_template(
+        chat, tokenize=False, add_generation_prompt=True)
+    images = [fetch_image(url) for url in image_urls]
+    return ModelRequestData(
+        engine_args=kimi_engine_args,
+        prompt=rendered_prompt,
+        image_data=images,
+    )
 def load_mistral3(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
@@ -465,11 +503,13 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
    engine_args = EngineArgs(
        model=model_path,
        trust_remote_code=True,
-        max_model_len=10000,
+        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
        enable_lora=True,
        max_lora_rank=320,
+        # Note - mm_processor_kwargs can also be passed to generate/chat calls
+        mm_processor_kwargs={"dynamic_hd": 4},
    )
    placeholders = "".join(f"<|image_{i}|>"
@@ -640,6 +680,7 @@ model_example_map = {
    "h2ovl_chat": load_h2ovl,
    "idefics3": load_idefics3,
    "internvl_chat": load_internvl,
+    "kimi_vl": load_kimi_vl,
    "llama4": load_llama4,
    "mistral3": load_mistral3,
    "mllama": load_mllama,
@@ -727,22 +768,7 @@ def run_chat(model: str, question: str, image_urls: list[str],
        print("-" * 50)
-def main(args: Namespace):
+def parse_args():
-    model = args.model_type
-    method = args.method
-    seed = args.seed
-    image_urls = IMAGE_URLS[:args.num_images]
-    if method == "generate":
-        run_generate(model, QUESTION, image_urls, seed)
-    elif method == "chat":
-        run_chat(model, QUESTION, image_urls, seed)
-    else:
-        raise ValueError(f"Invalid method: {method}")
-if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description='Demo on using vLLM for offline inference with '
        'vision language models that support multi-image input for text '
@@ -765,9 +791,29 @@ if __name__ == "__main__":
    parser.add_argument(
        "--num-images",
        "-n",
-        choices=list(range(1, 13)),  # 12 is the max number of images
+        type=int,
+        choices=list(range(1,
+                           len(IMAGE_URLS) + 1)),  # the max number of images
        default=2,
        help="Number of images to use for the demo.")
+    return parser.parse_args()
-    args = parser.parse_args()
+def main(args: Namespace):
+    """Run the multi-image demo through the method chosen on the CLI.
+
+    Raises:
+        ValueError: If ``args.method`` is neither "generate" nor "chat".
+    """
+    model, method, seed = args.model_type, args.method, args.seed
+    selected_urls = IMAGE_URLS[:args.num_images]
+    # Map each supported method name to the function that runs it.
+    runners = {"generate": run_generate, "chat": run_chat}
+    runner = runners.get(method)
+    if runner is None:
+        raise ValueError(f"Invalid method: {method}")
+    runner(model, QUESTION, selected_urls, seed)
+if __name__ == "__main__":
+    args = parse_args()
    main(args)
--- a/examples/online_serving/api_client.py
+++ b/examples/online_serving/api_client.py
@@ -58,6 +58,16 @@ def get_response(response: requests.Response) -> list[str]:
    return output
+def parse_args():
+    """Parse the client's command-line options (server address and prompt)."""
+    cli = argparse.ArgumentParser()
+    # (flag, value type, default) for every option that takes a value.
+    typed_options = (
+        ("--host", str, "localhost"),
+        ("--port", int, 8000),
+        ("--n", int, 1),
+        ("--prompt", str, "San Francisco is a"),
+    )
+    for flag, value_type, fallback in typed_options:
+        cli.add_argument(flag, type=value_type, default=fallback)
+    cli.add_argument("--stream", action="store_true")
+    return cli.parse_args()
 def main(args: Namespace):
    prompt = args.prompt
    api_url = f"http://{args.host}:{args.port}/generate"
@@ -82,11 +92,5 @@ def main(args: Namespace):
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
+    args = parse_args()
-    parser.add_argument("--host", type=str, default="localhost")
-    parser.add_argument("--port", type=int, default=8000)
-    parser.add_argument("--n", type=int, default=1)
-    parser.add_argument("--prompt", type=str, default="San Francisco is a")
-    parser.add_argument("--stream", action="store_true")
-    args = parser.parse_args()
    main(args)
--- a/examples/online_serving/cohere_rerank_client.py
+++ b/examples/online_serving/cohere_rerank_client.py
@@ -2,32 +2,46 @@
 """
 Example of using the OpenAI entrypoint's rerank API which is compatible with
 the Cohere SDK: https://github.com/cohere-ai/cohere-python
+Note that `pip install cohere` is needed to run this example.
 run: vllm serve BAAI/bge-reranker-base
 """
+from typing import Union
 import cohere
+from cohere import Client, ClientV2
+model = "BAAI/bge-reranker-base"
+query = "What is the capital of France?"
+documents = [
+    "The capital of France is Paris", "Reranking is fun!",
+    "vLLM is an open-source framework for fast AI serving"
+]
+def cohere_rerank(client: Union[Client, ClientV2], model: str, query: str,
+                  documents: list[str]) -> dict:
+    """Forward the model, query and documents to ``client.rerank``.
+
+    Returns whatever ``client.rerank`` returns.
+    NOTE(review): annotated as ``dict``; the concrete response type depends
+    on the cohere client - confirm against the SDK.
+    """
+    return client.rerank(model=model, query=query, documents=documents)
+def main():
+    """Rerank the sample documents with the Cohere v1 and v2 clients."""
+    server_url = "http://localhost:8000"
+    fake_key = "sk-fake-key"
+    divider = "-" * 50
+    # cohere v1 client
+    v1_client = cohere.Client(base_url=server_url, api_key=fake_key)
+    v1_result = cohere_rerank(v1_client, model, query, documents)
+    print(divider)
+    print("rerank_v1_result:\n", v1_result)
+    print(divider)
+    # or the v2
+    v2_client = cohere.ClientV2(fake_key, base_url=server_url)
+    v2_result = cohere_rerank(v2_client, model, query, documents)
+    print("rerank_v2_result:\n", v2_result)
+    print(divider)
-# cohere v1 client
+if __name__ == "__main__":
-co = cohere.Client(base_url="http://localhost:8000", api_key="sk-fake-key")
+    main()
-rerank_v1_result = co.rerank(
-    model="BAAI/bge-reranker-base",
-    query="What is the capital of France?",
-    documents=[
-        "The capital of France is Paris", "Reranking is fun!",
-        "vLLM is an open-source framework for fast AI serving"
-    ])
-print(rerank_v1_result)
-# or the v2
-co2 = cohere.ClientV2("sk-fake-key", base_url="http://localhost:8000")
-v2_rerank_result = co2.rerank(
-    model="BAAI/bge-reranker-base",
-    query="What is the capital of France?",
-    documents=[
-        "The capital of France is Paris", "Reranking is fun!",
-        "vLLM is an open-source framework for fast AI serving"
-    ])
-print(v2_rerank_result)
--- a/examples/online_serving/gradio_openai_chatbot_webserver.py
+++ b/examples/online_serving/gradio_openai_chatbot_webserver.py
 # SPDX-License-Identifier: Apache-2.0
+"""Example for starting a Gradio OpenAI Chatbot Webserver
+Start vLLM API server:
+    vllm serve meta-llama/Llama-2-7b-chat-hf
+Start Gradio OpenAI Chatbot Webserver:
+    python examples/online_serving/gradio_openai_chatbot_webserver.py \
+                    -m meta-llama/Llama-2-7b-chat-hf
+Note that `pip install --upgrade gradio` is needed to run this example.
+More details: https://github.com/gradio-app/gradio
+If your antivirus software blocks the download of frpc for gradio,
+you can install it manually by following these steps:
+1. Download this file: https://cdn-media.huggingface.co/frpc-gradio-0.3/frpc_linux_amd64
+2. Rename the downloaded file to: frpc_linux_amd64_v0.3
+3. Move the file to this location: /home/user/.cache/huggingface/gradio/frpc
+"""
 import argparse
 import gradio as gr
 from openai import OpenAI
-# Argument parser setup
-parser = argparse.ArgumentParser(
+def format_history_to_openai(history):
-    description='Chatbot Interface with Customizable Parameters')
-parser.add_argument('--model-url',
-                    type=str,
-                    default='http://localhost:8000/v1',
-                    help='Model URL')
-parser.add_argument('-m',
-                    '--model',
-                    type=str,
-                    required=True,
-                    help='Model name for the chatbot')
-parser.add_argument('--temp',
-                    type=float,
-                    default=0.8,
-                    help='Temperature for text generation')
-parser.add_argument('--stop-token-ids',
-                    type=str,
-                    default='',
-                    help='Comma-separated stop token IDs')
-parser.add_argument("--host", type=str, default=None)
-parser.add_argument("--port", type=int, default=8001)
-# Parse the arguments
-args = parser.parse_args()
-# Set OpenAI's API key and API base to use vLLM's API server.
-openai_api_key = "EMPTY"
-openai_api_base = args.model_url
-# Create an OpenAI client to interact with the API server
-client = OpenAI(
-    api_key=openai_api_key,
-    base_url=openai_api_base,
-)
-def predict(message, history):
-    # Convert chat history to OpenAI format
    history_openai_format = [{
        "role": "system",
-        "content": "You are a great ai assistant."
+        "content": "You are a great AI assistant."
    }]
    for human, assistant in history:
        history_openai_format.append({"role": "user", "content": human})
@@ -54,31 +34,92 @@ def predict(message, history):
            "role": "assistant",
            "content": assistant
        })
+    return history_openai_format
+def predict(message, history, client, model_name, temp, stop_token_ids):
+    """Send the conversation to the server and return the complete reply.
+
+    Args:
+        message: Latest user message.
+        history: Previous turns as (user, assistant) pairs.
+        client: OpenAI client used to issue the chat completion request.
+        model_name: Model name passed to the request.
+        temp: Sampling temperature.
+        stop_token_ids: Comma-separated stop token IDs; may be empty.
+
+    Returns:
+        The reply text, concatenated from all streamed chunks.
+    """
+    # Format history to OpenAI chat format
+    history_openai_format = format_history_to_openai(history)
    history_openai_format.append({"role": "user", "content": message})
-    # Create a chat completion request and send it to the API server
+    # Send request to OpenAI API (vLLM server)
    stream = client.chat.completions.create(
-        model=args.model,  # Model name to use
+        model=model_name,
-        messages=history_openai_format,  # Chat history
+        messages=history_openai_format,
-        temperature=args.temp,  # Temperature for text generation
+        temperature=temp,
-        stream=True,  # Stream response
+        stream=True,
        extra_body={
            'repetition_penalty':
            1,
-            'stop_token_ids': [
+            # Skip blank entries so input such as "1,2," or "1, ,2" never
+            # reaches int('') and raises ValueError.
+            'stop_token_ids': [
-                int(id.strip()) for id in args.stop_token_ids.split(',')
+                int(token_id.strip())
-                if id.strip()
+                for token_id in stop_token_ids.split(',')
+                if token_id.strip()
-            ] if args.stop_token_ids else []
+            ] if stop_token_ids else []
        })
-    # Read and return generated text from response stream
+    # Collect all chunks and concatenate them into a full message
-    partial_message = ""
+    full_message = ""
    for chunk in stream:
-        partial_message += (chunk.choices[0].delta.content or "")
+        full_message += (chunk.choices[0].delta.content or "")
-        yield partial_message
+    # Return the full message as a single response
+    return full_message
+def parse_args():
+    """Define the chatbot's command-line options and parse them.
+
+    ``--model`` is the only required option; the rest have defaults.
+    """
+    parser = argparse.ArgumentParser(
+        description='Chatbot Interface with Customizable Parameters')
+    parser.add_argument('--model-url',
+                        type=str,
+                        default='http://localhost:8000/v1',
+                        help='Model URL')
+    parser.add_argument('-m',
+                        '--model',
+                        type=str,
+                        required=True,
+                        help='Model name for the chatbot')
+    parser.add_argument('--temp',
+                        type=float,
+                        default=0.8,
+                        help='Temperature for text generation')
+    # Kept as a raw string here; it is split on commas when a request is sent.
+    parser.add_argument('--stop-token-ids',
+                        type=str,
+                        default='',
+                        help='Comma-separated stop token IDs')
+    # Address and port the Gradio web server listens on.
+    parser.add_argument("--host", type=str, default=None)
+    parser.add_argument("--port", type=int, default=8001)
+    return parser.parse_args()
+def build_gradio_interface(client, model_name, temp, stop_token_ids):
+    """Create the Gradio chat interface bound to the given client settings."""
+    # ChatInterface calls fn(message, history); the closure supplies the
+    # remaining arguments that predict() needs.
+    def chat_predict(message, history):
+        return predict(message, history, client, model_name, temp,
+                       stop_token_ids)
+    return gr.ChatInterface(fn=chat_predict,
+                            title="Chatbot Interface",
+                            description="A simple chatbot powered by vLLM")
+def main():
+    """Parse the CLI options, create the client and launch the chat UI."""
+    cli_args = parse_args()
+    # The client talks to the server given by --model-url; the API key is a
+    # fixed placeholder value.
+    vllm_client = OpenAI(api_key="EMPTY", base_url=cli_args.model_url)
+    chat_ui = build_gradio_interface(vllm_client, cli_args.model,
+                                     cli_args.temp, cli_args.stop_token_ids)
+    chat_ui.queue().launch(server_name=cli_args.host,
+                           server_port=cli_args.port,
+                           share=True)
-# Create and launch a chat interface with Gradio
+if __name__ == "__main__":
-gr.ChatInterface(predict).queue().launch(server_name=args.host,
+    main()
-                                         server_port=args.port,
-                                         share=True)
--- a/examples/online_serving/gradio_webserver.py
+++ b/examples/online_serving/gradio_webserver.py
 # SPDX-License-Identifier: Apache-2.0
+"""Example for starting a Gradio Webserver
+Start vLLM API server:
+    python -m vllm.entrypoints.api_server \
+        --model meta-llama/Llama-2-7b-chat-hf
+Start Webserver:
+    python examples/online_serving/gradio_webserver.py
+Note that `pip install --upgrade gradio` is needed to run this example.
+More details: https://github.com/gradio-app/gradio
+If your antivirus software blocks the download of frpc for gradio,
+you can install it manually by following these steps:
+1. Download this file: https://cdn-media.huggingface.co/frpc-gradio-0.3/frpc_linux_amd64
+2. Rename the downloaded file to: frpc_linux_amd64_v0.3
+3. Move the file to this location: /home/user/.cache/huggingface/gradio/frpc
+"""
 import argparse
 import json
@@ -39,16 +56,23 @@ def build_demo():
    return demo
-if __name__ == "__main__":
+def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default=None)
    parser.add_argument("--port", type=int, default=8001)
    parser.add_argument("--model-url",
                        type=str,
                        default="http://localhost:8000/generate")
-    args = parser.parse_args()
+    return parser.parse_args()
+def main(args):
+    """Build the demo and launch it on the host and port given in ``args``."""
    demo = build_demo()
    demo.queue().launch(server_name=args.host,
                        server_port=args.port,
                        share=True)
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
--- a/examples/online_serving/jinaai_rerank_client.py
+++ b/examples/online_serving/jinaai_rerank_client.py
@@ -23,12 +23,19 @@ data = {
        "The capital of France is Paris.", "Horses and cows are both animals"
    ]
 }
-response = requests.post(url, headers=headers, json=data)
-# Check the response
+def main():
+    """POST the rerank request and print the JSON result or the failure."""
-if response.status_code == 200:
+    response = requests.post(url, headers=headers, json=data)
-    print("Request successful!")
-    print(json.dumps(response.json(), indent=2))
+    # Check the response
-else:
+    if response.status_code == 200:
-    print(f"Request failed with status code: {response.status_code}")
+        print("Request successful!")
-    print(response.text)
+        print(json.dumps(response.json(), indent=2))
+    else:
+        print(f"Request failed with status code: {response.status_code}")
+        print(response.text)
+if __name__ == "__main__":
+    main()
--- a/examples/online_serving/openai_chat_completion_client.py
+++ b/examples/online_serving/openai_chat_completion_client.py
 # SPDX-License-Identifier: Apache-2.0
+"""Example Python client for OpenAI Chat Completion using vLLM API server
+NOTE: start a supported chat completion model server with `vllm serve`, e.g.
+    vllm serve meta-llama/Llama-2-7b-chat-hf
+"""
 from openai import OpenAI
 # Modify OpenAI's API key and API base to use vLLM's API server.
 openai_api_key = "EMPTY"
 openai_api_base = "http://localhost:8000/v1"
-client = OpenAI(
+messages = [{
-    # defaults to os.environ.get("OPENAI_API_KEY")
+    "role": "system",
-    api_key=openai_api_key,
+    "content": "You are a helpful assistant."
-    base_url=openai_api_base,
+}, {
-)
+    "role": "user",
+    "content": "Who won the world series in 2020?"
-models = client.models.list()
+}, {
-model = models.data[0].id
+    "role": "assistant",
+    "content": "The Los Angeles Dodgers won the World Series in 2020."
-chat_completion = client.chat.completions.create(
+}, {
-    messages=[{
+    "role": "user",
-        "role": "system",
+    "content": "Where was it played?"
-        "content": "You are a helpful assistant."
+}]
-    }, {
-        "role": "user",
-        "content": "Who won the world series in 2020?"
+def main():
+    """Request a chat completion for the module-level messages and print it.
+
+    The model used is the first one returned by ``client.models.list()``.
+    """
-    }, {
+    client = OpenAI(
-        "role":
+        # defaults to os.environ.get("OPENAI_API_KEY")
-        "assistant",
+        api_key=openai_api_key,
-        "content":
+        base_url=openai_api_base,
-        "The Los Angeles Dodgers won the World Series in 2020."
+    )
-    }, {
-        "role": "user",
+    models = client.models.list()
-        "content": "Where was it played?"
+    model = models.data[0].id
-    }],
-    model=model,
+    chat_completion = client.chat.completions.create(
-)
+        messages=messages,
+        model=model,
-print("Chat completion results:")
+    )
-print(chat_completion)
+    print("-" * 50)
+    print("Chat completion results:")
+    print(chat_completion)
+    print("-" * 50)
+if __name__ == "__main__":
+    main()
--- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py
+++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py
@@ -9,7 +9,7 @@ vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja
 (multi-image inference with Phi-3.5-vision-instruct)
 vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
-    --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2
+    --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt '{"image":2}'
 (audio inference with Ultravox)
 vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b --max-model-len 4096
@@ -303,12 +303,7 @@ example_function_map = {
 }
-def main(args) -> None:
+def parse_args():
-    chat_type = args.chat_type
-    example_function_map[chat_type]()
-if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description='Demo on using OpenAI client for online serving with '
        'multimodal language models served with vLLM.')
@@ -318,5 +313,14 @@ if __name__ == "__main__":
                        default="single-image",
                        choices=list(example_function_map.keys()),
                        help='Conversation type with multimodal data.')
-    args = parser.parse_args()
+    return parser.parse_args()
+def main(args) -> None:
+    """Look up the example registered for ``args.chat_type`` and run it."""
+    chat_type = args.chat_type
+    example_function_map[chat_type]()
+if __name__ == "__main__":
+    args = parse_args()
    main(args)
--- a/examples/online_serving/openai_chat_completion_client_with_tools.py
+++ b/examples/online_serving/openai_chat_completion_client_with_tools.py
@@ -17,6 +17,7 @@ vllm serve --model NousResearch/Hermes-2-Pro-Llama-3-8B \
            --enable-auto-tool-choice --tool-call-parser hermes
 """
 import json
+from typing import Any
 from openai import OpenAI
@@ -24,15 +25,6 @@ from openai import OpenAI
 openai_api_key = "EMPTY"
 openai_api_base = "http://localhost:8000/v1"
-client = OpenAI(
-    # defaults to os.environ.get("OPENAI_API_KEY")
-    api_key=openai_api_key,
-    base_url=openai_api_base,
-)
-models = client.models.list()
-model = models.data[0].id
 tools = [{
    "type": "function",
    "function": {
@@ -78,86 +70,123 @@ messages = [{
    "Can you tell me what the temperate will be in Dallas, in fahrenheit?"
 }]
-chat_completion = client.chat.completions.create(messages=messages,
-                                                 model=model,
-                                                 tools=tools)
-print("Chat completion results:")
-print(chat_completion)
-print("\n\n")
-tool_calls_stream = client.chat.completions.create(messages=messages,
-                                                   model=model,
-                                                   tools=tools,
-                                                   stream=True)
-chunks = []
-for chunk in tool_calls_stream:
-    chunks.append(chunk)
-    if chunk.choices[0].delta.tool_calls:
-        print(chunk.choices[0].delta.tool_calls[0])
-    else:
-        print(chunk.choices[0].delta)
-arguments = []
-tool_call_idx = -1
-for chunk in chunks:
-    if chunk.choices[0].delta.tool_calls:
-        tool_call = chunk.choices[0].delta.tool_calls[0]
-        if tool_call.index != tool_call_idx:
-            if tool_call_idx >= 0:
-                print(
-                    f"streamed tool call arguments: {arguments[tool_call_idx]}"
-                )
-            tool_call_idx = chunk.choices[0].delta.tool_calls[0].index
-            arguments.append("")
-        if tool_call.id:
-            print(f"streamed tool call id: {tool_call.id} ")
-        if tool_call.function:
-            if tool_call.function.name:
-                print(f"streamed tool call name: {tool_call.function.name}")
-            if tool_call.function.arguments:
-                arguments[tool_call_idx] += tool_call.function.arguments
-if len(arguments):
-    print(f"streamed tool call arguments: {arguments[-1]}")
-print("\n\n")
-messages.append({
-    "role": "assistant",
-    "tool_calls": chat_completion.choices[0].message.tool_calls
-})
-# Now, simulate a tool call
 def get_current_weather(city: str, state: str, unit: 'str'):
    return ("The weather in Dallas, Texas is 85 degrees fahrenheit. It is "
            "partly cloudly, with highs in the 90's.")
-available_tools = {"get_current_weather": get_current_weather}
+def handle_tool_calls_stream(
+    client: OpenAI,
-completion_tool_calls = chat_completion.choices[0].message.tool_calls
+    messages: list[dict[str, str]],
-for call in completion_tool_calls:
+    model: str,
-    tool_to_call = available_tools[call.function.name]
+    tools: list[dict[str, Any]],
-    args = json.loads(call.function.arguments)
+) -> list[Any]:
-    result = tool_to_call(**args)
+    tool_calls_stream = client.chat.completions.create(messages=messages,
-    print(result)
+                                                       model=model,
+                                                       tools=tools,
+                                                       stream=True)
+    chunks = []
+    print("chunks: ")
+    for chunk in tool_calls_stream:
+        chunks.append(chunk)
+        if chunk.choices[0].delta.tool_calls:
+            print(chunk.choices[0].delta.tool_calls[0])
+        else:
+            print(chunk.choices[0].delta)
+    return chunks
+def handle_tool_calls_arguments(chunks: list[Any]) -> list[str]:
+    arguments = []
+    tool_call_idx = -1
+    print("arguments: ")
+    for chunk in chunks:
+        if chunk.choices[0].delta.tool_calls:
+            tool_call = chunk.choices[0].delta.tool_calls[0]
+            if tool_call.index != tool_call_idx:
+                if tool_call_idx >= 0:
+                    print(f"streamed tool call arguments: "
+                          f"{arguments[tool_call_idx]}")
+                tool_call_idx = chunk.choices[0].delta.tool_calls[0].index
+                arguments.append("")
+            if tool_call.id:
+                print(f"streamed tool call id: {tool_call.id} ")
+            if tool_call.function:
+                if tool_call.function.name:
+                    print(
+                        f"streamed tool call name: {tool_call.function.name}")
+                if tool_call.function.arguments:
+                    arguments[tool_call_idx] += tool_call.function.arguments
+    return arguments
+def main():
+    # Initialize OpenAI client
+    client = OpenAI(
+        # defaults to os.environ.get("OPENAI_API_KEY")
+        api_key=openai_api_key,
+        base_url=openai_api_base,
+    )
+    # Get available models and select one
+    models = client.models.list()
+    model = models.data[0].id
+    chat_completion = client.chat.completions.create(messages=messages,
+                                                     model=model,
+                                                     tools=tools)
+    print("-" * 70)
+    print("Chat completion results:")
+    print(chat_completion)
+    print("-" * 70)
+    # Stream tool calls
+    chunks = handle_tool_calls_stream(client, messages, model, tools)
+    print("-" * 70)
+    # Handle arguments from streamed tool calls
+    arguments = handle_tool_calls_arguments(chunks)
+    if len(arguments):
+        print(f"streamed tool call arguments: {arguments[-1]}\n")
+    print("-" * 70)
+    # Add the assistant's tool calls to the conversation
    messages.append({
-        "role": "tool",
+        "role": "assistant",
-        "content": result,
+        "tool_calls": chat_completion.choices[0].message.tool_calls
-        "tool_call_id": call.id,
-        "name": call.function.name
    })
-chat_completion_2 = client.chat.completions.create(messages=messages,
+    # Now, simulate a tool call
-                                                   model=model,
+    available_tools = {"get_current_weather": get_current_weather}
-                                                   tools=tools,
-                                                   stream=False)
+    completion_tool_calls = chat_completion.choices[0].message.tool_calls
-print("\n\n")
+    for call in completion_tool_calls:
-print(chat_completion_2)
+        tool_to_call = available_tools[call.function.name]
+        args = json.loads(call.function.arguments)
+        result = tool_to_call(**args)
+        print("tool_to_call result: ", result)
+        messages.append({
+            "role": "tool",
+            "content": result,
+            "tool_call_id": call.id,
+            "name": call.function.name
+        })
+    chat_completion_2 = client.chat.completions.create(messages=messages,
+                                                       model=model,
+                                                       tools=tools,
+                                                       stream=False)
+    print("Chat completion2 results:")
+    print(chat_completion_2)
+    print("-" * 70)
+if __name__ == "__main__":
+    main()
--- a/examples/online_serving/openai_chat_completion_client_with_tools_required.py
+++ b/examples/online_serving/openai_chat_completion_client_with_tools_required.py
 # SPDX-License-Identifier: Apache-2.0
 """
-To run this example, you can start the vLLM server 
+To run this example, you can start the vLLM server
 without any specific flags:
 ```bash
@@ -8,7 +8,7 @@ VLLM_USE_V1=0 vllm serve unsloth/Llama-3.2-1B-Instruct \
    --guided-decoding-backend outlines
 ```
-This example demonstrates how to generate chat completions 
+This example demonstrates how to generate chat completions
 using the OpenAI Python client library.
 """
@@ -18,15 +18,6 @@ from openai import OpenAI
 openai_api_key = "EMPTY"
 openai_api_base = "http://localhost:8000/v1"
-client = OpenAI(
-    # defaults to os.environ.get("OPENAI_API_KEY")
-    api_key=openai_api_key,
-    base_url=openai_api_base,
-)
-models = client.models.list()
-model = models.data[0].id
 tools = [
    {
        "type": "function",
@@ -116,21 +107,36 @@ messages = [
    },
 ]
-chat_completion = client.chat.completions.create(
-    messages=messages,
-    model=model,
-    tools=tools,
-    tool_choice="required",
-    stream=True  # Enable streaming response
-)
-for chunk in chat_completion:
+def main():
-    if chunk.choices and chunk.choices[0].delta.tool_calls:
+    client = OpenAI(
-        print(chunk.choices[0].delta.tool_calls)
+        # defaults to os.environ.get("OPENAI_API_KEY")
+        api_key=openai_api_key,
+        base_url=openai_api_base,
+    )
+    models = client.models.list()
+    model = models.data[0].id
+    chat_completion = client.chat.completions.create(
+        messages=messages,
+        model=model,
+        tools=tools,
+        tool_choice="required",
+        stream=True  # Enable streaming response
+    )
+    for chunk in chat_completion:
+        if chunk.choices and chunk.choices[0].delta.tool_calls:
+            print(chunk.choices[0].delta.tool_calls)
+    chat_completion = client.chat.completions.create(messages=messages,
+                                                     model=model,
+                                                     tools=tools,
+                                                     tool_choice="required")
+    print(chat_completion.choices[0].message.tool_calls)
-chat_completion = client.chat.completions.create(messages=messages,
-                                                 model=model,
-                                                 tools=tools,
-                                                 tool_choice="required")
-print(chat_completion.choices[0].message.tool_calls)
+if __name__ == "__main__":
+    main()
--- a/examples/online_serving/openai_chat_completion_structured_outputs.py
+++ b/examples/online_serving/openai_chat_completion_structured_outputs.py
 # SPDX-License-Identifier: Apache-2.0
+"""
+To run this example, you need to start the vLLM server:
+```bash
+vllm serve Qwen/Qwen2.5-3B-Instruct
+```
+"""
 from enum import Enum
 from openai import BadRequestError, OpenAI
 from pydantic import BaseModel
-client = OpenAI(
-    base_url="http://localhost:8000/v1",
-    api_key="-",
-)
 # Guided decoding by Choice (list of possible options)
-completion = client.chat.completions.create(
+def guided_choice_completion(client: OpenAI, model: str):
-    model="Qwen/Qwen2.5-3B-Instruct",
+    completion = client.chat.completions.create(
-    messages=[{
+        model=model,
-        "role": "user",
+        messages=[{
-        "content": "Classify this sentiment: vLLM is wonderful!"
+            "role": "user",
-    }],
+            "content": "Classify this sentiment: vLLM is wonderful!"
-    extra_body={"guided_choice": ["positive", "negative"]},
+        }],
-)
+        extra_body={"guided_choice": ["positive", "negative"]},
-print(completion.choices[0].message.content)
+    )
+    return completion.choices[0].message.content
 # Guided decoding by Regex
-prompt = ("Generate an email address for Alan Turing, who works in Enigma."
+def guided_regex_completion(client: OpenAI, model: str):
-          "End in .com and new line. Example result:"
+    prompt = ("Generate an email address for Alan Turing, who works in Enigma."
-          "alan.turing@enigma.com\n")
+              "End in .com and new line. Example result:"
+              "alan.turing@enigma.com\n")
-completion = client.chat.completions.create(
-    model="Qwen/Qwen2.5-3B-Instruct",
+    completion = client.chat.completions.create(
-    messages=[{
+        model=model,
-        "role": "user",
+        messages=[{
-        "content": prompt,
+            "role": "user",
-    }],
+            "content": prompt,
-    extra_body={
+        }],
-        "guided_regex": "\w+@\w+\.com\n",
+        extra_body={
-        "stop": ["\n"]
+            "guided_regex": r"\w+@\w+\.com\n",
-    },
+            "stop": ["\n"]
-)
+        },
-print(completion.choices[0].message.content)
+    )
+    return completion.choices[0].message.content
 # Guided decoding by JSON using Pydantic schema
@@ -54,66 +60,100 @@ class CarDescription(BaseModel):
    car_type: CarType
-json_schema = CarDescription.model_json_schema()
+def guided_json_completion(client: OpenAI, model: str):
+    json_schema = CarDescription.model_json_schema()
-prompt = ("Generate a JSON with the brand, model and car_type of"
-          "the most iconic car from the 90's")
-completion = client.chat.completions.create(
-    model="Qwen/Qwen2.5-3B-Instruct",
-    messages=[{
-        "role": "user",
-        "content": prompt,
-    }],
-    extra_body={"guided_json": json_schema},
-)
-print(completion.choices[0].message.content)
-# Guided decoding by Grammar
+    prompt = ("Generate a JSON with the brand, model and car_type of"
-simplified_sql_grammar = """
+              "the most iconic car from the 90's")
-    ?start: select_statement
+    completion = client.chat.completions.create(
+        model=model,
+        messages=[{
+            "role": "user",
+            "content": prompt,
+        }],
+        extra_body={"guided_json": json_schema},
+    )
+    return completion.choices[0].message.content
-    ?select_statement: "SELECT " column_list " FROM " table_name
-    ?column_list: column_name ("," column_name)*
+# Guided decoding by Grammar
+def guided_grammar_completion(client: OpenAI, model: str):
+    simplified_sql_grammar = """
+        root ::= select_statement
-    ?table_name: identifier
+        select_statement ::= "SELECT " column " from " table " where " condition
-    ?column_name: identifier
+        column ::= "col_1 " | "col_2 "
-    ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
+        table ::= "table_1 " | "table_2 "
-"""
-prompt = ("Generate an SQL query to show the 'username' and 'email'"
+        condition ::= column "= " number
-          "from the 'users' table.")
-completion = client.chat.completions.create(
-    model="Qwen/Qwen2.5-3B-Instruct",
-    messages=[{
-        "role": "user",
-        "content": prompt,
-    }],
-    extra_body={"guided_grammar": simplified_sql_grammar},
-)
-print(completion.choices[0].message.content)
-# Extra backend options
+        number ::= "1 " | "2 "
-prompt = ("Generate an email address for Alan Turing, who works in Enigma."
+    """
-          "End in .com and new line. Example result:"
-          "alan.turing@enigma.com\n")
-try:
+    prompt = ("Generate an SQL query to show the 'username' and 'email'"
-    # The no-fallback option forces vLLM to use xgrammar, so when it fails
+              "from the 'users' table.")
-    # you get a 400 with the reason why
    completion = client.chat.completions.create(
-        model="Qwen/Qwen2.5-3B-Instruct",
+        model=model,
        messages=[{
            "role": "user",
            "content": prompt,
        }],
-        extra_body={
+        extra_body={"guided_grammar": simplified_sql_grammar},
-            "guided_regex": "\w+@\w+\.com\n",
-            "stop": ["\n"],
-            "guided_decoding_backend": "xgrammar:no-fallback"
-        },
    )
-except BadRequestError as e:
+    return completion.choices[0].message.content
-    print("This error is expected:", e)
+# Extra backend options
+def extra_backend_options_completion(client: OpenAI, model: str):
+    prompt = ("Generate an email address for Alan Turing, who works in Enigma."
+              "End in .com and new line. Example result:"
+              "alan.turing@enigma.com\n")
+    try:
+        # The no-fallback option forces vLLM to use xgrammar, so when it fails
+        # you get a 400 with the reason why
+        completion = client.chat.completions.create(
+            model=model,
+            messages=[{
+                "role": "user",
+                "content": prompt,
+            }],
+            extra_body={
+                "guided_regex": r"\w+@\w+\.com\n",
+                "stop": ["\n"],
+                "guided_decoding_backend": "xgrammar:no-fallback"
+            },
+        )
+        return completion.choices[0].message.content
+    except BadRequestError as e:
+        print("This error is expected:", e)
+def main():
+    client: OpenAI = OpenAI(
+        base_url="http://localhost:8000/v1",
+        api_key="-",
+    )
+    model = "Qwen/Qwen2.5-3B-Instruct"
+    print("Guided Choice Completion:")
+    print(guided_choice_completion(client, model))
+    print("\nGuided Regex Completion:")
+    print(guided_regex_completion(client, model))
+    print("\nGuided JSON Completion:")
+    print(guided_json_completion(client, model))
+    print("\nGuided Grammar Completion:")
+    print(guided_grammar_completion(client, model))
+    print("\nExtra Backend Options Completion:")
+    print(extra_backend_options_completion(client, model))
+if __name__ == "__main__":
+    main()
--- a/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py
+++ b/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py
+# SPDX-License-Identifier: Apache-2.0
+from openai import OpenAI
+# This example demonstrates the `structural_tag` response format.
+# It can be used to specify a structured output format that occurs between
+# specific tags in the response. This example shows how it could be used
+# to enforce the format of a tool call response, but it could be used for
+# any structured output within a subset of the response.
+def main():
+    client = OpenAI(
+        base_url="http://localhost:8000/v1",
+        api_key="-",
+    )
+    messages = [{
+        "role":
+        "user",
+        "content":
+        """
+You have access to the following function to retrieve the weather in a city:
+    {
+        "name": "get_weather",
+        "parameters": {
+            "city": {
+                "param_type": "string",
+                "description": "The city to get the weather for",
+                "required": True
+            }
+        }
+    }
+If you choose to call a function ONLY reply in the following format:
+<{start_tag}={function_name}>{parameters}{end_tag}
+where
+start_tag => `<function`
+parameters => a JSON dict with the function argument name as key and function
+              argument value as value.
+end_tag => `</function>`
+Here is an example,
+<function=example_function_name>{"example_name": "example_value"}</function>
+Reminder:
+- Function calls MUST follow the specified format
+- Required parameters MUST be specified
+- Only call one function at a time
+- Put the entire function call reply on one line
+- Always add your sources when using search results to answer the user query
+You are a helpful assistant.
+Given the previous instructions, what is the weather in New York City, Boston,
+and San Francisco?
+"""
+    }]
+    response = client.chat.completions.create(
+        model="meta-llama/Llama-3.1-8B-Instruct",
+        messages=messages,
+        response_format={
+            "type":
+            "structural_tag",
+            "structures": [{
+                "begin": "<function=get_weather>",
+                "schema": {
+                    "type": "object",
+                    "properties": {
+                        "city": {
+                            "type": "string"
+                        }
+                    }
+                },
+                "end": "</function>"
+            }],
+            "triggers": ["<function="]
+        })
+    print(response)
+if __name__ == "__main__":
+    main()
--- a/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
+++ b/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
@@ -25,29 +25,28 @@ from pydantic import BaseModel
 openai_api_key = "EMPTY"
 openai_api_base = "http://localhost:8000/v1"
-client = OpenAI(
-    api_key=openai_api_key,
-    base_url=openai_api_base,
-)
-models = client.models.list()
+def print_completion_details(completion):
-model = models.data[0].id
+    print("reasoning_content: ",
+          completion.choices[0].message.reasoning_content)
+    print("content: ", completion.choices[0].message.content)
 # Guided decoding by Regex
-prompt = ("What is the capital of France?")
+def guided_regex_completion(client: OpenAI, model: str):
+    prompt = ("What is the capital of France?")
-completion = client.chat.completions.create(
-    model=model,
+    completion = client.chat.completions.create(
-    messages=[{
+        model=model,
-        "role": "user",
+        messages=[{
-        "content": prompt,
+            "role": "user",
-    }],
+            "content": prompt,
-    extra_body={
+        }],
-        "guided_regex": "(Paris|London)",
+        extra_body={
-    },
+            "guided_regex": "(Paris|London)",
-)
+        },
-print("reasoning_content: ", completion.choices[0].message.reasoning_content)
+    )
-print("content: ", completion.choices[0].message.content)
+    print_completion_details(completion)
 class People(BaseModel):
@@ -55,19 +54,19 @@ class People(BaseModel):
    age: int
-json_schema = People.model_json_schema()
+def guided_json_completion(client: OpenAI, model: str):
+    json_schema = People.model_json_schema()
-prompt = ("Generate a JSON with the name and age of one random person.")
+    prompt = ("Generate a JSON with the name and age of one random person.")
-completion = client.chat.completions.create(
+    completion = client.chat.completions.create(
-    model=model,
+        model=model,
-    messages=[{
+        messages=[{
-        "role": "user",
+            "role": "user",
-        "content": prompt,
+            "content": prompt,
-    }],
+        }],
-    extra_body={"guided_json": json_schema},
+        extra_body={"guided_json": json_schema},
-)
+    )
-print("reasoning_content: ", completion.choices[0].message.reasoning_content)
+    print_completion_details(completion)
-print("content: ", completion.choices[0].message.content)
 # Guided decoding by JSON using Pydantic schema
@@ -84,46 +83,73 @@ class CarDescription(BaseModel):
    car_type: CarType
-json_schema = CarDescription.model_json_schema()
+def guided_car_json_completion(client: OpenAI, model: str):
+    json_schema = CarDescription.model_json_schema()
+    prompt = ("Generate a JSON with the brand, model and car_type of"
+              "the most iconic car from the 90's")
+    completion = client.chat.completions.create(
+        model=model,
+        messages=[{
+            "role": "user",
+            "content": prompt,
+        }],
+        extra_body={"guided_json": json_schema},
+    )
+    print_completion_details(completion)
-prompt = ("Generate a JSON with the brand, model and car_type of"
-          "the most iconic car from the 90's")
-completion = client.chat.completions.create(
-    model=model,
-    messages=[{
-        "role": "user",
-        "content": prompt,
-    }],
-    extra_body={"guided_json": json_schema},
-)
-print("reasoning_content: ", completion.choices[0].message.reasoning_content)
-print("content: ", completion.choices[0].message.content)
 # Guided decoding by Grammar
-simplified_sql_grammar = """
+def guided_grammar_completion(client: OpenAI, model: str):
-    ?start: select_statement
+    simplified_sql_grammar = """
+        root ::= select_statement
-    ?select_statement: "SELECT " column_list " FROM " table_name
+        select_statement ::= "SELECT " column " from " table " where " condition
-    ?column_list: column_name ("," column_name)*
+        column ::= "col_1 " | "col_2 "
-    ?table_name: identifier
+        table ::= "table_1 " | "table_2 "
-    ?column_name: identifier
+        condition ::= column "= " number
+        number ::= "1 " | "2 "
+    """
+    # This may be very slow https://github.com/vllm-project/vllm/issues/12122
+    prompt = ("Generate an SQL query to show the 'username' and 'email'"
+              "from the 'users' table.")
+    completion = client.chat.completions.create(
+        model=model,
+        messages=[{
+            "role": "user",
+            "content": prompt,
+        }],
+        extra_body={"guided_grammar": simplified_sql_grammar},
+    )
+    print_completion_details(completion)
+def main():
+    client: OpenAI = OpenAI(
+        api_key=openai_api_key,
+        base_url=openai_api_base,
+    )
+    models = client.models.list()
+    model: str = models.data[0].id
+    print("Guided Regex Completion:")
+    guided_regex_completion(client, model)
+    print("\nGuided JSON Completion (People):")
+    guided_json_completion(client, model)
+    print("\nGuided JSON Completion (CarDescription):")
+    guided_car_json_completion(client, model)
+    print("\nGuided Grammar Completion:")
+    guided_grammar_completion(client, model)
-    ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
-"""
-# This may be very slow https://github.com/vllm-project/vllm/issues/12122
+if __name__ == "__main__":
-prompt = ("Generate an SQL query to show the 'username' and 'email'"
+    main()
-          "from the 'users' table.")
-completion = client.chat.completions.create(
-    model=model,
-    messages=[{
-        "role": "user",
-        "content": prompt,
-    }],
-    extra_body={"guided_grammar": simplified_sql_grammar},
-)
-print("reasoning_content: ", completion.choices[0].message.reasoning_content)
-print("content: ", completion.choices[0].message.content)
--- a/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
+++ b/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
@@ -31,14 +31,6 @@ available_tools = {"get_current_weather": get_current_weather}
 openai_api_key = "EMPTY"
 openai_api_base = "http://localhost:8000/v1"
-client = OpenAI(
-    api_key=openai_api_key,
-    base_url=openai_api_base,
-)
-models = client.models.list()
-model = models.data[0].id
 tools = [{
    "type": "function",
    "function": {
@@ -109,69 +101,87 @@ def extract_reasoning_and_calls(chunks: list):
    return reasoning_content, arguments, function_names
-print("---------Full Generate With Automatic Function Calling-------------")
+def main():
-tool_calls = client.chat.completions.create(messages=messages,
+    client = OpenAI(
-                                            model=model,
+        api_key=openai_api_key,
-                                            tools=tools)
+        base_url=openai_api_base,
-print(f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}")
+    )
-print(f"function name: "
-      f"{tool_calls.choices[0].message.tool_calls[0].function.name}")
+    models = client.models.list()
-print(f"function arguments: "
+    model = models.data[0].id
-      f"{tool_calls.choices[0].message.tool_calls[0].function.arguments}")
+    print(
-print("----------Stream Generate With Automatic Function Calling-----------")
+        "---------Full Generate With Automatic Function Calling-------------")
-tool_calls_stream = client.chat.completions.create(messages=messages,
+    tool_calls = client.chat.completions.create(messages=messages,
-                                                   model=model,
+                                                model=model,
-                                                   tools=tools,
+                                                tools=tools)
-                                                   stream=True)
+    print(
-chunks = []
+        f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}"
-for chunk in tool_calls_stream:
+    )
-    chunks.append(chunk)
+    print(f"function name: "
+          f"{tool_calls.choices[0].message.tool_calls[0].function.name}")
-reasoning_content, arguments, function_names = extract_reasoning_and_calls(
+    print(f"function arguments: "
-    chunks)
+          f"{tool_calls.choices[0].message.tool_calls[0].function.arguments}")
-print(f"reasoning_content: {reasoning_content}")
+    print(
-print(f"function name: {function_names[0]}")
+        "----------Stream Generate With Automatic Function Calling-----------")
-print(f"function arguments: {arguments[0]}")
+    tool_calls_stream = client.chat.completions.create(messages=messages,
+                                                       model=model,
-print("----------Full Generate With Named Function Calling-----------------")
+                                                       tools=tools,
-tool_calls = client.chat.completions.create(messages=messages,
+                                                       stream=True)
-                                            model=model,
-                                            tools=tools,
+    chunks = list(tool_calls_stream)
-                                            tool_choice={
-                                                "type": "function",
+    reasoning_content, arguments, function_names = extract_reasoning_and_calls(
-                                                "function": {
+        chunks)
-                                                    "name":
-                                                    "get_current_weather"
+    print(f"reasoning_content: {reasoning_content}")
-                                                }
+    print(f"function name: {function_names[0]}")
-                                            })
+    print(f"function arguments: {arguments[0]}")
-tool_call = tool_calls.choices[0].message.tool_calls[0].function
+    print(
-print(f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}")
+        "----------Full Generate With Named Function Calling-----------------")
-print(f"function name: {tool_call.name}")
+    tool_calls = client.chat.completions.create(messages=messages,
-print(f"function arguments: {tool_call.arguments}")
+                                                model=model,
-print("----------Stream Generate With Named Function Calling--------------")
+                                                tools=tools,
+                                                tool_choice={
-tool_calls_stream = client.chat.completions.create(
+                                                    "type": "function",
-    messages=messages,
+                                                    "function": {
-    model=model,
+                                                        "name":
-    tools=tools,
+                                                        "get_current_weather"
-    tool_choice={
+                                                    }
-        "type": "function",
+                                                })
-        "function": {
-            "name": "get_current_weather"
+    tool_call = tool_calls.choices[0].message.tool_calls[0].function
-        }
+    print(
-    },
+        f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}"
-    stream=True)
+    )
+    print(f"function name: {tool_call.name}")
-chunks = []
+    print(f"function arguments: {tool_call.arguments}")
-for chunk in tool_calls_stream:
+    print(
-    chunks.append(chunk)
+        "----------Stream Generate With Named Function Calling--------------")
-reasoning_content, arguments, function_names = extract_reasoning_and_calls(
+    tool_calls_stream = client.chat.completions.create(
-    chunks)
+        messages=messages,
-print(f"reasoning_content: {reasoning_content}")
+        model=model,
-print(f"function name: {function_names[0]}")
+        tools=tools,
-print(f"function arguments: {arguments[0]}")
+        tool_choice={
-print("\n\n")
+            "type": "function",
+            "function": {
+                "name": "get_current_weather"
+            }
+        },
+        stream=True)
+    chunks = list(tool_calls_stream)
+    reasoning_content, arguments, function_names = extract_reasoning_and_calls(
+        chunks)
+    print(f"reasoning_content: {reasoning_content}")
+    print(f"function name: {function_names[0]}")
+    print(f"function arguments: {arguments[0]}")
+    print("\n\n")
+if __name__ == "__main__":
+    main()
--- a/examples/online_serving/openai_chat_completion_with_reasoning.py
+++ b/examples/online_serving/openai_chat_completion_with_reasoning.py
@@ -3,8 +3,8 @@
 An example shows how to generate chat completions from reasoning models
 like DeepSeekR1.
-To run this example, you need to start the vLLM server with the reasoning 
+To run this example, you need to start the vLLM server
-parser:
+with the reasoning parser:
 ```bash
 vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
@@ -21,35 +21,44 @@ from openai import OpenAI
 openai_api_key = "EMPTY"
 openai_api_base = "http://localhost:8000/v1"
-client = OpenAI(
-    api_key=openai_api_key,
-    base_url=openai_api_base,
-)
-models = client.models.list()
+def main():
-model = models.data[0].id
+    client = OpenAI(
+        api_key=openai_api_key,
+        base_url=openai_api_base,
+    )
-# Round 1
+    models = client.models.list()
-messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
+    model = models.data[0].id
-# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
-response = client.chat.completions.create(model=model, messages=messages)
-reasoning_content = response.choices[0].message.reasoning_content
+    # Round 1
-content = response.choices[0].message.content
+    messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
+    # ruff: noqa: E501
+    # For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
+    response = client.chat.completions.create(model=model, messages=messages)
-print("reasoning_content for Round 1:", reasoning_content)
+    reasoning_content = response.choices[0].message.reasoning_content
-print("content for Round 1:", content)
+    content = response.choices[0].message.content
-# Round 2
+    print("reasoning_content for Round 1:", reasoning_content)
-messages.append({"role": "assistant", "content": content})
+    print("content for Round 1:", content)
-messages.append({
-    "role": "user",
-    "content": "How many Rs are there in the word 'strawberry'?",
-})
-response = client.chat.completions.create(model=model, messages=messages)
-reasoning_content = response.choices[0].message.reasoning_content
+    # Round 2
-content = response.choices[0].message.content
+    messages.append({"role": "assistant", "content": content})
+    messages.append({
+        "role":
+        "user",
+        "content":
+        "How many Rs are there in the word 'strawberry'?",
+    })
+    response = client.chat.completions.create(model=model, messages=messages)
-print("reasoning_content for Round 2:", reasoning_content)
+    reasoning_content = response.choices[0].message.reasoning_content
-print("content for Round 2:", content)
+    content = response.choices[0].message.content
+    print("reasoning_content for Round 2:", reasoning_content)
+    print("content for Round 2:", content)
+if __name__ == "__main__":
+    main()
--- a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
+++ b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
@@ -3,7 +3,7 @@
 An example shows how to generate chat completions from reasoning models
 like DeepSeekR1.
-To run this example, you need to start the vLLM server with the reasoning 
+To run this example, you need to start the vLLM server with the reasoning
 parser:
 ```bash
@@ -29,41 +29,49 @@ from openai import OpenAI
 openai_api_key = "EMPTY"
 openai_api_base = "http://localhost:8000/v1"
-client = OpenAI(
+messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
-    api_key=openai_api_key,
-    base_url=openai_api_base,
-)
-models = client.models.list()
-model = models.data[0].id
-messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
+def main():
-# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
+    client = OpenAI(
-stream = client.chat.completions.create(model=model,
+        api_key=openai_api_key,
-                                        messages=messages,
+        base_url=openai_api_base,
-                                        stream=True)
+    )
-print("client: Start streaming chat completions...")
+    models = client.models.list()
-printed_reasoning_content = False
+    model = models.data[0].id
-printed_content = False
+    # ruff: noqa: E501
-for chunk in stream:
+    # For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
-    reasoning_content = None
+    stream = client.chat.completions.create(model=model,
-    content = None
+                                            messages=messages,
-    # Check the content is reasoning_content or content
+                                            stream=True)
-    if hasattr(chunk.choices[0].delta, "reasoning_content"):
-        reasoning_content = chunk.choices[0].delta.reasoning_content
+    print("client: Start streaming chat completions...")
-    elif hasattr(chunk.choices[0].delta, "content"):
+    printed_reasoning_content = False
-        content = chunk.choices[0].delta.content
+    printed_content = False
-    if reasoning_content is not None:
+    for chunk in stream:
-        if not printed_reasoning_content:
+        reasoning_content = None
-            printed_reasoning_content = True
+        content = None
-            print("reasoning_content:", end="", flush=True)
+        # Check the content is reasoning_content or content
-        print(reasoning_content, end="", flush=True)
+        if hasattr(chunk.choices[0].delta, "reasoning_content"):
-    elif content is not None:
+            reasoning_content = chunk.choices[0].delta.reasoning_content
-        if not printed_content:
+        elif hasattr(chunk.choices[0].delta, "content"):
-            printed_content = True
+            content = chunk.choices[0].delta.content
-            print("\ncontent:", end="", flush=True)
-        # Extract and print the content
+        if reasoning_content is not None:
-        print(content, end="", flush=True)
+            if not printed_reasoning_content:
+                printed_reasoning_content = True
+                print("reasoning_content:", end="", flush=True)
+            print(reasoning_content, end="", flush=True)
+        elif content is not None:
+            if not printed_content:
+                printed_content = True
+                print("\ncontent:", end="", flush=True)
+            # Extract and print the content
+            print(content, end="", flush=True)
+if __name__ == "__main__":
+    main()
--- a/examples/online_serving/openai_chat_embedding_client_for_multimodal.py
+++ b/examples/online_serving/openai_chat_embedding_client_for_multimodal.py
@@ -98,7 +98,7 @@ def dse_qwen2_vl(inp: dict):
    print("Embedding output:", response_json["data"][0]["embedding"])
-if __name__ == '__main__':
+def parse_args():
    parser = argparse.ArgumentParser(
        "Script to call a specified VLM through the API. Make sure to serve "
        "the model with --task embed before running this.")
@@ -107,8 +107,10 @@ if __name__ == '__main__':
                        choices=["vlm2vec", "dse_qwen2_vl"],
                        required=True,
                        help="Which model to call.")
-    args = parser.parse_args()
+    return parser.parse_args()
+def main(args):
    if args.model == "vlm2vec":
        vlm2vec()
    elif args.model == "dse_qwen2_vl":
@@ -120,3 +122,8 @@ if __name__ == '__main__':
            "type": "text",
            "content": "What is the weather like today?",
        })
+if __name__ == '__main__':
+    args = parse_args()
+    main(args)
--- a/examples/online_serving/openai_completion_client.py
+++ b/examples/online_serving/openai_completion_client.py
@@ -6,28 +6,36 @@ from openai import OpenAI
 openai_api_key = "EMPTY"
 openai_api_base = "http://localhost:8000/v1"
-client = OpenAI(
-    # defaults to os.environ.get("OPENAI_API_KEY")
+def main():
-    api_key=openai_api_key,
+    client = OpenAI(
-    base_url=openai_api_base,
+        # defaults to os.environ.get("OPENAI_API_KEY")
-)
+        api_key=openai_api_key,
+        base_url=openai_api_base,
-models = client.models.list()
+    )
-model = models.data[0].id
+    models = client.models.list()
-# Completion API
+    model = models.data[0].id
-stream = False
-completion = client.completions.create(
+    # Completion API
-    model=model,
+    stream = False
-    prompt="A robot may not injure a human being",
+    completion = client.completions.create(
-    echo=False,
+        model=model,
-    n=2,
+        prompt="A robot may not injure a human being",
-    stream=stream,
+        echo=False,
-    logprobs=3)
+        n=2,
+        stream=stream,
-print("Completion results:")
+        logprobs=3)
-if stream:
-    for c in completion:
+    print("-" * 50)
-        print(c)
+    print("Completion results:")
-else:
+    if stream:
-    print(completion)
+        for c in completion:
+            print(c)
+    else:
+        print(completion)
+    print("-" * 50)
+if __name__ == "__main__":
+    main()