Convert `examples` to `ruff-format` (#18400)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>

Convert `examples` to `ruff-format` (#18400)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
27bebcd8 · Harry Mellor · GitHub · e7523c2e · 27bebcd8 · 27bebcd8
Unverified commit 27bebcd8, authored May 26, 2025 by Harry Mellor; committed by GitHub on May 26, 2025
20 changed files
--- a/examples/offline_inference/torchrun_example.py
+++ b/examples/offline_inference/torchrun_example.py
@@ -45,8 +45,7 @@ if dist.get_rank() == 0:
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}\n"
+        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}\n")
-              f"Generated text: {generated_text!r}\n")
        print("-" * 50)
    """
 Further tips:

--- a/examples/offline_inference/tpu.py
+++ b/examples/offline_inference/tpu.py
@@ -20,10 +20,12 @@ sampling_params = SamplingParams(temperature=0, top_p=1.0, n=N, max_tokens=16)
 def main():
    # Set `enforce_eager=True` to avoid ahead-of-time compilation.
    # In real workloads, `enforce_eager` should be `False`.
-    llm = LLM(model="Qwen/Qwen2-1.5B-Instruct",
+    llm = LLM(
+        model="Qwen/Qwen2-1.5B-Instruct",
        max_num_batched_tokens=64,
        max_num_seqs=4,
-              max_model_len=128)
+        max_model_len=128,
+    )
    outputs = llm.generate(prompts, sampling_params)
    print("-" * 50)
    for output, answer in zip(outputs, answers):

--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
--- a/examples/offline_inference/vision_language_embedding.py
+++ b/examples/offline_inference/vision_language_embedding.py
@@ -6,6 +6,7 @@ the correct prompt format on vision language models for multimodal embedding.
 For most models, the prompt format should follow corresponding examples
 on HuggingFace model repository.
 """
 from argparse import Namespace
 from dataclasses import asdict
 from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args
@@ -44,19 +45,17 @@ class ModelRequestData(NamedTuple):
 def run_e5_v(query: Query) -> ModelRequestData:
-    llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n'  # noqa: E501
+    llama3_template = "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n"  # noqa: E501
    if query["modality"] == "text":
        text = query["text"]
-        prompt = llama3_template.format(
+        prompt = llama3_template.format(f"{text}\nSummary above sentence in one word: ")
-            f"{text}\nSummary above sentence in one word: ")
        image = None
    elif query["modality"] == "image":
-        prompt = llama3_template.format(
+        prompt = llama3_template.format("<image>\nSummary above image in one word: ")
-            "<image>\nSummary above image in one word: ")
        image = query["image"]
    else:
-        modality = query['modality']
+        modality = query["modality"]
        raise ValueError(f"Unsupported query modality: '{modality}'")
    engine_args = EngineArgs(
@@ -83,10 +82,12 @@ def run_vlm2vec(query: Query) -> ModelRequestData:
        image = query["image"]
    elif query["modality"] == "text+image":
        text = query["text"]
-        prompt = f"<|image_1|> Represent the given image with the following question: {text}"  # noqa: E501
+        prompt = (
+            f"<|image_1|> Represent the given image with the following question: {text}"  # noqa: E501
+        )
        image = query["image"]
    else:
-        modality = query['modality']
+        modality = query["modality"]
        raise ValueError(f"Unsupported query modality: '{modality}'")
    engine_args = EngineArgs(
@@ -136,7 +137,8 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
    # Disable other modalities to save memory
    default_limits = {"image": 0, "video": 0, "audio": 0}
    req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
-        req_data.engine_args.limit_mm_per_prompt or {})
+        req_data.engine_args.limit_mm_per_prompt or {}
+    )
    engine_args = asdict(req_data.engine_args) | {"seed": seed}
    llm = LLM(**engine_args)
@@ -145,10 +147,12 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
    if req_data.image is not None:
        mm_data["image"] = req_data.image
-    outputs = llm.embed({
+    outputs = llm.embed(
+        {
            "prompt": req_data.prompt,
            "multi_modal_data": mm_data,
-    })
+        }
+    )
    print("-" * 50)
    for output in outputs:
@@ -164,23 +168,30 @@ model_example_map = {
 def parse_args():
    parser = FlexibleArgumentParser(
-        description='Demo on using vLLM for offline inference with '
+        description="Demo on using vLLM for offline inference with "
-        'vision language models for multimodal embedding')
+        "vision language models for multimodal embedding"
-    parser.add_argument('--model-name',
+    )
-                        '-m',
+    parser.add_argument(
+        "--model-name",
+        "-m",
        type=str,
        default="vlm2vec",
        choices=model_example_map.keys(),
-                        help='The name of the embedding model.')
+        help="The name of the embedding model.",
-    parser.add_argument('--modality',
+    )
+    parser.add_argument(
+        "--modality",
        type=str,
        default="image",
        choices=get_args(QueryModality),
-                        help='Modality of the input.')
+        help="Modality of the input.",
-    parser.add_argument("--seed",
+    )
+    parser.add_argument(
+        "--seed",
        type=int,
        default=None,
-                        help="Set the seed when initializing `vllm.LLM`.")
+        help="Set the seed when initializing `vllm.LLM`.",
+    )
    return parser.parse_args()

--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
--- a/examples/online_serving/api_client.py
+++ b/examples/online_serving/api_client.py
@@ -17,16 +17,15 @@ import requests
 def clear_line(n: int = 1) -> None:
-    LINE_UP = '\033[1A'
+    LINE_UP = "\033[1A"
-    LINE_CLEAR = '\x1b[2K'
+    LINE_CLEAR = "\x1b[2K"
    for _ in range(n):
        print(LINE_UP, end=LINE_CLEAR, flush=True)
-def post_http_request(prompt: str,
+def post_http_request(
-                      api_url: str,
+    prompt: str, api_url: str, n: int = 1, stream: bool = False
-                      n: int = 1,
+) -> requests.Response:
-                      stream: bool = False) -> requests.Response:
    headers = {"User-Agent": "Test Client"}
    pload = {
        "prompt": prompt,
@@ -35,17 +34,14 @@ def post_http_request(prompt: str,
        "max_tokens": 16,
        "stream": stream,
    }
-    response = requests.post(api_url,
+    response = requests.post(api_url, headers=headers, json=pload, stream=stream)
-                             headers=headers,
-                             json=pload,
-                             stream=stream)
    return response
 def get_streaming_response(response: requests.Response) -> Iterable[list[str]]:
-    for chunk in response.iter_lines(chunk_size=8192,
+    for chunk in response.iter_lines(
-                                     decode_unicode=False,
+        chunk_size=8192, decode_unicode=False, delimiter=b"\n"
-                                     delimiter=b"\n"):
+    ):
        if chunk:
            data = json.loads(chunk.decode("utf-8"))
            output = data["text"]

--- a/examples/online_serving/cohere_rerank_client.py
+++ b/examples/online_serving/cohere_rerank_client.py
@@ -6,6 +6,7 @@ Note that `pip install cohere` is needed to run this example.
 run: vllm serve BAAI/bge-reranker-base
 """
 from typing import Union
 import cohere
@@ -16,28 +17,28 @@ model = "BAAI/bge-reranker-base"
 query = "What is the capital of France?"
 documents = [
-    "The capital of France is Paris", "Reranking is fun!",
+    "The capital of France is Paris",
-    "vLLM is an open-source framework for fast AI serving"
+    "Reranking is fun!",
+    "vLLM is an open-source framework for fast AI serving",
 ]
-def cohere_rerank(client: Union[Client, ClientV2], model: str, query: str,
+def cohere_rerank(
-                  documents: list[str]) -> dict:
+    client: Union[Client, ClientV2], model: str, query: str, documents: list[str]
+) -> dict:
    return client.rerank(model=model, query=query, documents=documents)
 def main():
    # cohere v1 client
-    cohere_v1 = cohere.Client(base_url="http://localhost:8000",
+    cohere_v1 = cohere.Client(base_url="http://localhost:8000", api_key="sk-fake-key")
-                              api_key="sk-fake-key")
    rerank_v1_result = cohere_rerank(cohere_v1, model, query, documents)
    print("-" * 50)
    print("rerank_v1_result:\n", rerank_v1_result)
    print("-" * 50)
    # or the v2
-    cohere_v2 = cohere.ClientV2("sk-fake-key",
+    cohere_v2 = cohere.ClientV2("sk-fake-key", base_url="http://localhost:8000")
-                                base_url="http://localhost:8000")
    rerank_v2_result = cohere_rerank(cohere_v2, model, query, documents)
    print("rerank_v2_result:\n", rerank_v2_result)
    print("-" * 50)

--- a/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py
+++ b/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py
--- a/examples/online_serving/gradio_openai_chatbot_webserver.py
+++ b/examples/online_serving/gradio_openai_chatbot_webserver.py
@@ -17,6 +17,7 @@ you can install it manually by following these steps:
 2. Rename the downloaded file to: frpc_linux_amd64_v0.3
 3. Move the file to this location: /home/user/.cache/huggingface/gradio/frpc
 """
 import argparse
 import gradio as gr
@@ -24,16 +25,12 @@ from openai import OpenAI
 def format_history_to_openai(history):
-    history_openai_format = [{
+    history_openai_format = [
-        "role": "system",
+        {"role": "system", "content": "You are a great AI assistant."}
-        "content": "You are a great AI assistant."
+    ]
-    }]
    for human, assistant in history:
        history_openai_format.append({"role": "user", "content": human})
-        history_openai_format.append({
+        history_openai_format.append({"role": "assistant", "content": assistant})
-            "role": "assistant",
-            "content": assistant
-        })
    return history_openai_format
@@ -49,17 +46,17 @@ def predict(message, history, client, model_name, temp, stop_token_ids):
        temperature=temp,
        stream=True,
        extra_body={
-            'repetition_penalty':
+            "repetition_penalty": 1,
-            1,
+            "stop_token_ids": [int(id.strip()) for id in stop_token_ids.split(",")]
-            'stop_token_ids':
+            if stop_token_ids
-            [int(id.strip())
+            else [],
-             for id in stop_token_ids.split(',')] if stop_token_ids else []
+        },
-        })
+    )
    # Collect all chunks and concatenate them into a full message
    full_message = ""
    for chunk in stream:
-        full_message += (chunk.choices[0].delta.content or "")
+        full_message += chunk.choices[0].delta.content or ""
    # Return the full message as a single response
    return full_message
@@ -67,38 +64,34 @@ def predict(message, history, client, model_name, temp, stop_token_ids):
 def parse_args():
    parser = argparse.ArgumentParser(
-        description='Chatbot Interface with Customizable Parameters')
+        description="Chatbot Interface with Customizable Parameters"
-    parser.add_argument('--model-url',
+    )
-                        type=str,
+    parser.add_argument(
-                        default='http://localhost:8000/v1',
+        "--model-url", type=str, default="http://localhost:8000/v1", help="Model URL"
-                        help='Model URL')
+    )
-    parser.add_argument('-m',
+    parser.add_argument(
-                        '--model',
+        "-m", "--model", type=str, required=True, help="Model name for the chatbot"
-                        type=str,
+    )
-                        required=True,
+    parser.add_argument(
-                        help='Model name for the chatbot')
+        "--temp", type=float, default=0.8, help="Temperature for text generation"
-    parser.add_argument('--temp',
+    )
-                        type=float,
+    parser.add_argument(
-                        default=0.8,
+        "--stop-token-ids", type=str, default="", help="Comma-separated stop token IDs"
-                        help='Temperature for text generation')
+    )
-    parser.add_argument('--stop-token-ids',
-                        type=str,
-                        default='',
-                        help='Comma-separated stop token IDs')
    parser.add_argument("--host", type=str, default=None)
    parser.add_argument("--port", type=int, default=8001)
    return parser.parse_args()
 def build_gradio_interface(client, model_name, temp, stop_token_ids):
    def chat_predict(message, history):
-        return predict(message, history, client, model_name, temp,
+        return predict(message, history, client, model_name, temp, stop_token_ids)
-                       stop_token_ids)
-    return gr.ChatInterface(fn=chat_predict,
+    return gr.ChatInterface(
+        fn=chat_predict,
        title="Chatbot Interface",
-                            description="A simple chatbot powered by vLLM")
+        description="A simple chatbot powered by vLLM",
+    )
 def main():
@@ -113,12 +106,13 @@ def main():
    client = OpenAI(api_key=openai_api_key, base_url=openai_api_base)
    # Define the Gradio chatbot interface using the predict function
-    gradio_interface = build_gradio_interface(client, args.model, args.temp,
+    gradio_interface = build_gradio_interface(
-                                              args.stop_token_ids)
+        client, args.model, args.temp, args.stop_token_ids
+    )
-    gradio_interface.queue().launch(server_name=args.host,
+    gradio_interface.queue().launch(
-                                    server_port=args.port,
+        server_name=args.host, server_port=args.port, share=True
-                                    share=True)
+    )
 if __name__ == "__main__":

--- a/examples/online_serving/gradio_webserver.py
+++ b/examples/online_serving/gradio_webserver.py
@@ -17,6 +17,7 @@ you can install it manually by following these steps:
 2. Rename the downloaded file to: frpc_linux_amd64_v0.3
 3. Move the file to this location: /home/user/.cache/huggingface/gradio/frpc
 """
 import argparse
 import json
@@ -31,14 +32,11 @@ def http_bot(prompt):
        "stream": True,
        "max_tokens": 128,
    }
-    response = requests.post(args.model_url,
+    response = requests.post(args.model_url, headers=headers, json=pload, stream=True)
-                             headers=headers,
-                             json=pload,
+    for chunk in response.iter_lines(
-                             stream=True)
+        chunk_size=8192, decode_unicode=False, delimiter=b"\n"
+    ):
-    for chunk in response.iter_lines(chunk_size=8192,
-                                     decode_unicode=False,
-                                     delimiter=b"\n"):
        if chunk:
            data = json.loads(chunk.decode("utf-8"))
            output = data["text"][0]
@@ -48,10 +46,10 @@ def http_bot(prompt):
 def build_demo():
    with gr.Blocks() as demo:
        gr.Markdown("# vLLM text completion demo\n")
-        inputbox = gr.Textbox(label="Input",
+        inputbox = gr.Textbox(label="Input", placeholder="Enter text and press ENTER")
-                              placeholder="Enter text and press ENTER")
+        outputbox = gr.Textbox(
-        outputbox = gr.Textbox(label="Output",
+            label="Output", placeholder="Generated result from the model"
-                               placeholder="Generated result from the model")
+        )
        inputbox.submit(http_bot, [inputbox], [outputbox])
    return demo
@@ -60,17 +58,15 @@ def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default=None)
    parser.add_argument("--port", type=int, default=8001)
-    parser.add_argument("--model-url",
+    parser.add_argument(
-                        type=str,
+        "--model-url", type=str, default="http://localhost:8000/generate"
-                        default="http://localhost:8000/generate")
+    )
    return parser.parse_args()
 def main(args):
    demo = build_demo()
-    demo.queue().launch(server_name=args.host,
+    demo.queue().launch(server_name=args.host, server_port=args.port, share=True)
-                        server_port=args.port,
-                        share=True)
 if __name__ == "__main__":

--- a/examples/online_serving/jinaai_rerank_client.py
+++ b/examples/online_serving/jinaai_rerank_client.py
@@ -5,6 +5,7 @@ Jina and Cohere https://jina.ai/reranker
 run: vllm serve BAAI/bge-reranker-base
 """
 import json
 import requests
@@ -14,14 +15,13 @@ url = "http://127.0.0.1:8000/rerank"
 headers = {"accept": "application/json", "Content-Type": "application/json"}
 data = {
-    "model":
+    "model": "BAAI/bge-reranker-base",
-    "BAAI/bge-reranker-base",
+    "query": "What is the capital of France?",
-    "query":
-    "What is the capital of France?",
    "documents": [
        "The capital of Brazil is Brasilia.",
-        "The capital of France is Paris.", "Horses and cows are both animals"
+        "The capital of France is Paris.",
-    ]
+        "Horses and cows are both animals",
+    ],
 }

--- a/examples/online_serving/kv_events_subscriber.py
+++ b/examples/online_serving/kv_events_subscriber.py
@@ -9,17 +9,14 @@ from msgspec.msgpack import Decoder
 #
 # Types copied from vllm.distributed.kv_events
 #
-class EventBatch(msgspec.Struct, array_like=True, omit_defaults=True,
+class EventBatch(msgspec.Struct, array_like=True, omit_defaults=True, gc=False):
-                 gc=False):
    ts: float
    events: list[Any]
-class KVCacheEvent(msgspec.Struct,
+class KVCacheEvent(
-                   array_like=True,
+    msgspec.Struct, array_like=True, omit_defaults=True, gc=False, tag=True
-                   omit_defaults=True,
+):
-                   gc=False,
-                   tag=True):
    """Base class for all KV cache-related events"""
@@ -77,8 +74,9 @@ def main():
                if last_seq >= 0 and seq > last_seq + 1:
                    missed = seq - last_seq - 1
-                    print(f"Missed {missed} messages"
+                    print(
-                          f" (last: {last_seq}, current: {seq})")
+                        f"Missed {missed} messages (last: {last_seq}, current: {seq})"
+                    )
                    replay.send((last_seq + 1).to_bytes(8, "big"))

--- a/examples/online_serving/openai_chat_completion_client.py
+++ b/examples/online_serving/openai_chat_completion_client.py
@@ -12,26 +12,22 @@ from openai import OpenAI
 openai_api_key = "EMPTY"
 openai_api_base = "http://localhost:8000/v1"
-messages = [{
+messages = [
-    "role": "system",
+    {"role": "system", "content": "You are a helpful assistant."},
-    "content": "You are a helpful assistant."
+    {"role": "user", "content": "Who won the world series in 2020?"},
-}, {
+    {
-    "role": "user",
-    "content": "Who won the world series in 2020?"
-}, {
        "role": "assistant",
-    "content": "The Los Angeles Dodgers won the World Series in 2020."
+        "content": "The Los Angeles Dodgers won the World Series in 2020.",
-}, {
+    },
-    "role": "user",
+    {"role": "user", "content": "Where was it played?"},
-    "content": "Where was it played?"
+]
-}]
 def parse_args():
    parser = argparse.ArgumentParser(description="Client for vLLM API server")
-    parser.add_argument("--stream",
+    parser.add_argument(
-                        action="store_true",
+        "--stream", action="store_true", help="Enable streaming response"
-                        help="Enable streaming response")
+    )
    return parser.parse_args()

--- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py
+++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py
--- a/examples/online_serving/openai_chat_completion_client_with_tools.py
+++ b/examples/online_serving/openai_chat_completion_client_with_tools.py
--- a/examples/online_serving/openai_chat_completion_client_with_tools_required.py
+++ b/examples/online_serving/openai_chat_completion_client_with_tools_required.py
--- a/examples/online_serving/openai_chat_completion_structured_outputs.py
+++ b/examples/online_serving/openai_chat_completion_structured_outputs.py
--- a/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py
+++ b/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py
--- a/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
+++ b/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
--- a/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
+++ b/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py